diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eeda759ff18ccb86ce6a585fe41cb972ea3ae295..e718b32cb6c48d11e73600509a17db107f438708 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: - id: clang-format-with-version-check name: clang-format description: Format files with ClangFormat. - entry: bash ./.clang_format.hook -i + entry: bash ./tools/codestyle/clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: local @@ -52,7 +52,7 @@ repos: hooks: - id: copyright_checker name: copyright_checker - entry: python ./.copyright.hook + entry: python ./tools/codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ diff --git a/.travis.yml b/.travis.yml index 8c772030925dcad3909f142b08e4d8057a3f89b7..a406841f6abf01f15826f34fe4c63b4c24486ccd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,7 @@ script: if [[ "$JOB" != "doc" ]]; then exit 0; fi; # For document only if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; - if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; + if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export DOCS_DIR=`pwd` cd .. diff --git a/CMakeLists.txt b/CMakeLists.txt index 4117f077219d3b8fc097631073eafa748ff918bc..23bb27e77b9eab0c322a71a8ff570d12d1050377 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,11 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." 
OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(WITH_SYSTEM_BLAS "Use system blas library" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -131,6 +134,10 @@ if (NOT DEFINED WITH_MKLDNN) set(WITH_MKLDNN OFF) endif() endif() + +if (REPLACE_ENFORCE_GLOG) + add_definitions("-DREPLACE_ENFORCE_GLOG") +endif() ######################################################################################## include(external/mklml) # download mklml package @@ -153,12 +160,24 @@ include(external/cares) if(WITH_DISTRIBUTE) if(WITH_GRPC) include(external/grpc) + message(STATUS "Use grpc framework.") else() + message(STATUS "Use brpc framework.") include(external/leveldb) include(external/brpc) endif() endif() +if(WITH_BRPC_RDMA) + message(STATUS "Use brpc with rdma.") + if(WITH_GRPC) + message(FATAL_ERROR "Can't use grpc with brpc rdma.") + endif() + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) @@ -178,7 +197,7 @@ include(inference_lib) # add paddle fluid inference libraries include_directories("${PADDLE_SOURCE_DIR}") -include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") +include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") @@ -222,7 +241,7 @@ add_subdirectory(proto) if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY) # "add_subdirectory(go)" should be placed after the following loine, # because it depends on paddle/optimizer. 
- add_subdirectory(paddle/optimizer) + add_subdirectory(paddle/legacy/optimizer) endif() # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1b02bcc2f4fd14297715bcf5bfd1617e3d5f0c9..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) -- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math) diff --git a/Dockerfile b/Dockerfile index 752fea5951bdc8c2cf79a17c960217c88ae62571..48c750358cfcb227667c429f19befcaa2f51ebbd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,7 @@ ENV HOME /root COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ - apt-get install -y --allow-downgrades \ + apt-get install -y --allow-downgrades patchelf \ git python-pip python-dev python-opencv openssh-server bison \ libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ @@ -76,7 +76,8 @@ RUN easy_install -U pip && \ pip install sphinx-rtd-theme==0.1.9 recommonmark RUN pip install pre-commit 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python #For docstring checker RUN pip install pylint pytest astroid isort diff --git a/README.md b/README.md index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..eb99ed21d02650ef16cc7da91836909c02895be9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) -[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,6 +18,8 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. 
+### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid) + ## Features - **Flexibility** diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile index b9eaca5ee6b487bb37bb954b3c606c3096d37aeb..707fadb1fae97cefe8a41715cd57d71754abda41 100644 --- a/benchmark/fluid/Dockerfile +++ b/benchmark/fluid/Dockerfile @@ -1,11 +1,18 @@ FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Use UBUNTU_MIRROR can speed up apt-get speed. +# ARG UBUNTU_MIRROR +# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so -RUN pip install -U pip -RUN pip install -U kubernetes paddlepaddle # IMPORTANT: # Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. +# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ... + +RUN pip install -U pip +RUN pip install -U kubernetes paddlepaddle RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python' RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python' @@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root +RUN chmod +x /usr/bin/paddle_k8s ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s +RUN pip install /*.whl && rm -f /*.whl ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py recordio_converter.py models/ /workspace/ +ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/ +ADD models/ /workspace/models/ diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 68a3d42d7a8a8082730f4cae3b5d4ea33819ca2f..a79f25ccc6ace1594f3f331633130eaace5e175b 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -122,5 +122,13 @@ def parse_args(): type=str, default="", help='Directory that contains all the training recordio files.') + parser.add_argument( + '--use_inference_transpiler', + action='store_true', + help='If set, use inference transpiler to optimize the program.') + parser.add_argument( + '--no_random', + action='store_true', + help='If set, keep the random seed and do not shuffle the data.') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index aa70783ecd68be543b2d5aabee96a5b09bd72e6a..94ea7bd6aca7c9595037a2dacc5e36d4c77827e7 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args): return train_program, fluid.default_startup_program() else: raise ValueError( - 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' + 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER' ) @@ -131,6 +131,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, exe = fluid.Executor(place) exe.run(startup_prog) + # 
Use inference_transpiler to speedup if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() @@ -181,6 +182,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))), # evaluation if not args.no_test and batch_acc and not args.use_reader_op: + if args.use_inference_transpiler: + t = fluid.InferenceTranspiler() + t.transpile(infer_prog, place) + pass_test_acc = test(exe, infer_prog, test_reader, feeder, batch_acc) print(", Test Accuracy: %f" % pass_test_acc) @@ -264,8 +269,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) - if args.update_method == "pserver": - exe.bcast_params() if args.use_reader_op: num_samples += args.batch_size * args.gpus else: @@ -301,9 +304,20 @@ def print_train_time(start_time, end_time, num_samples): (num_samples, train_elapsed, examples_per_sec)) +def print_paddle_envs(): + print('----------- Configuration envs -----------') + for k in os.environ: + if "PADDLE_" in k: + print "ENV %s:%s" % (k, os.environ[k]) + print('------------------------------------------------') + + def main(): args = parse_args() print_arguments(args) + print_paddle_envs() + if args.no_random: + fluid.default_startup_program().random_seed = 1 # the unique trainer id, starting from 0, needed by trainer # only diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py index 9da8a69af1d7b671b2648b1b3702776c1c0650b0..dfe8b5cdd58456902fa8ec355e9837dface3f7be 100644 --- a/benchmark/fluid/kube_gen_job.py +++ b/benchmark/fluid/kube_gen_job.py @@ -17,6 +17,7 @@ import copy import argparse import random import os +import copy from kube_templates import pserver, trainer, envs @@ -108,10 +109,9 @@ def gen_job(): tn_container["ports"][0]["containerPort"] = spreadport envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) - envs.append({"name": "TRAINERS", "value": str(args.trainers)}) - envs.append({"name": "PSERVERS", "value": str(args.pservers)}) + envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)}) + envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)}) envs.append({"name": "ENTRY", "value": args.entry}) - envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)}) envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) # NOTE: these directories below are cluster specific, please modify # this settings before you run on your own cluster. 
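Note on the fluid_benchmark.py hunks above (an aside, not part of the patch): the new --use_inference_transpiler and --no_random flags and the print_paddle_envs helper are small, self-contained pieces of Python. A minimal sketch of how they fit together, assuming only the paddle.fluid API already used in the diff (fluid.InferenceTranspiler, fluid.default_startup_program) and an argparse namespace carrying the two new flags; prepare_run is a hypothetical wrapper, in the real script the two branches live in main() and train():

    import os
    import paddle.fluid as fluid

    def print_paddle_envs():
        # Same idea as the helper added above: dump every PADDLE_* variable
        # so the configuration of a (distributed) run shows up in the log.
        print('----------- Configuration envs -----------')
        for k in os.environ:
            if "PADDLE_" in k:
                print("ENV %s:%s" % (k, os.environ[k]))
        print('------------------------------------------------')

    def prepare_run(args, infer_prog, place):
        # --no_random pins the startup seed so repeated runs are comparable.
        if args.no_random:
            fluid.default_startup_program().random_seed = 1
        # --use_inference_transpiler rewrites the inference program in place
        # (e.g. operator fusion) before it is used for evaluation.
        if args.use_inference_transpiler:
            t = fluid.InferenceTranspiler()
            t.transpile(infer_prog, place)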
@@ -166,17 +166,23 @@ def gen_job(): tn["spec"]["template"]["spec"]["volumes"] = volumes tn_container["volumeMounts"] = volumeMounts - ps_container["env"] = envs - ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"}) + ps_container["env"] = copy.deepcopy(envs) + ps_container["env"].append({ + "name": "PADDLE_TRAINING_ROLE", + "value": "PSERVER" + }) tn_container["env"] = envs if args.disttype == "pserver": tn_container["env"].append({ - "name": "TRAINING_ROLE", + "name": "PADDLE_TRAINING_ROLE", "value": "TRAINER" }) elif args.disttype == "nccl2" or args.disttype == "local": # NCCL2 have no training role, set to plain WORKER - tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"}) + tn_container["env"].append({ + "name": "PADDLE_TRAINING_ROLE", + "value": "WORKER" + }) os.mkdir(args.jobname) if args.disttype == "pserver": diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py index 69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879..17f6b03826ae818a3671ea7f9355a8e8c04b50be 100644 --- a/benchmark/fluid/models/machine_translation.py +++ b/benchmark/fluid/models/machine_translation.py @@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, return avg_cost, feeding_list -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - lod_t = core.LoDTensor() - lod_t.set(flattened_data, place) - lod_t.set_lod([lod]) - return lod_t, lod[-1] - - def lodtensor_to_ndarray(lod_tensor): dims = lod_tensor.get_dims() ndarray = np.zeros(shape=dims).astype('float32') diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index 9ed1093c54a501cc93dbbf9c3651fe70914ce26b..d44a9c07d31cfae9d54ad5949b85c77e60eae258 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -197,12 +197,12 @@ def get_model(args): optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) batched_train_reader = paddle.batch( - paddle.reader.shuffle( + train_reader if args.no_random else paddle.reader.shuffle( train_reader, buf_size=5120), batch_size=args.batch_size * args.gpus, drop_last=True) batched_test_reader = paddle.batch( - train_reader, batch_size=args.batch_size, drop_last=True) + test_reader, batch_size=args.batch_size, drop_last=True) return avg_cost, inference_program, optimizer, batched_train_reader,\ batched_test_reader, batch_acc diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index 211869af4e8d7180cb485811d3363c50d32f0f74..3231542a17ace99a17c9f9b9bdb3c2527637d9ef 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -125,18 +125,3 @@ def get_model(args): batch_size=args.batch_size) return loss, inference_program, adam, train_reader, test_reader, batch_acc - - -def to_lodtensor(data, place): - seq_lens = [len(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - flattened_data = numpy.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res diff --git a/cmake/cblas.cmake 
b/cmake/cblas.cmake index e3b9d94215a858c5c9a34e1b7e97540f1876801d..6ed51c648478efb9784d0c43b169c285e740e0f3 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -83,18 +83,20 @@ else() set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib) endif() -find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS +if(WITH_SYSTEM_BLAS) + find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) -find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS + find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) -if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER REFERENCE) - set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) - set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) - add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) - message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER REFERENCE) + set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) + set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) + add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) + message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + endif() endif() if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 6a8b15a6b60a2e5635dc78fc877f0c8da9a2a998..e4af34d10ed92c501dd805addb62747c91c00978 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -174,3 +174,7 @@ endif(WITH_GOLANG) if(WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC) endif(WITH_GRPC) + +if(WITH_BRPC_RDMA) + add_definitions(-DPADDLE_WITH_BRPC_RDMA) +endif(WITH_BRPC_RDMA) diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index f1cd9c99ebfe5dc5ee0d46d61f1e08256c27d9cd..fb3d8ef8d53436f387acc3069a0eb887e6f07c59 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -7,7 +7,17 @@ set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files") set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library") -set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp) +set(ANAKIN_COMPILE_EXTRA_FLAGS + -Wno-error=unused-variable -Wno-unused-variable + -Wno-error=format-extra-args -Wno-format-extra-args + -Wno-error=comment -Wno-comment + -Wno-error=format -Wno-format + -Wno-error=switch -Wno-switch + -Wno-error=return-type -Wno-return-type + -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor + -Wno-sign-compare + -Wno-reorder + -Wno-error=cpp) set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz") @@ -26,13 +36,15 @@ function(fetch_include_recursively root_dir) endforeach() endfunction() -# download library -message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") -execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") -execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") -execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") -execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") -execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf 
anakin_release_simple.tar.gz") +if (NOT EXISTS "${ANAKIN_INSTALL_DIR}") + # download library + message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}") + execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") + execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}") + execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz") +endif() if (WITH_ANAKIN) message(STATUS "Anakin for inference is enabled") diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 8e2c913b2caae0c4eeb844d2b51a8975e81c1592..30b227b6452abf44171a1a4e04569e66b16e67a4 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,6 +14,15 @@ INCLUDE(ExternalProject) +find_library(SSL_LIBRARY NAMES ssl) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) + +find_library(CRYPTO_LIBRARY NAMES crypto) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) + + SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) @@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/brpc/brpc" - GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + GIT_REPOSITORY "https://github.com/gongweibao/brpc" + GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -42,6 +51,8 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} -DBRPC_WITH_GLOG=ON + -DIOBUF_WITH_HUGE_BLOCK=ON + -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} LIST_SEPARATOR | CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} @@ -49,7 +60,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) -ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index ffdf91a354bd92bdaf3f88344f0a9256638b568c..85f40585da29bab9a107f5546e64870975f4c2d3 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -40,12 +40,12 @@ 
ExternalProject_Add( # NOTE(wuyi): # this package is generated by following steps: # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. submodule update --init + # 2. git submodule update --init # 3. keep only zlib, cares, protobuf, boringssl under "third_party", # checkout and clean other dirs under third_party # 4. remove .git, and package the directory. - URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz" - URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0" + URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" + URL_MD5 "1f268a2aff6759839dccd256adcc91cf" PREFIX ${GRPC_SOURCES_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 25c07850dda7b2f69c2207c37b9d2368632104ec..20dda35c5ccd98f5672d867c26ab97a215483543 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -45,7 +45,8 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML") ELSE() MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") ENDIF() -SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result") +SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result") +SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") ExternalProject_Add( @@ -53,7 +54,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f" + GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4a49a92f2b131bbb38fcf93070ea811e0b1a14e8..ce6a88b51dc98ac46dd3935f12658d60d364ba8c 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -114,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") ADD_LIBRARY(cblas STATIC ${dummyfile}) -TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + +IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + TARGET_LINK_LIBRARIES(cblas dynload_mklml) +ELSE() + TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) +ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 0e2df86c19086357ab520edfcd8421e35768c928..eafb11b6f21e226fc68556a78d675dea94080140 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif(NOT APPLE AND NOT ANDROID) +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find all fluid modules is used for paddle fluid static library +# for building inference libs +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + function(merge_static_libs TARGET_NAME) 
set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) @@ -195,6 +209,15 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Only deps libmklml.so, not link + if("${cc_library_DEPS};" MATCHES "mklml;") + list(REMOVE_ITEM cc_library_DEPS mklml) + if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml") + list(APPEND cc_library_DEPS dynload_mklml) + endif() + add_dependencies(${TARGET_NAME} mklml) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() @@ -234,13 +257,14 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(cc_test) @@ -300,11 +324,12 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(nv_test) @@ -552,7 +577,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cd44fe2542bfa8c53721d61b70778226e640d375..c6979713231f631f8757e4139d6f685d4554b54e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set_property(GLOBAL PROPERTY FLUID_MODULES "") -# find all fluid modules is used for paddle fluid static library -function(find_fluid_modules TARGET_NAME) - get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) - string(FIND "${__target_path}" "fluid" pos) - if(pos GREATER 1) - get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) - set(fluid_modules ${fluid_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") - endif() -endfunction(find_fluid_modules) - # make package for paddle fluid shared and static library function(copy TARGET) set(options "") @@ -149,21 +136,33 @@ copy(memory_lib DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ) -set(module "inference") -copy(inference_lib DEPS paddle_fluid_shared paddle_fluid - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - DSTS ${dst_dir}/${module} ${dst_dir}/${module} -) +set(inference_deps paddle_fluid_shared paddle_fluid) if(WITH_CONTRIB) - set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") - copy(contrib_inference_lib DEPS paddle_inference_api + message(STATUS "installing contrib") + set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") + if (WITH_ANAKIN AND WITH_GPU) + copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api + SRCS + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api + ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release + DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin) + list(APPEND inference_deps contrib_anakin_inference_lib) + endif() + + copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* - DSTS ${contrib_dst_dir} ${contrib_dst_dir} - ) + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api* + DSTS ${contrib_dst_dir} ${contrib_dst_dir}) + list(APPEND inference_deps contrib_inference_lib) endif() +set(module "inference") +copy(inference_lib DEPS ${inference_deps} + SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + DSTS ${dst_dir}/${module} ${dst_dir}/${module} +) + set(module "platform") copy(platform_lib DEPS profiler_py_proto SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h diff --git a/cmake/version.cmake b/cmake/version.cmake index cde650128a068faf32f4abfff5cdfdeb656d8577..79b8e8ac496250d85427b77fbd6a9924a962a15b 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -1,16 +1,21 @@ # Get the latest git tag. 
set(PADDLE_VERSION $ENV{PADDLE_VERSION}) set(tmp_version "HEAD") +set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") +set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+") while ("${PADDLE_VERSION}" STREQUAL "") execute_process( - COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version} + COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} OUTPUT_VARIABLE GIT_TAG_NAME RESULT_VARIABLE GIT_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if (NOT ${GIT_RESULT}) # Check the tag is a correct version - if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") + if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") + # if no tag was found, set PADDLE_VERSION to latest + set(PADDLE_VERSION "latest") + elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME}) else() # otherwise, get the previous git tag name. set(tmp_version "${GIT_TAG_NAME}~1") diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst new file mode 100644 index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a --- /dev/null +++ b/doc/about/about_us.rst @@ -0,0 +1,53 @@ +========= +关于我们 +========= + +什么是PaddlePaddle +-------------------- + +- PaddlePaddle是百度自主研发并开源的深度学习框架,它能够让开发者和企业安全、快速地实现自己的AI想法 + +- 项目团队汇聚了全球顶级的深度学习科学家,致力于为开发者和企业提供最好的深度学习研发体验 + +- 框架具有易学、易用、安全、高效四大特性,是最适合中国开发者和企业的深度学习工具 + +PaddlePaddle的技术特色 +------------------------- + +- 新一代深度学习框架: PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架,在保证性能的同时,极大的提升了框架对模型的表达能力,能够描述任意潜在可能出现的模型 + +- 对大规模计算更加友好:经过百度内多种大规模计算业务的打磨,PaddlePaddle在分布式计算上表现优异,基于EDL技术能够节约大量计算资源,同时也能支持大规模稀疏模型的训练 + +- 提供可视化的深度学习:通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构,帮助开发者更便捷的完成编程过程 + +提供基于PaddlePaddle的教育体系 +-------------------------------- + +- 深度学习课程:百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材,帮助开发者从零掌握深度学习 + +- 深度学习实训:对于目的是科研和学习的用户,PaddlePaddle提供了无需安装、线上运行的开发环境,并提供算法、算力、数据支持 + +- 线下培训:提供丰富、高质量的线下教育活动,如青年教师培训、线下实战营、沙龙等多种形式的培训和交流 + + +提供基于PaddlePaddle的AI服务 +------------------------------ + +- EadyDL:可以帮助零算法基础的企业快速完成一个深度学习任务,只需少量的数据即可得到优质的模型 + +- AI市场:提供标准化的AI 能力、产品的交易机制,帮助企业快速找到所需,有效开展AI业务 + +- 深度学习竞赛: PaddlePaddle汇聚顶尖深度学习开发者,企业可以发布自己的商业问题,通过竞赛方式快速找到最优的解决方案 + +你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们 +----------------------------------------------------------- + +- 学习/使用问题:可以在 `PaddlePaddle开源社区 `_,以及 `PaddlePaddle中文社区 `_ 向我们反馈 + +- 对PaddlePaddle框架发展的建议:可发送邮件至Paddle-better@baidu.com + +我们期待与你一起打造世界顶级深度学习框架,共同推动AI技术的进步 + + + +PaddlePaddle团队 diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst new file mode 100644 index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12 --- /dev/null +++ b/doc/fluid/api/average.rst @@ -0,0 +1,16 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============= +fluid.average +============= + +.. _api_fluid_average_WeightedAverage: + +WeightedAverage +--------------- + +.. autoclass:: paddle.fluid.average.WeightedAverage + :members: + :noindex: + diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst new file mode 100644 index 0000000000000000000000000000000000000000..115e0d24b39928cfc349f72e0a21d6374cd8cd75 --- /dev/null +++ b/doc/fluid/api/backward.rst @@ -0,0 +1,23 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +============== +fluid.backward +============== + +.. 
_api_fluid_backward_append_backward: + +append_backward +--------------- + +.. autofunction:: paddle.fluid.backward.append_backward + :noindex: + +.. _api_fluid_backward_calc_gradient: + +calc_gradient +------------- + +.. autofunction:: paddle.fluid.backward.calc_gradient + :noindex: + diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst index 3ba096388fc87dda3096a9030fe5749e61112c06..aeefbb95a46e5d5ed46375e388a720fad2711779 100644 --- a/doc/fluid/api/clip.rst +++ b/doc/fluid/api/clip.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -clip -==== +========== +fluid.clip +========== + +.. _api_fluid_clip_ErrorClipByValue: ErrorClipByValue ---------------- @@ -12,6 +14,8 @@ ErrorClipByValue :members: :noindex: +.. _api_fluid_clip_GradientClipByValue: + GradientClipByValue ------------------- @@ -19,6 +23,8 @@ GradientClipByValue :members: :noindex: +.. _api_fluid_clip_GradientClipByNorm: + GradientClipByNorm ------------------ @@ -26,6 +32,8 @@ GradientClipByNorm :members: :noindex: +.. _api_fluid_clip_GradientClipByGlobalNorm: + GradientClipByGlobalNorm ------------------------ @@ -33,15 +41,3 @@ GradientClipByGlobalNorm :members: :noindex: -append_gradient_clip_ops ------------------------- - -.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops - :noindex: - -error_clip_callback -------------------- - -.. autofunction:: paddle.fluid.clip.error_clip_callback - :noindex: - diff --git a/doc/fluid/api/data.rst b/doc/fluid/api/data.rst deleted file mode 100644 index b56c7332cc284649c7e04328e51a7faa78593a39..0000000000000000000000000000000000000000 --- a/doc/fluid/api/data.rst +++ /dev/null @@ -1,10 +0,0 @@ -================================== -Data Reader Interface and DataSets -================================== - -.. toctree:: - :maxdepth: 1 - - data/data_reader.rst - data/image.rst - data/dataset.rst diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst index 3df5c0307ffed9d101da58b385840b115920e906..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8 100644 --- a/doc/fluid/api/data_feeder.rst +++ b/doc/fluid/api/data_feeder.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -data_feeder -=========== +================= +fluid.data_feeder +================= + +.. _api_fluid_data_feeder_DataFeeder: DataFeeder ---------- diff --git a/doc/fluid/api/detection.rst b/doc/fluid/api/detection.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/doc/fluid/api/evaluator.rst b/doc/fluid/api/evaluator.rst deleted file mode 100644 index c0dc9a0d1d9f2f70948dc3c905dca25d7dd43742..0000000000000000000000000000000000000000 --- a/doc/fluid/api/evaluator.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` - !DO NOT EDIT THIS FILE MANUALLY! - -========= -evaluator -========= - diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst index f67a14c49f372e67d18ec8e6f87da01109376d22..db2842e7f23e74130a966bb347004bee1ccb08fd 100644 --- a/doc/fluid/api/executor.rst +++ b/doc/fluid/api/executor.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -executor -======== +============== +fluid.executor +============== + +.. _api_fluid_executor_Executor: Executor -------- @@ -12,24 +14,32 @@ Executor :members: :noindex: +.. _api_fluid_executor_global_scope: + global_scope ------------ .. 
autofunction:: paddle.fluid.executor.global_scope :noindex: +.. _api_fluid_executor_scope_guard: + scope_guard ----------- .. autofunction:: paddle.fluid.executor.scope_guard :noindex: -switch_scope ------------- +.. _api_fluid_executor__switch_scope: + +_switch_scope +------------- -.. autofunction:: paddle.fluid.executor.switch_scope +.. autofunction:: paddle.fluid.executor._switch_scope :noindex: +.. _api_fluid_executor_fetch_var: + fetch_var --------- diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst new file mode 100644 index 0000000000000000000000000000000000000000..51cdfe0c2ed045a5b3247c4fdec9868d756eae86 --- /dev/null +++ b/doc/fluid/api/fluid.rst @@ -0,0 +1,378 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===== +fluid +===== + +.. _api_fluid_Block: + +Block +----- + +.. autoclass:: paddle.fluid.Block + :members: + :noindex: + +.. _api_fluid_Variable: + +Variable +-------- + +.. autoclass:: paddle.fluid.Variable + :members: + :noindex: + +.. _api_fluid_Program: + +Program +------- + +.. autoclass:: paddle.fluid.Program + :members: + :noindex: + +.. _api_fluid_Operator: + +Operator +-------- + +.. autoclass:: paddle.fluid.Operator + :members: + :noindex: + +.. _api_fluid_default_startup_program: + +default_startup_program +----------------------- + +.. autofunction:: paddle.fluid.default_startup_program + :noindex: + +.. _api_fluid_default_main_program: + +default_main_program +-------------------- + +.. autofunction:: paddle.fluid.default_main_program + :noindex: + +.. _api_fluid_program_guard: + +program_guard +------------- + +.. autofunction:: paddle.fluid.program_guard + :noindex: + +.. _api_fluid_get_var: + +get_var +------- + +.. autofunction:: paddle.fluid.get_var + :noindex: + +.. _api_fluid_Executor: + +Executor +-------- + +.. autoclass:: paddle.fluid.Executor + :members: + :noindex: + +.. _api_fluid_global_scope: + +global_scope +------------ + +.. autofunction:: paddle.fluid.global_scope + :noindex: + +.. _api_fluid_scope_guard: + +scope_guard +----------- + +.. autofunction:: paddle.fluid.scope_guard + :noindex: + +.. _api_fluid__switch_scope: + +_switch_scope +------------- + +.. autofunction:: paddle.fluid._switch_scope + :noindex: + +.. _api_fluid_fetch_var: + +fetch_var +--------- + +.. autofunction:: paddle.fluid.fetch_var + :noindex: + +.. _api_fluid_Go: + +Go +-- + +.. autoclass:: paddle.fluid.Go + :members: + :noindex: + +.. _api_fluid_make_channel: + +make_channel +------------ + +.. autofunction:: paddle.fluid.make_channel + :noindex: + +.. _api_fluid_channel_send: + +channel_send +------------ + +.. autofunction:: paddle.fluid.channel_send + :noindex: + +.. _api_fluid_channel_recv: + +channel_recv +------------ + +.. autofunction:: paddle.fluid.channel_recv + :noindex: + +.. _api_fluid_channel_close: + +channel_close +------------- + +.. autofunction:: paddle.fluid.channel_close + :noindex: + +.. _api_fluid_Select: + +Select +------ + +.. autoclass:: paddle.fluid.Select + :members: + :noindex: + +.. _api_fluid_Trainer: + +Trainer +------- + +.. autoclass:: paddle.fluid.Trainer + :members: + :noindex: + +.. _api_fluid_BeginEpochEvent: + +BeginEpochEvent +--------------- + +.. autoclass:: paddle.fluid.BeginEpochEvent + :members: + :noindex: + +.. _api_fluid_EndEpochEvent: + +EndEpochEvent +------------- + +.. autoclass:: paddle.fluid.EndEpochEvent + :members: + :noindex: + +.. _api_fluid_BeginStepEvent: + +BeginStepEvent +-------------- + +.. 
autoclass:: paddle.fluid.BeginStepEvent + :members: + :noindex: + +.. _api_fluid_EndStepEvent: + +EndStepEvent +------------ + +.. autoclass:: paddle.fluid.EndStepEvent + :members: + :noindex: + +.. _api_fluid_CheckpointConfig: + +CheckpointConfig +---------------- + +.. autoclass:: paddle.fluid.CheckpointConfig + :members: + :noindex: + +.. _api_fluid_Inferencer: + +Inferencer +---------- + +.. autoclass:: paddle.fluid.Inferencer + :members: + :noindex: + +.. _api_fluid_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_memory_optimize: + +memory_optimize +--------------- + +.. autofunction:: paddle.fluid.memory_optimize + :noindex: + +.. _api_fluid_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.release_memory + :noindex: + +.. _api_fluid_ParallelExecutor: + +ParallelExecutor +---------------- + +.. autoclass:: paddle.fluid.ParallelExecutor + :members: + :noindex: + +.. _api_fluid_ExecutionStrategy: + +ExecutionStrategy +----------------- + +.. autoclass:: paddle.fluid.ExecutionStrategy + :members: + :noindex: + +.. _api_fluid_BuildStrategy: + +BuildStrategy +------------- + +.. autoclass:: paddle.fluid.BuildStrategy + :members: + :noindex: + +.. _api_fluid_create_lod_tensor: + +create_lod_tensor +----------------- + +.. autofunction:: paddle.fluid.create_lod_tensor + :noindex: + +.. _api_fluid_create_random_int_lodtensor: + +create_random_int_lodtensor +--------------------------- + +.. autofunction:: paddle.fluid.create_random_int_lodtensor + :noindex: + +.. _api_fluid_LoDTensor: + +LoDTensor +--------- + +.. autoclass:: paddle.fluid.LoDTensor + :members: + :noindex: + +.. _api_fluid_CPUPlace: + +CPUPlace +-------- + +.. autoclass:: paddle.fluid.CPUPlace + :members: + :noindex: + +.. _api_fluid_CUDAPlace: + +CUDAPlace +--------- + +.. autoclass:: paddle.fluid.CUDAPlace + :members: + :noindex: + +.. _api_fluid_CUDAPinnedPlace: + +CUDAPinnedPlace +--------------- + +.. autoclass:: paddle.fluid.CUDAPinnedPlace + :members: + :noindex: + +.. _api_fluid_Tensor: + +Tensor +------ + +.. autoclass:: paddle.fluid.Tensor + :members: + :noindex: + +.. _api_fluid_ParamAttr: + +ParamAttr +--------- + +.. autoclass:: paddle.fluid.ParamAttr + :members: + :noindex: + +.. _api_fluid_WeightNormParamAttr: + +WeightNormParamAttr +------------------- + +.. autoclass:: paddle.fluid.WeightNormParamAttr + :members: + :noindex: + +.. _api_fluid_DataFeeder: + +DataFeeder +---------- + +.. autoclass:: paddle.fluid.DataFeeder + :members: + :noindex: + +.. _api_fluid_Scope: + +Scope +----- + +.. autoclass:: paddle.fluid.Scope + :members: + :noindex: + diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py index 89ab880301b6ac687fd61f556f87f03792c37da3..02efce2bf8392c62a7600c272bedcadc6563f927 100644 --- a/doc/fluid/api/gen_doc.py +++ b/doc/fluid/api/gen_doc.py @@ -29,19 +29,27 @@ def parse_arg(): class DocGenerator(object): - def __init__(self, module_name, stream=sys.stdout): + def __init__(self, module_name=None, stream=sys.stdout): + if module_name == "": + module_name = None self.stream = stream - self.module_name = module_name - if not hasattr(fluid, module_name): - raise ValueError("Cannot find fluid.{0}".format(module_name)) + if module_name is None: + self.module_name = "fluid" else: - self.module = getattr(fluid, module_name) + self.module_name = "fluid." 
+ module_name + if module_name is None: + self.module = fluid + else: + if not hasattr(fluid, module_name): + raise ValueError("Cannot find fluid.{0}".format(module_name)) + else: + self.module = getattr(fluid, module_name) self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! ''') - self._print_header_(module_name, dot='=', is_title=True) + self._print_header_(self.module_name, dot='=', is_title=True) def print_submodule(self, submodule_name): submodule = getattr(self.module, submodule_name) @@ -60,25 +68,29 @@ class DocGenerator(object): self._print_header_(name, dot='=', is_title=False) def print_item(self, name): - item = getattr(self.module, name) + item = getattr(self.module, name, None) + if item is None: + return if isinstance(item, types.TypeType): self.print_class(name) elif isinstance(item, types.FunctionType): self.print_method(name) else: - raise RuntimeError("Unsupported item {0}".format(name)) + pass def print_class(self, name): + self._print_ref_(name) self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1} + self.stream.write('''.. autoclass:: paddle.{0}.{1} :members: :noindex: '''.format(self.module_name, name)) def print_method(self, name): + self._print_ref_(name) self._print_header_(name, dot='-', is_title=False) - self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1} + self.stream.write('''.. autofunction:: paddle.{0}.{1} :noindex: '''.format(self.module_name, name)) @@ -94,6 +106,10 @@ class DocGenerator(object): self.stream.write('\n') self.stream.write('\n') + def _print_ref_(self, name): + self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join( + self.module_name.split(".")), name)) + def main(): args = parse_arg() diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index 27f2419c06b3ba2d29c471c4928d098ccee9ea02..b14ee29873c50fd011f6c48b754767ac8918252a 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,7 +1,9 @@ #!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst +python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst -for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer +for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler do python gen_doc.py ${module} > ${module}.rst done + +python gen_doc.py "" > fluid.rst diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst index 29cea9c68221b921939e8e09072d87f9f604e21b..359406819a993e7eaf2155c839373df44d97b103 100644 --- a/doc/fluid/api/index_en.rst +++ b/doc/fluid/api/index_en.rst @@ -1,10 +1,11 @@ -====================== -Fluid -====================== +============= +API Reference +============= .. toctree:: :maxdepth: 1 + fluid.rst layers.rst data_feeder.rst executor.rst @@ -18,3 +19,8 @@ Fluid regularizer.rst io.rst data.rst + transpiler.rst + recordio_writer.rst + backward.rst + average.rst + profiler.rst diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst index c49a98c744cdf907630ea8c74791ff2021d996e8..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607 100644 --- a/doc/fluid/api/initializer.rst +++ b/doc/fluid/api/initializer.rst @@ -1,9 +1,11 @@ .. 
THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -initializer -=========== +================= +fluid.initializer +================= + +.. _api_fluid_initializer_Constant: Constant -------- @@ -12,6 +14,8 @@ Constant :members: :noindex: +.. _api_fluid_initializer_Uniform: + Uniform ------- @@ -19,6 +23,8 @@ Uniform :members: :noindex: +.. _api_fluid_initializer_Normal: + Normal ------ @@ -26,6 +32,8 @@ Normal :members: :noindex: +.. _api_fluid_initializer_Xavier: + Xavier ------ @@ -33,18 +41,42 @@ Xavier :members: :noindex: +.. _api_fluid_initializer_Bilinear: + +Bilinear +-------- + +.. autoclass:: paddle.fluid.initializer.Bilinear + :members: + :noindex: + +.. _api_fluid_initializer_MSRA: + +MSRA +---- + +.. autoclass:: paddle.fluid.initializer.MSRA + :members: + :noindex: + +.. _api_fluid_initializer_force_init_on_cpu: + force_init_on_cpu ----------------- .. autofunction:: paddle.fluid.initializer.force_init_on_cpu :noindex: +.. _api_fluid_initializer_init_on_cpu: + init_on_cpu ----------- .. autofunction:: paddle.fluid.initializer.init_on_cpu :noindex: +.. _api_fluid_initializer_ConstantInitializer: + ConstantInitializer ------------------- @@ -52,6 +84,8 @@ ConstantInitializer :members: :noindex: +.. _api_fluid_initializer_UniformInitializer: + UniformInitializer ------------------ @@ -59,6 +93,8 @@ UniformInitializer :members: :noindex: +.. _api_fluid_initializer_NormalInitializer: + NormalInitializer ----------------- @@ -66,6 +102,8 @@ NormalInitializer :members: :noindex: +.. _api_fluid_initializer_XavierInitializer: + XavierInitializer ----------------- @@ -73,3 +111,21 @@ XavierInitializer :members: :noindex: +.. _api_fluid_initializer_BilinearInitializer: + +BilinearInitializer +------------------- + +.. autoclass:: paddle.fluid.initializer.BilinearInitializer + :members: + :noindex: + +.. _api_fluid_initializer_MSRAInitializer: + +MSRAInitializer +--------------- + +.. autoclass:: paddle.fluid.initializer.MSRAInitializer + :members: + :noindex: + diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst index dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61..7cee0bc4d9aa2c51517d23a381f14a8f63cc3681 100644 --- a/doc/fluid/api/io.rst +++ b/doc/fluid/api/io.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -== -io -== +======== +fluid.io +======== + +.. _api_fluid_io_save_vars: save_vars --------- @@ -11,51 +13,115 @@ save_vars .. autofunction:: paddle.fluid.io.save_vars :noindex: +.. _api_fluid_io_save_params: + save_params ----------- .. autofunction:: paddle.fluid.io.save_params :noindex: +.. _api_fluid_io_save_persistables: + save_persistables ----------------- .. autofunction:: paddle.fluid.io.save_persistables :noindex: +.. _api_fluid_io_load_vars: + load_vars --------- .. autofunction:: paddle.fluid.io.load_vars :noindex: +.. _api_fluid_io_load_params: + load_params ----------- .. autofunction:: paddle.fluid.io.load_params :noindex: +.. _api_fluid_io_load_persistables: + load_persistables ----------------- .. autofunction:: paddle.fluid.io.load_persistables :noindex: +.. _api_fluid_io_save_inference_model: + save_inference_model -------------------- .. autofunction:: paddle.fluid.io.save_inference_model :noindex: +.. _api_fluid_io_load_inference_model: + load_inference_model -------------------- .. autofunction:: paddle.fluid.io.load_inference_model :noindex: +.. _api_fluid_io_get_inference_program: + get_inference_program --------------------- .. 
autofunction:: paddle.fluid.io.get_inference_program :noindex: +.. _api_fluid_io_save_checkpoint: + +save_checkpoint +--------------- + +.. autofunction:: paddle.fluid.io.save_checkpoint + :noindex: + +.. _api_fluid_io_load_checkpoint: + +load_checkpoint +--------------- + +.. autofunction:: paddle.fluid.io.load_checkpoint + :noindex: + +.. _api_fluid_io_clean_checkpoint: + +clean_checkpoint +---------------- + +.. autofunction:: paddle.fluid.io.clean_checkpoint + :noindex: + +.. _api_fluid_io_load_persist_vars_without_grad: + +load_persist_vars_without_grad +------------------------------ + +.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad + :noindex: + +.. _api_fluid_io_save_persist_vars_without_grad: + +save_persist_vars_without_grad +------------------------------ + +.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad + :noindex: + +.. _api_fluid_io_get_latest_checkpoint_serial: + +get_latest_checkpoint_serial +---------------------------- + +.. autofunction:: paddle.fluid.io.get_latest_checkpoint_serial + :noindex: + diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index 8d1c9247b1250703ee605edd21b1cd8fe74a9787..d443c49657b92583e527035f49e74462cf41487d 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -1,25 +1,31 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -====== -layers -====== +============ +fluid.layers +============ control_flow ============ +.. _api_fluid_layers_split_lod_tensor: + split_lod_tensor ---------------- .. autofunction:: paddle.fluid.layers.split_lod_tensor :noindex: +.. _api_fluid_layers_merge_lod_tensor: + merge_lod_tensor ---------------- .. autofunction:: paddle.fluid.layers.merge_lod_tensor :noindex: +.. _api_fluid_layers_BlockGuard: + BlockGuard ---------- @@ -27,6 +33,8 @@ BlockGuard :members: :noindex: +.. _api_fluid_layers_BlockGuardWithCompletion: + BlockGuardWithCompletion ------------------------ @@ -34,12 +42,7 @@ BlockGuardWithCompletion :members: :noindex: -StaticRNNMemoryLink -------------------- - -.. autoclass:: paddle.fluid.layers.StaticRNNMemoryLink - :members: - :noindex: +.. _api_fluid_layers_WhileGuard: WhileGuard ---------- @@ -48,6 +51,8 @@ WhileGuard :members: :noindex: +.. _api_fluid_layers_While: + While ----- @@ -55,6 +60,8 @@ While :members: :noindex: +.. _api_fluid_layers_Switch: + Switch ------ @@ -62,78 +69,104 @@ Switch :members: :noindex: +.. _api_fluid_layers_lod_rank_table: + lod_rank_table -------------- .. autofunction:: paddle.fluid.layers.lod_rank_table :noindex: +.. _api_fluid_layers_max_sequence_len: + max_sequence_len ---------------- .. autofunction:: paddle.fluid.layers.max_sequence_len :noindex: +.. _api_fluid_layers_lod_tensor_to_array: + lod_tensor_to_array ------------------- .. autofunction:: paddle.fluid.layers.lod_tensor_to_array :noindex: +.. _api_fluid_layers_array_to_lod_tensor: + array_to_lod_tensor ------------------- .. autofunction:: paddle.fluid.layers.array_to_lod_tensor :noindex: +.. _api_fluid_layers_increment: + increment --------- .. autofunction:: paddle.fluid.layers.increment :noindex: +.. _api_fluid_layers_array_write: + array_write ----------- .. autofunction:: paddle.fluid.layers.array_write :noindex: +.. _api_fluid_layers_create_array: + create_array ------------ .. autofunction:: paddle.fluid.layers.create_array :noindex: +.. _api_fluid_layers_less_than: + less_than --------- .. autofunction:: paddle.fluid.layers.less_than :noindex: +.. _api_fluid_layers_equal: + equal ----- .. 
autofunction:: paddle.fluid.layers.equal :noindex: +.. _api_fluid_layers_array_read: + array_read ---------- .. autofunction:: paddle.fluid.layers.array_read :noindex: +.. _api_fluid_layers_shrink_memory: + shrink_memory ------------- .. autofunction:: paddle.fluid.layers.shrink_memory :noindex: +.. _api_fluid_layers_array_length: + array_length ------------ .. autofunction:: paddle.fluid.layers.array_length :noindex: +.. _api_fluid_layers_IfElse: + IfElse ------ @@ -141,6 +174,8 @@ IfElse :members: :noindex: +.. _api_fluid_layers_DynamicRNN: + DynamicRNN ---------- @@ -148,6 +183,8 @@ DynamicRNN :members: :noindex: +.. _api_fluid_layers_ConditionalBlock: + ConditionalBlock ---------------- @@ -155,6 +192,8 @@ ConditionalBlock :members: :noindex: +.. _api_fluid_layers_StaticRNN: + StaticRNN --------- @@ -162,12 +201,16 @@ StaticRNN :members: :noindex: +.. _api_fluid_layers_reorder_lod_tensor_by_rank: + reorder_lod_tensor_by_rank -------------------------- .. autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank :noindex: +.. _api_fluid_layers_ParallelDo: + ParallelDo ---------- @@ -175,15 +218,27 @@ ParallelDo :members: :noindex: +.. _api_fluid_layers_Print: + Print ----- .. autofunction:: paddle.fluid.layers.Print :noindex: +.. _api_fluid_layers_is_empty: + +is_empty +-------- + +.. autofunction:: paddle.fluid.layers.is_empty + :noindex: + device ====== +.. _api_fluid_layers_get_places: + get_places ---------- @@ -193,12 +248,16 @@ get_places io == +.. _api_fluid_layers_data: + data ---- .. autofunction:: paddle.fluid.layers.data :noindex: +.. _api_fluid_layers_BlockGuardServ: + BlockGuardServ -------------- @@ -206,6 +265,8 @@ BlockGuardServ :members: :noindex: +.. _api_fluid_layers_ListenAndServ: + ListenAndServ ------------- @@ -213,195 +274,291 @@ ListenAndServ :members: :noindex: +.. _api_fluid_layers_Send: + Send ---- .. autofunction:: paddle.fluid.layers.Send :noindex: +.. _api_fluid_layers_Recv: + +Recv +---- + +.. autofunction:: paddle.fluid.layers.Recv + :noindex: + +.. _api_fluid_layers_open_recordio_file: + open_recordio_file ------------------ .. autofunction:: paddle.fluid.layers.open_recordio_file :noindex: +.. _api_fluid_layers_open_files: + open_files ---------- .. autofunction:: paddle.fluid.layers.open_files :noindex: +.. _api_fluid_layers_read_file: + read_file --------- .. autofunction:: paddle.fluid.layers.read_file :noindex: +.. _api_fluid_layers_shuffle: + shuffle ------- .. autofunction:: paddle.fluid.layers.shuffle :noindex: +.. _api_fluid_layers_batch: + batch ----- .. autofunction:: paddle.fluid.layers.batch :noindex: +.. _api_fluid_layers_double_buffer: + double_buffer ------------- .. autofunction:: paddle.fluid.layers.double_buffer :noindex: +.. _api_fluid_layers_random_data_generator: + +random_data_generator +--------------------- + +.. autofunction:: paddle.fluid.layers.random_data_generator + :noindex: + +.. _api_fluid_layers_Preprocessor: + +Preprocessor +------------ + +.. autoclass:: paddle.fluid.layers.Preprocessor + :members: + :noindex: + +.. _api_fluid_layers_load: + +load +---- + +.. autofunction:: paddle.fluid.layers.load + :noindex: + nn == +.. _api_fluid_layers_fc: + fc -- .. autofunction:: paddle.fluid.layers.fc :noindex: +.. _api_fluid_layers_embedding: + embedding --------- .. autofunction:: paddle.fluid.layers.embedding :noindex: +.. _api_fluid_layers_dynamic_lstm: + dynamic_lstm ------------ .. autofunction:: paddle.fluid.layers.dynamic_lstm :noindex: +.. _api_fluid_layers_dynamic_lstmp: + dynamic_lstmp ------------- .. 
autofunction:: paddle.fluid.layers.dynamic_lstmp :noindex: +.. _api_fluid_layers_dynamic_gru: + dynamic_gru ----------- .. autofunction:: paddle.fluid.layers.dynamic_gru :noindex: +.. _api_fluid_layers_gru_unit: + gru_unit -------- .. autofunction:: paddle.fluid.layers.gru_unit :noindex: +.. _api_fluid_layers_linear_chain_crf: + linear_chain_crf ---------------- .. autofunction:: paddle.fluid.layers.linear_chain_crf :noindex: +.. _api_fluid_layers_crf_decoding: + crf_decoding ------------ .. autofunction:: paddle.fluid.layers.crf_decoding :noindex: +.. _api_fluid_layers_cos_sim: + cos_sim ------- .. autofunction:: paddle.fluid.layers.cos_sim :noindex: +.. _api_fluid_layers_cross_entropy: + cross_entropy ------------- .. autofunction:: paddle.fluid.layers.cross_entropy :noindex: +.. _api_fluid_layers_square_error_cost: + square_error_cost ----------------- .. autofunction:: paddle.fluid.layers.square_error_cost :noindex: +.. _api_fluid_layers_chunk_eval: + chunk_eval ---------- .. autofunction:: paddle.fluid.layers.chunk_eval :noindex: +.. _api_fluid_layers_sequence_conv: + sequence_conv ------------- .. autofunction:: paddle.fluid.layers.sequence_conv :noindex: +.. _api_fluid_layers_conv2d: + conv2d ------ .. autofunction:: paddle.fluid.layers.conv2d :noindex: +.. _api_fluid_layers_conv3d: + conv3d ------ .. autofunction:: paddle.fluid.layers.conv3d :noindex: +.. _api_fluid_layers_sequence_pool: + sequence_pool ------------- .. autofunction:: paddle.fluid.layers.sequence_pool :noindex: +.. _api_fluid_layers_sequence_softmax: + sequence_softmax ---------------- .. autofunction:: paddle.fluid.layers.sequence_softmax :noindex: +.. _api_fluid_layers_softmax: + softmax ------- .. autofunction:: paddle.fluid.layers.softmax :noindex: +.. _api_fluid_layers_pool2d: + pool2d ------ .. autofunction:: paddle.fluid.layers.pool2d :noindex: +.. _api_fluid_layers_pool3d: + pool3d ------ .. autofunction:: paddle.fluid.layers.pool3d :noindex: +.. _api_fluid_layers_batch_norm: + batch_norm ---------- .. autofunction:: paddle.fluid.layers.batch_norm :noindex: +.. _api_fluid_layers_beam_search_decode: + beam_search_decode ------------------ .. autofunction:: paddle.fluid.layers.beam_search_decode :noindex: +.. _api_fluid_layers_conv2d_transpose: + conv2d_transpose ---------------- .. autofunction:: paddle.fluid.layers.conv2d_transpose :noindex: +.. _api_fluid_layers_conv3d_transpose: + conv3d_transpose ---------------- -.. autofunction:: paddle.fluid.layers.conv2d_transpose +.. autofunction:: paddle.fluid.layers.conv3d_transpose :noindex: +.. _api_fluid_layers_sequence_expand: sequence_expand --------------- @@ -409,320 +566,498 @@ sequence_expand .. autofunction:: paddle.fluid.layers.sequence_expand :noindex: +.. _api_fluid_layers_lstm_unit: + lstm_unit --------- .. autofunction:: paddle.fluid.layers.lstm_unit :noindex: +.. _api_fluid_layers_reduce_sum: + reduce_sum ---------- .. autofunction:: paddle.fluid.layers.reduce_sum :noindex: +.. _api_fluid_layers_reduce_mean: + reduce_mean ----------- .. autofunction:: paddle.fluid.layers.reduce_mean :noindex: +.. _api_fluid_layers_reduce_max: + reduce_max ---------- .. autofunction:: paddle.fluid.layers.reduce_max :noindex: +.. _api_fluid_layers_reduce_min: + reduce_min ---------- .. autofunction:: paddle.fluid.layers.reduce_min :noindex: +.. _api_fluid_layers_reduce_prod: + reduce_prod ----------- .. autofunction:: paddle.fluid.layers.reduce_prod :noindex: +.. _api_fluid_layers_sequence_first_step: + sequence_first_step ------------------- .. 
autofunction:: paddle.fluid.layers.sequence_first_step :noindex: +.. _api_fluid_layers_sequence_last_step: + sequence_last_step ------------------ .. autofunction:: paddle.fluid.layers.sequence_last_step :noindex: +.. _api_fluid_layers_dropout: + dropout ------- .. autofunction:: paddle.fluid.layers.dropout :noindex: +.. _api_fluid_layers_split: + split ----- .. autofunction:: paddle.fluid.layers.split :noindex: +.. _api_fluid_layers_ctc_greedy_decoder: + ctc_greedy_decoder ------------------ .. autofunction:: paddle.fluid.layers.ctc_greedy_decoder :noindex: +.. _api_fluid_layers_edit_distance: + edit_distance ------------- .. autofunction:: paddle.fluid.layers.edit_distance :noindex: +.. _api_fluid_layers_l2_normalize: + l2_normalize ------------ .. autofunction:: paddle.fluid.layers.l2_normalize :noindex: +.. _api_fluid_layers_matmul: + matmul ------ .. autofunction:: paddle.fluid.layers.matmul :noindex: +.. _api_fluid_layers_topk: + topk ---- .. autofunction:: paddle.fluid.layers.topk :noindex: +.. _api_fluid_layers_warpctc: + warpctc ------- .. autofunction:: paddle.fluid.layers.warpctc :noindex: +.. _api_fluid_layers_sequence_reshape: + sequence_reshape ---------------- .. autofunction:: paddle.fluid.layers.sequence_reshape :noindex: +.. _api_fluid_layers_transpose: + transpose --------- .. autofunction:: paddle.fluid.layers.transpose :noindex: +.. _api_fluid_layers_im2sequence: + im2sequence ----------- .. autofunction:: paddle.fluid.layers.im2sequence :noindex: +.. _api_fluid_layers_nce: + nce --- .. autofunction:: paddle.fluid.layers.nce :noindex: +.. _api_fluid_layers_beam_search: + beam_search ----------- .. autofunction:: paddle.fluid.layers.beam_search :noindex: +.. _api_fluid_layers_row_conv: + row_conv -------- .. autofunction:: paddle.fluid.layers.row_conv :noindex: +.. _api_fluid_layers_multiplex: + multiplex --------- .. autofunction:: paddle.fluid.layers.multiplex :noindex: +.. _api_fluid_layers_layer_norm: + layer_norm ---------- .. autofunction:: paddle.fluid.layers.layer_norm :noindex: +.. _api_fluid_layers_softmax_with_cross_entropy: + softmax_with_cross_entropy -------------------------- .. autofunction:: paddle.fluid.layers.softmax_with_cross_entropy :noindex: +.. _api_fluid_layers_smooth_l1: + smooth_l1 --------- .. autofunction:: paddle.fluid.layers.smooth_l1 :noindex: +.. _api_fluid_layers_one_hot: + one_hot ------- .. autofunction:: paddle.fluid.layers.one_hot :noindex: +.. _api_fluid_layers_autoincreased_step_counter: + autoincreased_step_counter -------------------------- .. autofunction:: paddle.fluid.layers.autoincreased_step_counter :noindex: +.. _api_fluid_layers_reshape: + reshape ------- .. autofunction:: paddle.fluid.layers.reshape :noindex: +.. _api_fluid_layers_lod_reset: + lod_reset --------- .. autofunction:: paddle.fluid.layers.lod_reset :noindex: +.. _api_fluid_layers_lrn: + lrn --- .. autofunction:: paddle.fluid.layers.lrn :noindex: +.. _api_fluid_layers_pad: + pad --- .. autofunction:: paddle.fluid.layers.pad :noindex: +.. _api_fluid_layers_label_smooth: + label_smooth ------------ .. autofunction:: paddle.fluid.layers.label_smooth :noindex: +.. _api_fluid_layers_roi_pool: + roi_pool -------- .. autofunction:: paddle.fluid.layers.roi_pool :noindex: +.. _api_fluid_layers_dice_loss: + +dice_loss +--------- + +.. autofunction:: paddle.fluid.layers.dice_loss + :noindex: + +.. _api_fluid_layers_image_resize: + +image_resize +------------ + +.. autofunction:: paddle.fluid.layers.image_resize + :noindex: + +.. 
_api_fluid_layers_image_resize_short: + +image_resize_short +------------------ + +.. autofunction:: paddle.fluid.layers.image_resize_short + :noindex: + +.. _api_fluid_layers_resize_bilinear: + +resize_bilinear +--------------- + +.. autofunction:: paddle.fluid.layers.resize_bilinear + :noindex: + +.. _api_fluid_layers_gather: + +gather +------ + +.. autofunction:: paddle.fluid.layers.gather + :noindex: + +.. _api_fluid_layers_random_crop: + +random_crop +----------- + +.. autofunction:: paddle.fluid.layers.random_crop + :noindex: + +.. _api_fluid_layers_mean_iou: + +mean_iou +-------- + +.. autofunction:: paddle.fluid.layers.mean_iou + :noindex: + +.. _api_fluid_layers_relu: + +relu +---- + +.. autofunction:: paddle.fluid.layers.relu + :noindex: + +.. _api_fluid_layers_log: + +log +--- + +.. autofunction:: paddle.fluid.layers.log + :noindex: + +.. _api_fluid_layers_crop: + +crop +---- + +.. autofunction:: paddle.fluid.layers.crop + :noindex: + ops === +.. _api_fluid_layers_mean: + mean ---- .. autofunction:: paddle.fluid.layers.mean :noindex: +.. _api_fluid_layers_mul: + mul --- .. autofunction:: paddle.fluid.layers.mul :noindex: +.. _api_fluid_layers_scale: + scale ----- .. autofunction:: paddle.fluid.layers.scale :noindex: +.. _api_fluid_layers_sigmoid_cross_entropy_with_logits: + sigmoid_cross_entropy_with_logits --------------------------------- .. autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits :noindex: +.. _api_fluid_layers_elementwise_add: + elementwise_add --------------- .. autofunction:: paddle.fluid.layers.elementwise_add :noindex: +.. _api_fluid_layers_elementwise_div: + elementwise_div --------------- .. autofunction:: paddle.fluid.layers.elementwise_div :noindex: +.. _api_fluid_layers_elementwise_sub: + elementwise_sub --------------- .. autofunction:: paddle.fluid.layers.elementwise_sub :noindex: +.. _api_fluid_layers_elementwise_mul: + elementwise_mul --------------- .. autofunction:: paddle.fluid.layers.elementwise_mul :noindex: +.. _api_fluid_layers_elementwise_max: + elementwise_max --------------- .. autofunction:: paddle.fluid.layers.elementwise_max :noindex: +.. _api_fluid_layers_elementwise_min: + elementwise_min --------------- .. autofunction:: paddle.fluid.layers.elementwise_min :noindex: +.. _api_fluid_layers_elementwise_pow: + elementwise_pow --------------- .. autofunction:: paddle.fluid.layers.elementwise_pow :noindex: +.. _api_fluid_layers_clip: + clip ---- .. autofunction:: paddle.fluid.layers.clip :noindex: +.. _api_fluid_layers_clip_by_norm: + clip_by_norm ------------ .. autofunction:: paddle.fluid.layers.clip_by_norm :noindex: +.. _api_fluid_layers_logical_and: + logical_and ----------- .. autofunction:: paddle.fluid.layers.logical_and :noindex: +.. _api_fluid_layers_logical_or: + logical_or ---------- .. autofunction:: paddle.fluid.layers.logical_or :noindex: +.. _api_fluid_layers_logical_xor: + logical_xor ----------- .. autofunction:: paddle.fluid.layers.logical_xor :noindex: +.. _api_fluid_layers_logical_not: + logical_not ----------- .. autofunction:: paddle.fluid.layers.logical_not :noindex: -uniform_random --------------- - -.. autofunction:: paddle.fluid.layers.uniform_random - :noindex: +.. _api_fluid_layers_uniform_random_batch_size_like: uniform_random_batch_size_like ------------------------------ @@ -730,23 +1065,23 @@ uniform_random_batch_size_like .. autofunction:: paddle.fluid.layers.uniform_random_batch_size_like :noindex: +.. _api_fluid_layers_gaussian_random: + gaussian_random --------------- .. 
autofunction:: paddle.fluid.layers.gaussian_random :noindex: +.. _api_fluid_layers_gaussian_random_batch_size_like: + gaussian_random_batch_size_like ------------------------------- .. autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like :noindex: -cumsum ------- - -.. autofunction:: paddle.fluid.layers.cumsum - :noindex: +.. _api_fluid_layers_scatter: scatter ------- @@ -754,35 +1089,79 @@ scatter .. autofunction:: paddle.fluid.layers.scatter :noindex: +.. _api_fluid_layers_sum: + sum --- .. autofunction:: paddle.fluid.layers.sum :noindex: +.. _api_fluid_layers_slice: + +slice +----- + +.. autofunction:: paddle.fluid.layers.slice + :noindex: + +.. _api_fluid_layers_polygon_box_transform: + +polygon_box_transform +--------------------- + +.. autofunction:: paddle.fluid.layers.polygon_box_transform + :noindex: + +.. _api_fluid_layers_shape: + +shape +----- + +.. autofunction:: paddle.fluid.layers.shape + :noindex: + +.. _api_fluid_layers_iou_similarity: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +.. _api_fluid_layers_maxout: + +maxout +------ + +.. autofunction:: paddle.fluid.layers.maxout + :noindex: + +.. _api_fluid_layers_sigmoid: + sigmoid ------- .. autofunction:: paddle.fluid.layers.sigmoid :noindex: +.. _api_fluid_layers_logsigmoid: + logsigmoid ---------- .. autofunction:: paddle.fluid.layers.logsigmoid :noindex: +.. _api_fluid_layers_exp: + exp --- .. autofunction:: paddle.fluid.layers.exp :noindex: -relu ----- - -.. autofunction:: paddle.fluid.layers.relu - :noindex: +.. _api_fluid_layers_tanh: tanh ---- @@ -790,71 +1169,87 @@ tanh .. autofunction:: paddle.fluid.layers.tanh :noindex: +.. _api_fluid_layers_tanh_shrink: + tanh_shrink ----------- .. autofunction:: paddle.fluid.layers.tanh_shrink :noindex: +.. _api_fluid_layers_softshrink: + softshrink ---------- .. autofunction:: paddle.fluid.layers.softshrink :noindex: +.. _api_fluid_layers_sqrt: + sqrt ---- .. autofunction:: paddle.fluid.layers.sqrt :noindex: +.. _api_fluid_layers_abs: + abs --- .. autofunction:: paddle.fluid.layers.abs :noindex: +.. _api_fluid_layers_ceil: + ceil ---- .. autofunction:: paddle.fluid.layers.ceil :noindex: +.. _api_fluid_layers_floor: + floor ----- .. autofunction:: paddle.fluid.layers.floor :noindex: +.. _api_fluid_layers_cos: + cos --- .. autofunction:: paddle.fluid.layers.cos :noindex: +.. _api_fluid_layers_sin: + sin --- .. autofunction:: paddle.fluid.layers.sin :noindex: +.. _api_fluid_layers_round: + round ----- .. autofunction:: paddle.fluid.layers.round :noindex: +.. _api_fluid_layers_reciprocal: + reciprocal ---------- .. autofunction:: paddle.fluid.layers.reciprocal :noindex: -log ---- - -.. autofunction:: paddle.fluid.layers.log - :noindex: +.. _api_fluid_layers_square: square ------ @@ -862,71 +1257,79 @@ square .. autofunction:: paddle.fluid.layers.square :noindex: +.. _api_fluid_layers_softplus: + softplus -------- .. autofunction:: paddle.fluid.layers.softplus :noindex: +.. _api_fluid_layers_softsign: + softsign -------- .. autofunction:: paddle.fluid.layers.softsign :noindex: +.. _api_fluid_layers_brelu: + brelu ----- .. autofunction:: paddle.fluid.layers.brelu :noindex: +.. _api_fluid_layers_leaky_relu: + leaky_relu ---------- .. autofunction:: paddle.fluid.layers.leaky_relu :noindex: +.. _api_fluid_layers_soft_relu: + soft_relu --------- .. autofunction:: paddle.fluid.layers.soft_relu :noindex: +.. _api_fluid_layers_elu: + elu --- .. autofunction:: paddle.fluid.layers.elu :noindex: +.. 
_api_fluid_layers_relu6: + relu6 ----- .. autofunction:: paddle.fluid.layers.relu6 :noindex: +.. _api_fluid_layers_pow: + pow --- .. autofunction:: paddle.fluid.layers.pow :noindex: +.. _api_fluid_layers_stanh: + stanh ----- .. autofunction:: paddle.fluid.layers.stanh :noindex: -hard_shrink ------------ - -.. autofunction:: paddle.fluid.layers.hard_shrink - :noindex: - -thresholded_relu ----------------- - -.. autofunction:: paddle.fluid.layers.thresholded_relu - :noindex: +.. _api_fluid_layers_hard_sigmoid: hard_sigmoid ------------ @@ -934,168 +1337,434 @@ hard_sigmoid .. autofunction:: paddle.fluid.layers.hard_sigmoid :noindex: +.. _api_fluid_layers_swish: + swish ----- .. autofunction:: paddle.fluid.layers.swish :noindex: +.. _api_fluid_layers_uniform_random: + +uniform_random +-------------- + +.. autofunction:: paddle.fluid.layers.uniform_random + :noindex: + +.. _api_fluid_layers_hard_shrink: + +hard_shrink +----------- + +.. autofunction:: paddle.fluid.layers.hard_shrink + :noindex: + +.. _api_fluid_layers_cumsum: + +cumsum +------ + +.. autofunction:: paddle.fluid.layers.cumsum + :noindex: + +.. _api_fluid_layers_thresholded_relu: + +thresholded_relu +---------------- + +.. autofunction:: paddle.fluid.layers.thresholded_relu + :noindex: + tensor ====== +.. _api_fluid_layers_create_tensor: + create_tensor ------------- .. autofunction:: paddle.fluid.layers.create_tensor :noindex: +.. _api_fluid_layers_create_parameter: + create_parameter ---------------- .. autofunction:: paddle.fluid.layers.create_parameter :noindex: +.. _api_fluid_layers_create_global_var: + create_global_var ----------------- .. autofunction:: paddle.fluid.layers.create_global_var :noindex: +.. _api_fluid_layers_cast: + cast ---- .. autofunction:: paddle.fluid.layers.cast :noindex: +.. _api_fluid_layers_concat: + concat ------ .. autofunction:: paddle.fluid.layers.concat :noindex: +.. _api_fluid_layers_sums: + sums ---- .. autofunction:: paddle.fluid.layers.sums :noindex: +.. _api_fluid_layers_assign: + assign ------ .. autofunction:: paddle.fluid.layers.assign :noindex: +.. _api_fluid_layers_fill_constant_batch_size_like: + fill_constant_batch_size_like ----------------------------- .. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like :noindex: +.. _api_fluid_layers_fill_constant: + fill_constant ------------- .. autofunction:: paddle.fluid.layers.fill_constant :noindex: +.. _api_fluid_layers_argmin: + +argmin +------ + +.. autofunction:: paddle.fluid.layers.argmin + :noindex: + +.. _api_fluid_layers_argmax: + +argmax +------ + +.. autofunction:: paddle.fluid.layers.argmax + :noindex: + +.. _api_fluid_layers_argsort: + +argsort +------- + +.. autofunction:: paddle.fluid.layers.argsort + :noindex: + +.. _api_fluid_layers_ones: + ones ---- .. autofunction:: paddle.fluid.layers.ones :noindex: +.. _api_fluid_layers_zeros: + zeros ----- .. autofunction:: paddle.fluid.layers.zeros :noindex: +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse + :noindex: + +learning_rate_scheduler +======================= + +.. _api_fluid_layers_exponential_decay: + +exponential_decay +----------------- + +.. autofunction:: paddle.fluid.layers.exponential_decay + :noindex: + +.. _api_fluid_layers_natural_exp_decay: + +natural_exp_decay +----------------- + +.. autofunction:: paddle.fluid.layers.natural_exp_decay + :noindex: + +.. _api_fluid_layers_inverse_time_decay: + +inverse_time_decay +------------------ + +.. 
autofunction:: paddle.fluid.layers.inverse_time_decay + :noindex: + +.. _api_fluid_layers_polynomial_decay: + +polynomial_decay +---------------- + +.. autofunction:: paddle.fluid.layers.polynomial_decay + :noindex: + +.. _api_fluid_layers_piecewise_decay: + +piecewise_decay +--------------- + +.. autofunction:: paddle.fluid.layers.piecewise_decay + :noindex: + +.. _api_fluid_layers_noam_decay: + +noam_decay +---------- + +.. autofunction:: paddle.fluid.layers.noam_decay + :noindex: + +.. _api_fluid_layers_append_LARS: + +append_LARS +----------- + +.. autofunction:: paddle.fluid.layers.append_LARS + :noindex: + detection ========= +.. _api_fluid_layers_prior_box: + +prior_box +--------- + +.. autofunction:: paddle.fluid.layers.prior_box + :noindex: + +.. _api_fluid_layers_multi_box_head: + multi_box_head -------------- .. autofunction:: paddle.fluid.layers.multi_box_head :noindex: +.. _api_fluid_layers_bipartite_match: + bipartite_match --------------- .. autofunction:: paddle.fluid.layers.bipartite_match :noindex: +.. _api_fluid_layers_target_assign: + target_assign ------------- .. autofunction:: paddle.fluid.layers.target_assign :noindex: +.. _api_fluid_layers_detection_output: + detection_output ---------------- .. autofunction:: paddle.fluid.layers.detection_output :noindex: +.. _api_fluid_layers_ssd_loss: + ssd_loss -------- .. autofunction:: paddle.fluid.layers.ssd_loss :noindex: +.. _api_fluid_layers_detection_map: + detection_map ------------- .. autofunction:: paddle.fluid.layers.detection_map :noindex: +.. _api_fluid_layers_iou_similarity: + iou_similarity -------------- .. autofunction:: paddle.fluid.layers.iou_similarity :noindex: +.. _api_fluid_layers_box_coder: + box_coder --------- .. autofunction:: paddle.fluid.layers.box_coder :noindex: -learning_rate_scheduler -======================= +metric_op +========= -exponential_decay ------------------ +.. _api_fluid_layers_accuracy: -.. autofunction:: paddle.fluid.layers.exponential_decay +accuracy +-------- + +.. autofunction:: paddle.fluid.layers.accuracy :noindex: -natural_exp_decay ------------------ +.. _api_fluid_layers_auc: -.. autofunction:: paddle.fluid.layers.natural_exp_decay +auc +--- + +.. autofunction:: paddle.fluid.layers.auc :noindex: -inverse_time_decay ------------------- +tensor +====== -.. autofunction:: paddle.fluid.layers.inverse_time_decay +.. _api_fluid_layers_create_tensor: + +create_tensor +------------- + +.. autofunction:: paddle.fluid.layers.create_tensor :noindex: -polynomial_decay +.. _api_fluid_layers_create_parameter: + +create_parameter ---------------- -.. autofunction:: paddle.fluid.layers.polynomial_decay +.. autofunction:: paddle.fluid.layers.create_parameter :noindex: -piecewise_decay ---------------- +.. _api_fluid_layers_create_global_var: -.. autofunction:: paddle.fluid.layers.piecewise_decay +create_global_var +----------------- + +.. autofunction:: paddle.fluid.layers.create_global_var :noindex: -noam_decay ----------- +.. _api_fluid_layers_cast: -.. autofunction:: paddle.fluid.layers.noam_decay +cast +---- + +.. autofunction:: paddle.fluid.layers.cast + :noindex: + +.. _api_fluid_layers_concat: + +concat +------ + +.. autofunction:: paddle.fluid.layers.concat + :noindex: + +.. _api_fluid_layers_sums: + +sums +---- + +.. autofunction:: paddle.fluid.layers.sums + :noindex: + +.. _api_fluid_layers_assign: + +assign +------ + +.. autofunction:: paddle.fluid.layers.assign + :noindex: + +.. 
_api_fluid_layers_fill_constant_batch_size_like: + +fill_constant_batch_size_like +----------------------------- + +.. autofunction:: paddle.fluid.layers.fill_constant_batch_size_like + :noindex: + +.. _api_fluid_layers_fill_constant: + +fill_constant +------------- + +.. autofunction:: paddle.fluid.layers.fill_constant + :noindex: + +.. _api_fluid_layers_argmin: + +argmin +------ + +.. autofunction:: paddle.fluid.layers.argmin + :noindex: + +.. _api_fluid_layers_argmax: + +argmax +------ + +.. autofunction:: paddle.fluid.layers.argmax + :noindex: + +.. _api_fluid_layers_ones: + +ones +---- + +.. autofunction:: paddle.fluid.layers.ones + :noindex: + +.. _api_fluid_layers_zeros: + +zeros +----- + +.. autofunction:: paddle.fluid.layers.zeros + :noindex: + +.. _api_fluid_layers_reverse: + +reverse +------- + +.. autofunction:: paddle.fluid.layers.reverse :noindex: diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst index ddf07775d7ea293acd421b8549d03b277ff0611d..0f54b2e2eb7ead353215c5dbd529293794e37123 100644 --- a/doc/fluid/api/metrics.rst +++ b/doc/fluid/api/metrics.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======= -metrics -======= +============= +fluid.metrics +============= + +.. _api_fluid_metrics_MetricBase: MetricBase ---------- @@ -12,6 +14,8 @@ MetricBase :members: :noindex: +.. _api_fluid_metrics_CompositeMetric: + CompositeMetric --------------- @@ -19,6 +23,26 @@ CompositeMetric :members: :noindex: +.. _api_fluid_metrics_Precision: + +Precision +--------- + +.. autoclass:: paddle.fluid.metrics.Precision + :members: + :noindex: + +.. _api_fluid_metrics_Recall: + +Recall +------ + +.. autoclass:: paddle.fluid.metrics.Recall + :members: + :noindex: + +.. _api_fluid_metrics_Accuracy: + Accuracy -------- @@ -26,6 +50,8 @@ Accuracy :members: :noindex: +.. _api_fluid_metrics_ChunkEvaluator: + ChunkEvaluator -------------- @@ -33,6 +59,8 @@ ChunkEvaluator :members: :noindex: +.. _api_fluid_metrics_EditDistance: + EditDistance ------------ @@ -40,6 +68,8 @@ EditDistance :members: :noindex: +.. _api_fluid_metrics_DetectionMAP: + DetectionMAP ------------ @@ -47,6 +77,8 @@ DetectionMAP :members: :noindex: +.. _api_fluid_metrics_Auc: + Auc --- diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst index 7ae3187304f386a08c5cb8a4ba093423a58a7f36..059733af18517257b6821d95fd628a9e13e6e98e 100644 --- a/doc/fluid/api/nets.rst +++ b/doc/fluid/api/nets.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -==== -nets -==== +========== +fluid.nets +========== + +.. _api_fluid_nets_simple_img_conv_pool: simple_img_conv_pool -------------------- @@ -11,18 +13,24 @@ simple_img_conv_pool .. autofunction:: paddle.fluid.nets.simple_img_conv_pool :noindex: +.. _api_fluid_nets_sequence_conv_pool: + sequence_conv_pool ------------------ .. autofunction:: paddle.fluid.nets.sequence_conv_pool :noindex: +.. _api_fluid_nets_glu: + glu --- .. autofunction:: paddle.fluid.nets.glu :noindex: +.. _api_fluid_nets_scaled_dot_product_attention: + scaled_dot_product_attention ---------------------------- diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 79a0995fce303518d989693976c4e92e05795ca2..8d792120f2f16a8c92606b343eb4c3d4368bed14 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! 
-========= -optimizer -========= +=============== +fluid.optimizer +=============== + +.. _api_fluid_optimizer_SGD: SGD --- @@ -12,6 +14,8 @@ SGD :members: :noindex: +.. _api_fluid_optimizer_Momentum: + Momentum -------- @@ -19,6 +23,8 @@ Momentum :members: :noindex: +.. _api_fluid_optimizer_Adagrad: + Adagrad ------- @@ -26,6 +32,8 @@ Adagrad :members: :noindex: +.. _api_fluid_optimizer_Adam: + Adam ---- @@ -33,6 +41,8 @@ Adam :members: :noindex: +.. _api_fluid_optimizer_Adamax: + Adamax ------ @@ -40,6 +50,8 @@ Adamax :members: :noindex: +.. _api_fluid_optimizer_DecayedAdagrad: + DecayedAdagrad -------------- @@ -47,6 +59,17 @@ DecayedAdagrad :members: :noindex: +.. _api_fluid_optimizer_Ftrl: + +Ftrl +---- + +.. autoclass:: paddle.fluid.optimizer.Ftrl + :members: + :noindex: + +.. _api_fluid_optimizer_SGDOptimizer: + SGDOptimizer ------------ @@ -54,6 +77,8 @@ SGDOptimizer :members: :noindex: +.. _api_fluid_optimizer_MomentumOptimizer: + MomentumOptimizer ----------------- @@ -61,6 +86,8 @@ MomentumOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdagradOptimizer: + AdagradOptimizer ---------------- @@ -68,6 +95,8 @@ AdagradOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdamOptimizer: + AdamOptimizer ------------- @@ -75,6 +104,8 @@ AdamOptimizer :members: :noindex: +.. _api_fluid_optimizer_AdamaxOptimizer: + AdamaxOptimizer --------------- @@ -82,6 +113,8 @@ AdamaxOptimizer :members: :noindex: +.. _api_fluid_optimizer_DecayedAdagradOptimizer: + DecayedAdagradOptimizer ----------------------- @@ -89,6 +122,26 @@ DecayedAdagradOptimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + +RMSPropOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_FtrlOptimizer: + +FtrlOptimizer +------------- + +.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer + :members: + :noindex: + +.. _api_fluid_optimizer_Adadelta: + Adadelta -------- @@ -96,6 +149,8 @@ Adadelta :members: :noindex: +.. _api_fluid_optimizer_ModelAverage: + ModelAverage ------------ @@ -103,6 +158,8 @@ ModelAverage :members: :noindex: +.. _api_fluid_optimizer_Optimizer: + Optimizer --------- @@ -110,3 +167,12 @@ Optimizer :members: :noindex: +.. _api_fluid_optimizer_RMSPropOptimizer: + +RMSPropOptimizer +---------------- + +.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer + :members: + :noindex: + diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst index 8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075..33035bbc7ca5c8d000adeaf1cb79806a3ea64604 100644 --- a/doc/fluid/api/param_attr.rst +++ b/doc/fluid/api/param_attr.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -========== -param_attr -========== +================ +fluid.param_attr +================ + +.. _api_fluid_param_attr_ParamAttr: ParamAttr --------- @@ -12,6 +14,8 @@ ParamAttr :members: :noindex: +.. _api_fluid_param_attr_WeightNormParamAttr: + WeightNormParamAttr ------------------- diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst index 74d102dcb0db35766c34e3d14939a8aa5861686b..c750a2d588df56728ac7f73051ab7a9e44dee232 100644 --- a/doc/fluid/api/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -======== -profiler -======== +============== +fluid.profiler +============== + +.. _api_fluid_profiler_cuda_profiler: cuda_profiler ------------- @@ -11,15 +13,35 @@ cuda_profiler .. 
autofunction:: paddle.fluid.profiler.cuda_profiler :noindex: +.. _api_fluid_profiler_reset_profiler: + reset_profiler -------------- .. autofunction:: paddle.fluid.profiler.reset_profiler :noindex: +.. _api_fluid_profiler_profiler: + profiler -------- .. autofunction:: paddle.fluid.profiler.profiler :noindex: +.. _api_fluid_profiler_start_profiler: + +start_profiler +-------------- + +.. autofunction:: paddle.fluid.profiler.start_profiler + :noindex: + +.. _api_fluid_profiler_stop_profiler: + +stop_profiler +------------- + +.. autofunction:: paddle.fluid.profiler.stop_profiler + :noindex: + diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst new file mode 100644 index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717 --- /dev/null +++ b/doc/fluid/api/recordio_writer.rst @@ -0,0 +1,23 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +===================== +fluid.recordio_writer +===================== + +.. _api_fluid_recordio_writer_convert_reader_to_recordio_file: + +convert_reader_to_recordio_file +------------------------------- + +.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file + :noindex: + +.. _api_fluid_recordio_writer_convert_reader_to_recordio_files: + +convert_reader_to_recordio_files +-------------------------------- + +.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files + :noindex: + diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst index 756bc53baa0625aef48dad0c35e7ae57421a70d0..987eaea903520d91c284c8da7a8cb066a1648069 100644 --- a/doc/fluid/api/regularizer.rst +++ b/doc/fluid/api/regularizer.rst @@ -1,9 +1,11 @@ .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` !DO NOT EDIT THIS FILE MANUALLY! -=========== -regularizer -=========== +================= +fluid.regularizer +================= + +.. _api_fluid_regularizer_append_regularization_ops: append_regularization_ops ------------------------- @@ -11,12 +13,7 @@ append_regularization_ops .. autofunction:: paddle.fluid.regularizer.append_regularization_ops :noindex: -WeightDecayRegularizer ----------------------- - -.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer - :members: - :noindex: +.. _api_fluid_regularizer_L1Decay: L1Decay ------- @@ -25,6 +22,8 @@ L1Decay :members: :noindex: +.. _api_fluid_regularizer_L2Decay: + L2Decay ------- @@ -32,6 +31,8 @@ L2Decay :members: :noindex: +.. _api_fluid_regularizer_L1DecayRegularizer: + L1DecayRegularizer ------------------ @@ -39,6 +40,8 @@ L1DecayRegularizer :members: :noindex: +.. _api_fluid_regularizer_L2DecayRegularizer: + L2DecayRegularizer ------------------ diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst new file mode 100644 index 0000000000000000000000000000000000000000..d2ac04f1449c32cb414cea1b76d7469bbe9ccb85 --- /dev/null +++ b/doc/fluid/api/transpiler.rst @@ -0,0 +1,59 @@ +.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}` + !DO NOT EDIT THIS FILE MANUALLY! + +================ +fluid.transpiler +================ + +.. _api_fluid_transpiler_DistributeTranspiler: + +DistributeTranspiler +-------------------- + +.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler + :members: + :noindex: + +.. _api_fluid_transpiler_InferenceTranspiler: + +InferenceTranspiler +------------------- + +.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler + :members: + :noindex: + +.. _api_fluid_transpiler_memory_optimize: + +memory_optimize +--------------- + +.. 
autofunction:: paddle.fluid.transpiler.memory_optimize + :noindex: + +.. _api_fluid_transpiler_release_memory: + +release_memory +-------------- + +.. autofunction:: paddle.fluid.transpiler.release_memory + :noindex: + +.. _api_fluid_transpiler_HashName: + +HashName +-------- + +.. autoclass:: paddle.fluid.transpiler.HashName + :members: + :noindex: + +.. _api_fluid_transpiler_RoundRobin: + +RoundRobin +---------- + +.. autoclass:: paddle.fluid.transpiler.RoundRobin + :members: + :noindex: + diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md index d606d7a790b4b0dc18553f2220d39cec8aa619ec..748488f6d5f2f1272e87b89047570632418da8dc 100644 --- a/doc/fluid/design/concepts/lod_tensor.md +++ b/doc/fluid/design/concepts/lod_tensor.md @@ -173,6 +173,7 @@ are transformed into offsets of elements/words as follows: ## Slicing of LoD Tensors + When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch as the **-slice**. For example, the <2>-slice of above example is @@ -189,3 +190,22 @@ and the <2,0>-slice of above slice is 10 12 || ``` + +## Length Representation vs Offset Representation + +The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult. +Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. +Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python: +```Python +# length representation of lod called recursive_sequence_lengths +recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]] +# Create a LoDTensor that has the above recursive_sequence_lengths info. +# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood. +tensor = fluid.LoDTensor(lod) + +# Set/Change the recursive_sequence_lengths info of LoDTensor +tensor.set_recursive_sequence_lengths([[3, 1, 2]]) +# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted +# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]] +new_recursive_seq_lens = tensor.recursive_sequence_lengths() +``` diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md new file mode 100644 index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778 --- /dev/null +++ b/doc/fluid/design/concepts/python_data_feeding.md @@ -0,0 +1,130 @@ +# Python Data Feeding + +In the former implementation of Paddle Fluid, there are two ways to feed data: + +- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details. + +- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. 
Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` on the C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance. + +In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ sides: a `LoDTensorArray` is pushed into the queue on the Python side, while `reader_op` on the C++ side reads the data out of the queue. + + +## Design of LoDTensorBlockingQueue +`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` that accepts `std::vector<framework::LoDTensor>` elements with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`. + +```C++ +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + private: + // `LoDTensorBlockingQueue` can only be constructed by + // `LoDTensorBlockingQueueHolder::InitOnce()` + LoDTensorBlockingQueue(size_t capacity, const std::vector<framework::DDim>& dims); + + public: + size_t Size() const { return queue_.Size(); } // Get the current size of the queue + + size_t Cap() const { return queue_.Cap(); } // Get the capacity of the queue + + void Close() { return queue_.Close(); } + + bool IsClosed() const { return queue_.IsClosed(); } + + // Block if Size() == Cap() + // Return false only when queue_.IsClosed() == true + bool Push(const std::vector<framework::LoDTensor> &lod_tensor_vec); + + // Block if Size() == 0. + // *success == false when queue_.IsClosed() == true + std::vector<framework::LoDTensor> Pop(bool *success = nullptr); + + private: + // Use reader::BlockingQueue as the inner data structure + BlockingQueue<std::vector<framework::LoDTensor>> queue_; + std::vector<framework::DDim> dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + // Call the constructor of `LoDTensorBlockingQueue` to create queue_ + // `InitOnce` can only be called once, otherwise an exception will be raised + void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) { + PADDLE_ENFORCE(queue_ == nullptr); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; } + + private: + std::shared_ptr<LoDTensorBlockingQueue> queue_; +}; +``` + +There are some major points that must be taken care of: +- `LoDTensorBlockingQueueHolder` should be a `Variable` in the global scope, so that `reader_op` can find it when reading data. +- A `Variable` of `LoDTensorBlockingQueueHolder` (but not a `VarDesc`) must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called. +- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input. + + +## Release of the GIL in pybind +`Pybind11::gil_scoped_release` is used to release the GIL (Global Interpreter Lock) when the `LoDTensorBlockingQueue::Push()` or `Executor::Run()` methods are invoked from the Python side, so that `LoDTensorBlockingQueue::Push()` and `Executor::Run()` can run in parallel. + + +## Design of PyReader +`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
+```C++ +class PyReader : public ReaderBase { + public: + explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue); + + void ReadNext(std::vector<framework::LoDTensor>* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + void ReInit() override { return; } + + private: + std::shared_ptr<LoDTensorBlockingQueue> queue_; +}; +``` + + +## Design of CreatePyReaderOp +`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable. +```C++ +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable<framework::ReaderHolder>(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE(queue_holder_var != nullptr); + auto* queue_holder = queue_holder_var + ->template GetMutable<LoDTensorBlockingQueueHolder>(); + out->Reset(new PyReader(queue_holder->GetQueue())); + } +}; +``` + +## Design of Python code +The design of the Python code is as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and initialize it with the given parameters, obtaining the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed which accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and the result of the layer are both returned. +```Python +def py_reader(capacity, shapes): + queue_name = unique_name.generate("lod_tensor_blocking_queue") + var = global_scope().var(queue_name) # create LoDTensorBlockingQueueHolder Variable + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue + out = create_var() + create_py_reader_op_with_queue_name( + inputs={'blocking_queue': queue_name}, + outputs={'Out':[out]}) + return out, feed_queue +``` diff --git a/doc/fluid/design/dist_train/dist_train_nccl2.md b/doc/fluid/design/dist_train/dist_train_nccl2.md new file mode 100644 index 0000000000000000000000000000000000000000..aa7455ec5de0d46d7c2b0cef3b7ebf4754af3cb1 --- /dev/null +++ b/doc/fluid/design/dist_train/dist_train_nccl2.md @@ -0,0 +1,35 @@ +# Distributed Training with NCCL2 + +We design a pattern that enables training with `ParallelExecutor` +using [NCCL2](https://developer.nvidia.com/nccl) as its collective +communication library. + +In `ParallelExecutor` we can use `AllReduce` or `Reduce` and `Broadcast` +to do multi-GPU training. And if we initialize the NCCL2 communicators as +ranks in a distributed environment, we can simply run the `ParallelExecutor` +as a distributed program! The only thing that differs from the +single-node version is that we need to broadcast the NCCL unique ID +to all the nodes and initialize the communicators using that ID, so the NCCL2 +workers will know each other as ranks. + +To achieve this feature, we introduce a new operator: the `gen_nccl_id` op, +so we are ***not*** bound to running NCCL2 with MPI; we can run it on +whatever platform you like. + +It has two running modes: + +1. Generate and broadcast mode, which should be used on trainer 0; +1. Listen and fetch mode, which should be used on trainers other than 0 (a minimal sketch of both modes follows this list).
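To make the two modes concrete, below is a minimal, self-contained Python sketch of the handshake they describe. It is an illustration only: the in-memory `mailbox`, `send`, and `recv` helpers are hypothetical stand-ins for real network transport, and `gen_nccl_id` here is not the actual Fluid operator.

```Python
import uuid

# Hypothetical in-memory "network"; a stand-in for real transport between trainers.
mailbox = {}

def send(endpoint, payload):
    mailbox[endpoint] = payload

def recv(endpoint):
    return mailbox[endpoint]

def gen_nccl_id(trainer_id, endpoints):
    """Sketch of the two modes: trainer 0 generates and broadcasts the unique ID,
    the other trainers listen and fetch it."""
    if trainer_id == 0:
        nccl_id = uuid.uuid4().hex              # generate-and-broadcast mode
        for ep in endpoints[1:]:
            send(ep, nccl_id)
    else:
        nccl_id = recv(endpoints[trainer_id])   # listen-and-fetch mode
    return nccl_id

# Example: two trainers end up holding the same unique ID.
endpoints = ["192.168.1.1:6170", "192.168.1.2:6170"]
assert gen_nccl_id(0, endpoints) == gen_nccl_id(1, endpoints)
```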
+ +In both two modes, this op can save the NCCL ID into current scope as a +persistable variable, Then we can insert this op at the end of +"startup program" of fluid, so that all workers can get the same ID to +initialize NCCL communicator objects. + + + +The above figure indicates the general process when training with NCCL2 +distributed. Each trainer have the number of communicators equal to the +number of GPUs, but the ranks should match the global ranks number: here +we have total 8 GPUs, so `nranks==8`, for each trainer, the ranks should +be from 0 ~ 3 on trainer 0 and 4 ~ 7 on trainer 1. diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md index 988729138926f035750b59eb245dde82502a3ad2..e284e1ec5cdd18d0049ce3c1a8349bbe1248cb48 100644 --- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md +++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md @@ -1,6 +1,6 @@ # Design Doc: Distributed Lookup Table Operator -A lookup table operator in PaddlePaddle where the table could be out +A distribute lookup table operator in PaddlePaddle where the table could be out of the memory of a computer. ## Background @@ -24,14 +24,14 @@ memory, so we'd need a distributed storage service, which supports the lookup of rows. The following figure illustrates the multiplication of x with two -non-zero elements, or say, two symbols, and a lookup table W: +non-zero elements, or say two symbols, and a lookup table W: ![lookup table](./src/lookup_table.png) ### The Backward Algorithm The backward algorithm computes W'(x) using W(x). W'(x) has the same -scale of size as W(x) and is much smaller than W. +the scale of size as W(x) and is much smaller than W. To optimize W given W', we can do simple SGD update: @@ -44,85 +44,46 @@ $$W = f(W, W')$$ The following figure illustrates the backward pass of the lookup operator: ![lookup table training](./src/lookup_table_training.png) -## Distributed Storage Service - -The forward algorithm requires a distributed storage service for W. -The backward algorithm prefers that the storage system can apply the -optimization algorithm on W. The following two sections describe two -solutions -- the former doesn't require that the storage service can -do optimization, the latter does. - -### Storage Service Doesn't Optimize - -In this design, we use highly-optimized distributed storage, e.g., -memcached, as the storage service, and we run the optimization -algorithm on parameter servers of PaddlePaddle. The following figure -illustrates the training process. - - - - - -Each trainer runs the forward and backward passes using their local -data: - -1. In the forward pass, when a trainer runs the forward algorithm of a - lookup operator, it retrieves W(x) from the storage service. -1. The trainer computes W'(x) in the backward pass using W(x). - -During the global update process: - -1. Each trainer uploads its W'(x) to parameter servers. -1. The parameter server runs the optimization algorithm, e.g., the - Adam optimization algorithm, which requires that - 1. The parameter server retrieves W(x) from memcached, and - 1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j - W'(x))$ to memcached, where $f$ denotes the optimization - algorithm. - -### Storage Service Does Optimize - -This design is very similar to the above one, except that the -optimization algorithm $f$ runs on the storage service. 
- -- Pro: parameter servers do not retrieve W(x) from the storage - service, thus saves half network communication. -- Con: the storage service needs to be able to run the optimization - algorithm. - -## Conclusion - -Let us do the "storage service does not optimize" solution first, as a -baseline at least, because it is easier to use a well-optimized -distributed storage service like memcached. We can do the "storage -service does optimize" solution later or at the same time, which, if -implemented carefully, should have better performance than the former. +## Distributed Lookup Table +### Problem 1: The lookup table may be very large. + + In scenarios like search engines and recommendation systems, the number of feature IDs may be very large, say 100,000,000,000; then for a float-value lookup table of size 8, the total size of the table is: + + ``` + 100,000,000,000 * 8 * 4(Bytes) = 2980.23 GB + ``` + +### Solution: Distributed storage + +1. Paddle uses [SelectedRows](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/selected_rows.md) as the storage format for the lookup table. The lookup table parameter will be split across multiple machines according to the hash of the feature ID, and the input data will also be split and sent to the corresponding machine to prefetch the parameter. + +1. For common parameters, the trainer will get the whole parameter for training, but for the big lookup table, the trainer cannot store the whole parameter. Because the input data features are very sparse, each step only needs a few parameters for training, so we use `prefetch_op` to prefetch only the parameters the trainer needs. + +### Problem 2: The IDs in the lookup table are not known before training. + + The feature IDs are calculated by a hash function. Because the feature data source is so large, we cannot get all the IDs before training, so we cannot initialize the table before training. + +### Solution: ID auto growth + +At the beginning of training, Paddle only allocates the memory for the lookup table on the parameter server side; the IDs and their values are not initialized. During training, when a parameter server receives an ID, it returns the existing parameter if the ID is already in the lookup table; if the ID does not exist, Paddle adds it to the lookup table and initializes a value for it. + +### Problem 3: Parameter save and load + +For common parameters, Paddle uses the trainer to save and load them. But for the distributed lookup table, the trainer cannot do this because of the table's large size. + +### Solution: Parameter server side save and load + +Paddle supports parameter-server-side save and load for the distributed lookup table. Each parameter server machine saves and loads only its own part of the whole table. + +## Architecture +The whole architecture of the distributed lookup table is as below: + +### Training steps: +1. Read a batch of data; the data consists of feature IDs. +1. The input IDs are split by `split_ids_op` with the same hash function as the lookup table. +1. `prefetch_op` uses the split result to prefetch parameters back from the lookup table. +1. Run forward-backward to get the gradients of the lookup table. +1. `split_ids_op` splits the gradients, which are then sent to the parameter servers by `send_op`. +1. The parameter servers update the table with the received gradients (a schematic sketch of this loop is given below).
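As a rough illustration of the training loop above, here is a self-contained Python sketch. The `ToyPServer` class, its `prefetch`/`apply_grad` methods, and the simple modulo sharding are hypothetical stand-ins for the real `split_ids_op`, `prefetch_op`, and `send_op`, and the gradients are dummies; it only shows how ID splitting, prefetching, and ID auto growth fit together.

```Python
import random

EMB_SIZE = 8  # embedding width, matching the size estimate above

class ToyPServer:
    """Toy parameter server shard; rows are created lazily (ID auto growth)."""
    def __init__(self):
        self.table = {}

    def prefetch(self, ids):
        # A row is initialized the first time its ID is seen.
        return {i: self.table.setdefault(i, [random.random()] * EMB_SIZE) for i in ids}

    def apply_grad(self, grads, lr=0.01):
        # Simple SGD update with the received gradients.
        for i, g in grads.items():
            self.table[i] = [w - lr * gw for w, gw in zip(self.table[i], g)]

def train_one_batch(feature_ids, pservers):
    nshards = len(pservers)
    # Steps 1-2: split the input IDs with the same sharding rule as the table.
    shards = {s: [i for i in feature_ids if i % nshards == s] for s in range(nshards)}
    # Step 3: prefetch only the rows this batch needs.
    rows = {}
    for s, ids in shards.items():
        rows.update(pservers[s].prefetch(ids))
    # Step 4: forward-backward would produce one gradient per fetched row (dummy here).
    grads = {i: [0.1] * EMB_SIZE for i in rows}
    # Steps 5-6: split the gradients and send each part back to its parameter server.
    for s, ids in shards.items():
        pservers[s].apply_grad({i: grads[i] for i in ids})

train_one_batch([3, 7, 8, 15], [ToyPServer(), ToyPServer()])
```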
+ +![distribute lookup table](./src/distributed_lookup_table.jpeg) diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle new file mode 100644 index 0000000000000000000000000000000000000000..65dfdbbacd219739db6ddfdf243cc16c3c4e8d1e Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.graffle differ diff --git a/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..5353a16fd329f62ff893d32706b9c3c0bcc46a07 Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_lookup_table.jpeg differ diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle new file mode 100644 index 0000000000000000000000000000000000000000..96ca6d48f43bd9f49c6861dab006e2037873db87 Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.graffle differ diff --git a/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png new file mode 100644 index 0000000000000000000000000000000000000000..afa25ab3b4e427bc595a855b12ab966478e01ed0 Binary files /dev/null and b/doc/fluid/design/dist_train/src/fluid_lookup_remote_table.png differ diff --git a/doc/fluid/design/dist_train/src/ncc2_design.graffle b/doc/fluid/design/dist_train/src/ncc2_design.graffle new file mode 100644 index 0000000000000000000000000000000000000000..7d2753bbb03bc28c7a0054bb0aa424deb072ffbf Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.graffle differ diff --git a/doc/fluid/design/dist_train/src/ncc2_design.png b/doc/fluid/design/dist_train/src/ncc2_design.png new file mode 100644 index 0000000000000000000000000000000000000000..da0d5ee81f5dfeb4ca1356601b0bb5870456e3d6 Binary files /dev/null and b/doc/fluid/design/dist_train/src/ncc2_design.png differ diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md index 967317d5d2eeb818ab14faabca342cc8c4ed717e..4d2aab87b8cf30d03075e96cc4c67070efaf963a 100644 --- a/doc/fluid/design/multi_devices/kernel_selection.md +++ b/doc/fluid/design/multi_devices/kernel_selection.md @@ -74,10 +74,10 @@ void OperatorWithKernel::Run( auto kernel_type_for_var = this->GetKernelTypeForVar(...); if (kernel_type_for_var.place_ != expected_kernel_key.place_) { auto* trans_var = new_scope.Var(var_name); - auto* out = DataTransform(expected_kernel_key, + auto* out = TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in); - CopyVariableWithTensor(...); + SetTensorToVariable(...); } } diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md index b99b90056b0a2e51f2668a6d27d94857bdc09c37..55326940ce7c7dbaa5bf19f1950f470527ddf4f0 100644 --- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md +++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md @@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book 第二步,启动Parameter Server: ```bash -PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py +PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 
PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py ``` 执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 第三步,启动Trainer: ```bash -PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py +PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py ``` 由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md index 55ce63ec193948424cd0b87f13d56b9cf6154dfc..92859e8f622d0c155128821c54252113c5016989 100644 --- a/doc/fluid/howto/cluster/fluid_recordio.md +++ b/doc/fluid/howto/cluster/fluid_recordio.md @@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id): ret_list.append(f) return ret_list -trainers = int(os.getenv("TRAINERS")) -trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) +trainers = int(os.getenv("PADDLE_TRAINERS")) +trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) data_file = fluid.layers.io.open_files( filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), thread_num=1, diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst index c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f..84005b54e07cf810649370d2c1f6b6c522434bf6 100644 --- a/doc/fluid/howto/inference/build_and_install_lib_cn.rst +++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst @@ -13,6 +13,7 @@ cpu_noavx_openblas `fluid.tgz `_ cuda8.0_cudnn5_avx_mkl `fluid.tgz `_ cuda8.0_cudnn7_avx_mkl `fluid.tgz `_ +cuda9.0_cudnn7_avx_mkl `fluid.tgz `_ ====================== ======================================== 从源码编译 diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md index 9b55a66ded8b48f7105c05f1462839a72ab5f904..7fb0883dd937465d15479b29df95078edb50e069 100644 --- a/doc/fluid/howto/optimization/host_memory_profiling_cn.md +++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md @@ -1,4 +1,4 @@ -## 堆内存分析和优化 +# 堆内存分析和优化 计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放,随着程序的运行占用的内存越来越大,一方面会影响程序的稳定性,可能让运行速度越来越慢,或者造成oom,甚至会影响运行程序的机器的稳定性,造成宕机。 @@ -20,11 +20,11 @@ Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/P 对于堆内存的分析,主要用到thread-caching malloc和heap-profiling using tcmalloc。 -## 使用流程 -#### 环境 +## 环境 + 本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev,基于Ubuntu 16.04.4 LTS环境。 -#### 使用流程 +## 使用流程 - 安装google-perftools diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..5d061e1c00d2ca0194153730a39486b8357fa5b0 --- /dev/null +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -0,0 +1,26 @@ +# 如何使用timeline工具做性能分析 + +1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 + + **提示:** + 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。 + + ```python + with profiler.profiler('All', 'total', '/tmp/profile') as prof: + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) + ... + ``` + +1. 
+1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。 + +1. 打开chrome浏览器,访问 `chrome://tracing/`,用`load`按钮来加载生成的`timeline`文件。 + + ![chrome tracing](./tracing.jpeg) + +1. 结果如下图所示,可以放大来查看timeline的细节信息。 + + ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline_en.md similarity index 100% rename from doc/fluid/howto/optimization/timeline.md rename to doc/fluid/howto/optimization/timeline_en.md diff --git a/doc/v2/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md index 0c4b5bc24c854b7062d509249bea9c50d42bd5f1..edb0245ea083e791b7f32ac57a330698299fceda 100644 --- a/doc/v2/design/cluster_train/large_model_dist_train.md +++ b/doc/v2/design/cluster_train/large_model_dist_train.md @@ -52,7 +52,7 @@ In `trainer_internal.cpp:L93 trainOneBatch`: When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver. -In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: +In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: ```c++ if (fullSize) { diff --git a/doc/v2/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md index a1443093342c5a3ed698fb6b52a751dfc7cb5319..826ff3141bc2512b525cb44ac0f18b376ce57e92 100644 --- a/doc/v2/design/interface/00.why_plain_c.md +++ b/doc/v2/design/interface/00.why_plain_c.md @@ -65,7 +65,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix matrix, 而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` ```cpp -#include "paddle/math/matrix.h" +#include "paddle/legacy/math/matrix.h" extern "C" paddle_error paddle_matrix_shape(paddle_matrix matrix, uint64_t *width, diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md index bd5bcf6f67168c21cebb046a629b948d1661e75c..4876de0045979be20fa45bdc84d2594516f71c03 100644 --- a/doc/v2/design/mkl/mkldnn.md +++ b/doc/v2/design/mkl/mkldnn.md @@ -18,20 +18,20 @@ Figure 1. PaddlePaddle on IA 具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。 ## Contents - -- [Overview](#overview) -- [Actions](#actions) - - [CMake](#cmake) - - [Matrix](#matrix) - - [Layers](#layers) - - [Activations](#activations) - - [Parameters](#parameters) - - [Gradients](#gradients) - - [Unit Tests](#unit-tests) - - [Python API](#python-api) - - [Benchmarking](#benchmarking) - - [Others](#others) -- [Design Concerns](#design-concerns) + +- [Overview](#overview) +- [Actions](#actions) + - [CMake](#cmake) + - [Matrix](#matrix) + - [Layers](#layers) + - [Activations](#activations) + - [Parameters](#parameters) + - [Gradients](#gradients) + - [Unit Tests](#unit-tests) + - [Python API](#python-api) + - [Benchmarking](#benchmarking) + - [Others](#others) +- [Design Concerns](#design-concerns) ## Overview @@ -218,20 +218,20 @@ if use_mkldnn 我们总结出一些特别需要注意的点: 1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, -我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 -2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 -包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 4. 
如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, 同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 ## References 1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 -主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 +主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 -但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 -所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 +所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst index 3115654b2bd87995fa63bb7828fd1b3039aea8cc..e5a14346123d342de0b67757cbbce654bd4180dc 100644 --- a/doc/v2/dev/new_layer_cn.rst +++ b/doc/v2/dev/new_layer_cn.rst @@ -58,7 +58,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 实现C++类 =================== -一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 +一个网络层的C++类需要实现初始化,前向和后向。全连接层的实现位于:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。 这个类需要继承 :code:`paddle::Layer` 这个基类,并且需要重写基类中的以下几个虚函数: @@ -153,7 +153,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 - 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。 - 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小,所以这一步是必要的。 :code:`reserveOutput` 会相应地改变输出的尺寸。为了保证效率,如果需要扩大矩阵,我们会重新分配内存;如果需要缩减矩阵,我们会继续使用现有的内存块。 -- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/math/Matrix.h`和:code:`paddle/math/BaseMatrix.h` 。 +- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵,每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作,请参考 :code:`paddle/legacy/math/Matrix.h`和:code:`paddle/legacy/math/BaseMatrix.h` 。 - 最终,使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。 @@ -262,7 +262,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 REGISTER_LAYER(fc, FullyConnectedLayer); } -若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下,其会自动被加入编译列表。 +若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下,其会自动被加入编译列表。 写梯度检查单元测试 @@ -270,7 +270,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ,然后观察到输出的变化为 :math:`\Delta y` ,那么,梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后,再用这个梯度去和 :code:`backward` 函数得到的梯度去对比,以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算,并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。 -所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: +所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步: + 生成网络层配置。网络层配置包含以下几项: - 偏置参数的大小。(例子中是4096) @@ -322,7 +322,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。 } } -如果你要为了测试而增加新的文件,例如 
:code:`paddle/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 +如果你要为了测试而增加新的文件,例如 :code:`paddle/legacy/gserver/tests/testFCGrad.cpp` ,你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时,所有的单测都会被执行一次。注意,有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。 .. code-block:: bash diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst index b05bb45f11eb253dfb87d6283c29ec6689394d22..ad723738801908a5f48343574c204bdbfc97ee08 100644 --- a/doc/v2/dev/new_layer_en.rst +++ b/doc/v2/dev/new_layer_en.rst @@ -58,7 +58,7 @@ Finally we can use chain rule to calculate :math:`\frac{\partial z}{\partial x}` Implement C++ Class =================== -The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below. +The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below. It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions: @@ -154,7 +154,7 @@ The implementation of the forward part has the following steps. - Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function. - Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix. -- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/math/Matrix.h` and :code:`paddle/math/BaseMatrix.h`. +- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`. - Finally it applies the activation function using :code:`forwardActivation();`. It will automatically applies the corresponding activation function specifies in the network configuration. @@ -263,7 +263,7 @@ Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to registe REGISTER_LAYER(fc, FullyConnectedLayer); } -If the :code:`cpp` file is put into :code:`paddle/gserver/layers`, it will be automatically added to the compilation list. +If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list. 
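For readers who want to see the arithmetic behind the forward steps above, the following NumPy sketch mirrors what the fully connected layer computes: :math:`\sum_i W_i x + b` over a batch, followed by the activation applied by :code:`forwardActivation();`. It is purely illustrative and not PaddlePaddle code; the function name, the toy shapes, and the choice of :code:`tanh` as the activation are assumptions made for the example.

.. code-block:: python

    import numpy as np

    def fc_forward(inputs, weights, bias, activation=np.tanh):
        # z = sum_i x_i . W_i + b, computed for the whole batch at once
        z = sum(x.dot(w) for x, w in zip(inputs, weights)) + bias
        # corresponds to forwardActivation() in the C++ code
        return activation(z)

    # toy example: two input layers, batch size 3, output size 4
    x1, x2 = np.random.randn(3, 5), np.random.randn(3, 7)
    w1, w2 = np.random.randn(5, 4), np.random.randn(7, 4)
    b = np.zeros(4)
    print(fc_forward([x1, x2], [w1, w2], b).shape)  # (3, 4)

The gradient check described in the next section perturbs the inputs of exactly this computation and compares the finite-difference estimate :math:`\frac{\Delta y}{\Delta x}` against the gradients produced by :code:`backward`.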
Write Gradient Check Unit Test @@ -271,7 +271,7 @@ Write Gradient Check Unit Test An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation, it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` function. You need to write more sophisticated unit tests to make sure your layer is implemented correctly. -All the gradient check unit tests are located in :code:`paddle/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps. +All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps. + Create layer configuration. A layer configuration can include the following attributes: - size of the bias parameter. (4096 in our example) @@ -323,7 +323,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes } } -If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. +If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. .. code-block:: bash @@ -339,7 +339,7 @@ If you are creating a new file for the test, such as :code:`paddle/gserver/tests Implement Python Wrapper ======================== -Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps: +Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/legacy/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. 
It has the following steps: - Use :code:`@config_layer('fc')` at the decorator for all the Python wrapper class. :code:`fc` is the identifier of the layer. - Implements :code:`__init__` constructor function. diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst index f292684fb5fe2df06db5239e7f43fdfa1dd2f2bd..0d644777287aea0a572adb6fa40f498f9c147af7 100644 --- a/doc/v2/faq/build_and_install/index_cn.rst +++ b/doc/v2/faq/build_and_install/index_cn.rst @@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包,可以用pip进行安装: 保存并关闭文件。 这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。 + +10. 通过pip安装的PaddlePaddle在 :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so` +------------------------------------------------------------------------------------------ +出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`, +但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` +拷贝到 :code:`/usr/local/lib` 路径下,所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下, +即: :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。 + +**注意**:如果是在虚拟环境中安装PaddlePaddle, :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。 \ No newline at end of file diff --git a/doc/v2/faq/parameter/index_cn.rst b/doc/v2/faq/parameter/index_cn.rst index 1fa4b3e1311d2007ccba98fde9ff94300ea42c16..987e8cf088be4ee8daa7c28fdc855506cbfd31c7 100644 --- a/doc/v2/faq/parameter/index_cn.rst +++ b/doc/v2/faq/parameter/index_cn.rst @@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数 obj="process", args={"src_dict_path": src_dict_path}) -完整源码可参考 `sequence_recurrent `_ 示例。 +完整源码可参考 `sequence_recurrent `_ 示例。 diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md index e223fd33a8420abcdfdad53d1cfc5ed160a1b37e..2c87e9afc6911526cd51d6c691f262960accc9e8 100644 --- a/doc/v2/howto/capi/compile_paddle_lib_cn.md +++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md @@ -18,7 +18,7 @@ cpu_avx_openblas -暂无 +paddle.tgz cpu_noavx_openblas @@ -35,7 +35,12 @@ cuda8.0_cudnn7_avx_mkl paddle.tgz - + + +cuda9.0_cudnn7_avx_mkl +paddle.tgz + + ### 从源码编译 diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md index 6212a3081116d988630706e83d2349dd200b73ab..3fa8a18a9fbea21b494c416e6b938990fbb68337 100644 --- a/doc/v2/howto/capi/compile_paddle_lib_en.md +++ b/doc/v2/howto/capi/compile_paddle_lib_en.md @@ -17,7 +17,7 @@ cpu_avx_openblas -- +paddle.tgz cpu_noavx_openblas @@ -34,7 +34,12 @@ cuda8.0_cudnn7_avx_mkl paddle.tgz - + + +cuda9.0_cudnn7_avx_mkl +paddle.tgz + + ### From source diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md index 3acdbae28e9b35f8a9104a89c9a5799f8c892334..db1568a2afbea3cca0d4e1fe053ba9536a60ab3d 100644 --- a/doc/v2/howto/capi/workflow_of_capi_cn.md +++ b/doc/v2/howto/capi/workflow_of_capi_cn.md @@ -28,9 +28,9 @@ ### 准备预测模型 -准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) 中的相关脚本。 +准备预测模型部分,我们以手写数字识别任务为例进行介绍。手写数字识别任务定义了一个含有[两个隐层的简单全连接网络](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression),网络接受一幅图片作为输入,将图片分类到 0 ~ 9 
类别标签之一。完整代码可以查看[此目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense) 中的相关脚本。 -调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 +调用C-API开发预测程序需要一个训练好的模型,运行[MNIST手写数字识别目录](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)下的[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本,在终端执行`python mnist_v2.py`,会使用 PaddlePaddle 内置的 [MNIST 数据集](http://yann.lecun.com/exdb/mnist/)进行训练。训练好的模型默认保存在当前运行目录下的`models`目录中。 下面,我们将训练结束后存储下来的模型转换成预测模型。 @@ -48,7 +48,7 @@ dump_v2_config(predict, "trainer_config.bin", True) ``` - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py)脚本集成了序列化神经网络结构的过程,可以直接运行 `python mnist_v2.py --task dump_config` 对神经网络结构进行序列化,结果会写入当前运行目录下的`trainer_config.bin`文件中。 使用这种方式,需要**在运行时将神经网络的多个可学习参数放在同一个目录中**,C-API可以通过分别指定序列化后的网络结构文件和参数目录来加载训练好的模型。 @@ -68,7 +68,7 @@ merge_v2_model(net, param_file, output_file) ``` - 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 + 对[手写数字识别](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense)这个示例,可直接运行 `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py)。序列化结果会写入当前运行目录下的`output.paddle.model`文件中。使用这种方式,运行时C-API可以通过指定`output.paddle.model`文件的路径来加载预测模型。 #### 注意事项 1. 为使用C-API,在调用`dump_v2_config`序列化神经网络结构时,参数`binary`必须指定为`True`。 @@ -77,10 +77,10 @@ ### 编写预测代码 -预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 +预测代码更多详细示例代码请参考[C-API使用示例](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/capi/examples/model_inference) 目录下的代码示例。这一节对图1中预测代码编写的5个步骤进行介绍和说明。 #### step 1. 初始化PaddlePaddle运行环境 -第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 +第一步需调用[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/main.h#L27) 初始化PaddlePaddle运行环境,该接口接受两个参数:参数的个数和参数列表。 #### step2. 
加载模型 @@ -88,8 +88,8 @@ 概念上,在 PaddlePaddle 内部,一个GradientMachine类的对象管理着一组计算层(PaddlePaddle Layers)来完成前向和反向计算,并处理与之相关的所有细节。在调用C-API预测时,只需进行前向计算而无需调用反向计算。这篇文档之后部分会使用`gradient machine`来特指调用PaddlePaddle C-API创建的GradientMachine类的对象。每一个 `gradient machine` 都会管理维护一份训练好的模型,下面是C-API提供的,两种常用的模型加载方式: -1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; -1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。 +1. 调用[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L61)接口,从磁盘加载预测模型。这时`gradient machine`会独立拥有一份训练好的模型; +1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/examples/model_inference/multi_thread/main.c)。 - 注意事项 @@ -117,7 +117,7 @@ C-API支持的所有输入数据类型和他们的组织方式,请参考“输 #### step 4. 前向计算 -完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 +完成上述准备之后,通过调用 [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/capi/gradient_machine.h#L73) 接口完成神经网络的前向计算。 #### step 5. 清理 diff --git a/doc/v2/howto/optimization/gpu_profiling_cn.rst b/doc/v2/howto/optimization/gpu_profiling_cn.rst index 25bcaccb6975bc21fba2e8c5843da15c69948d72..f2396716bddd4810fa77c738d41f5482aa6d6055 100644 --- a/doc/v2/howto/optimization/gpu_profiling_cn.rst +++ b/doc/v2/howto/optimization/gpu_profiling_cn.rst @@ -50,12 +50,12 @@ GPU则还需要高并行性,才能发挥其全部能力。这正是它们速 **nvprof** 是Nvidia性能分析工具, **nvvp** 则是带GUI的Nvidia可视化性能分析工具。 在这个教程中,我们主要会介绍nvprof和nvvp。 -:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate above profilers. -:code:`paddle/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 +:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。 -.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :linenos: @@ -83,7 +83,7 @@ program crashes when CPU version of PaddlePaddle invokes them. 1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数(如高亮部分)。 - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 8-12,14 @@ -101,8 +101,8 @@ program crashes when CPU version of PaddlePaddle invokes them. .. 
code-block:: bash :emphasize-lines: 1,12-15 - > ./paddle/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. [==========] Running 1 test from 1 test case. @@ -130,7 +130,7 @@ nvprof 工具 1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中(参考强调部分)。 - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 6-7 @@ -147,13 +147,13 @@ nvprof 工具 .. code-block:: bash - nvprof ./paddle/math/tests/test_GpuProfiler + nvprof ./paddle/legacy/math/tests/test_GpuProfiler 然后,您就能获得如下的分析结果: .. code-block:: bash - ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler ==78544== Profiling result: Time(%) Time Calls Avg Min Max Name 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] diff --git a/doc/v2/howto/optimization/gpu_profiling_en.rst b/doc/v2/howto/optimization/gpu_profiling_en.rst index 50adb7da24906515cb5977db565e9f8a76599fef..6e439be9bba8935cdd65f1c131cfd3725530ec0e 100644 --- a/doc/v2/howto/optimization/gpu_profiling_en.rst +++ b/doc/v2/howto/optimization/gpu_profiling_en.rst @@ -51,10 +51,10 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th **nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler. In this tutorial, we will focus on nvprof and nvvp. -:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate +:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate above profilers. -.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp +.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :linenos: @@ -80,7 +80,7 @@ As a simple example, consider the following: 1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines). - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 8-12,14 @@ -98,8 +98,8 @@ As a simple example, consider the following: .. code-block:: bash :emphasize-lines: 1,12-15 - > ./paddle/math/tests/test_GpuProfiler - I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + > ./paddle/legacy/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. [==========] Running 1 test from 1 test case. @@ -127,7 +127,7 @@ To use this command line profiler **nvprof**, you can simply issue the following 1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines). - .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp + .. 
literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp :language: c++ :lines: 137-151 :emphasize-lines: 6-7 @@ -144,13 +144,13 @@ To use this command line profiler **nvprof**, you can simply issue the following .. code-block:: bash - nvprof ./paddle/math/tests/test_GpuProfiler + nvprof ./paddle/legacy/math/tests/test_GpuProfiler Then, you can get the following profiling result: .. code-block:: bash - ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler ==78544== Profiling result: Time(%) Time Calls Avg Min Max Name 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst index 67c7b774e9c476a3035037a421c84ebf17a31b09..9d6d417075485dceb1ee71f527b408aa6a6638ea 100644 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst @@ -4,7 +4,7 @@ 单双层RNN API对比介绍 ##################### -本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 +本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 示例1:双层RNN,子序列间无Memory ================================ @@ -13,8 +13,8 @@ 在本示例中,单层RNN和双层RNN的网络配置,都是将每一句分好词后的句子,使用LSTM作为encoder,压缩成一个向量。区别是RNN使用两层序列模型,将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下: -* 单层RNN\: `sequence_layer_group.conf `_ -* 双层RNN\: `sequence_nest_layer_group.conf `_ +* 单层RNN\: `sequence_layer_group.conf `_ +* 双层RNN\: `sequence_nest_layer_group.conf `_ 读取双层序列数据 @@ -24,18 +24,18 @@ - 本例中的原始数据一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。 -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg :language: text - 双层序列数据一共有4个样本。 每个样本间用空行分开,整体数据和原始数据完全一样。但于双层序列的LSTM来说,第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。 -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest :language: text -其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py `_)\: +其次,对于两种不同的输入数据类型,不同DataProvider对比如下(`sequenceGen.py `_)\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 21-39 :linenos: @@ -47,7 +47,7 @@ - words是原始数据中的每一句话,所对应的词表index数组。它是integer_value_sequence类型的,即整数数组。words即为这个数据中的单层时间序列。 - label是原始数据中对于每一句话的分类标签,它是integer_value类型的。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 42-71 :linenos: @@ -64,7 +64,7 @@ 首先,我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中,RNN对于每一个时间步通过了一个LSTM网络。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf :language: python :lines: 38-63 :linenos: @@ -85,7 +85,7 @@ * 至此,\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。 -.. 
literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf :language: python :lines: 38-64 :linenos: @@ -107,7 +107,7 @@ - 单层RNN:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf :language: python :lines: 36-48 @@ -116,7 +116,7 @@ - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。 -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf :language: python :lines: 39-66 @@ -134,7 +134,7 @@ **输入不等长** 是指recurrent_group的多个输入序列,在每个时间步的子序列长度可以不相等。但序列输出时,需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致,默认指定第一个输入。 -示例3的配置分别为\ `单层不等长RNN `_\ 和\ `双层不等长RNN `_\ 。 +示例3的配置分别为\ `单层不等长RNN `_\ 和\ `双层不等长RNN `_\ 。 示例3对于单层RNN和双层RNN数据完全相同。 @@ -152,14 +152,14 @@ * 单层RNN\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py :language: python :lines: 42-59 :linenos: * 双层RNN\ \: -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py :language: python :lines: 41-80 :linenos: diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst index ae997f0805db5b01a34867c9e8b188c931721920..a4485f7b5edf21871444801230ab1ee191b1137b 100644 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst @@ -4,7 +4,7 @@ API comparision between RNN and hierarchical RNN ##################### -This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp `_\ 。 +This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. 
If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp `_\ 。 Example 1:Hierarchical RNN without Memory between subsequences ================================ @@ -13,8 +13,8 @@ The classical case in the hierarchical RNN is to perform sequence operations on In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows: -* RNN\: `sequence_layer_group.conf `_ -* Hierarchical RNN\: `sequence_nest_layer_group.conf `_ +* RNN\: `sequence_layer_group.conf `_ +* Hierarchical RNN\: `sequence_nest_layer_group.conf `_ Reading hierarchical sequence data @@ -24,18 +24,18 @@ Firstly, the original data in this example is as follows \: - The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg :language: text - The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ . -.. literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest :language: text -Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py `_)\: +Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py `_)\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 21-39 :linenos: @@ -47,7 +47,7 @@ Secondly, as for these two types of different input data formats, the contrast o - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. - "label" is the categorical label of each sentence, whose data type is integer_value. -.. literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py :language: python :lines: 42-71 :linenos: @@ -64,7 +64,7 @@ Model configuration Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf +.. 
literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf :language: python :lines: 38-63 :linenos: @@ -85,7 +85,7 @@ Secondly, let's look at the model configuration of hierarchical RNN which has th * Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf :language: python :lines: 38-64 :linenos: @@ -107,7 +107,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf - single-layer RNN:passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf :language: python :lines: 36-48 @@ -116,7 +116,7 @@ We select the different parts between single-layer RNN and hierarchical RNN conf - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf :language: python :lines: 39-66 @@ -134,7 +134,7 @@ Example 3:hierarchical RNN with unequal length inputs **unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. -The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ . +The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs `_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs `_\ . The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. @@ -152,14 +152,14 @@ Similar to Example 2's configuration, Example 3's configuration uses single-laye * single-layer RNN\: -.. literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py :language: python :lines: 42-59 :linenos: * hierarchical RNN\ \: -.. 
literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py +.. literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py :language: python :lines: 41-80 :linenos: diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index f17577997bc94b08f3e296c4d6e35682ca3c0e57..eba0c47e195a80fc298f0fdd78c8d6345e963be8 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -16,7 +16,7 @@ package pserver // #cgo CFLAGS: -I ../../ // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm -// #include "paddle/optimizer/optimizer.h" +// #include "paddle/legacy/optimizer/optimizer.h" // #include // #include import "C" diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index d722eec1892206ac44c49e7a12d92be0c54df8c0..6653244507742b33d9524a7a0e4a5b2b575d358a 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,24 +1,24 @@ if(NOT WITH_FLUID_ONLY) - add_subdirectory(cuda) - add_subdirectory(function) - add_subdirectory(utils) - add_subdirectory(math) - add_subdirectory(gserver) - add_subdirectory(parameter) + add_subdirectory(legacy/cuda) + add_subdirectory(legacy/function) + add_subdirectory(legacy/utils) + add_subdirectory(legacy/math) + add_subdirectory(legacy/gserver) + add_subdirectory(legacy/parameter) if(MOBILE_INFERENCE) - add_subdirectory(capi) + add_subdirectory(legacy/capi) else() - add_subdirectory(pserver) - add_subdirectory(trainer) + add_subdirectory(legacy/pserver) + add_subdirectory(legacy/trainer) add_subdirectory(scripts) if(WITH_C_API) - add_subdirectory(capi) + add_subdirectory(legacy/capi) endif() if(WITH_SWIG_PY) - add_subdirectory(api) + add_subdirectory(legacy/api) endif() endif() endif() diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp deleted file mode 100644 index 62d6a574d55d2748635879a21cbbaa474f070cff..0000000000000000000000000000000000000000 --- a/paddle/api/Arguments.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -#include "paddle/parameter/Argument.h" - -size_t Arguments::getSlotNum() const { return m->outputs.size(); } - -Arguments* Arguments::createArguments(size_t slotNum) { - auto args = new Arguments(); - args->m->outputs.resize(slotNum); - return args; -} - -void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); } - -Arguments::Arguments() : m(new ArgumentsPrivate()) {} - -Arguments::~Arguments() { delete m; } - -Arguments* Arguments::createByPaddleArgumentVector(void* ptr) { - auto p = (std::vector*)(ptr); - auto args = new Arguments(); - args->m->outputs = *p; - return args; -} - -Arguments* Arguments::createByPaddleArgument(const void* ptr) { - auto p = (paddle::Argument*)(ptr); - auto args = new Arguments(); - args->m->outputs.push_back(*p); - return args; -} - -Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return Matrix::createByPaddleMatrixPtr(&a.value); -} - -Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return Matrix::createByPaddleMatrixPtr(&a.grad); -} - -IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return IVector::createByPaddleVectorPtr(&a.ids); -} - -Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return Matrix::createByPaddleMatrixPtr(&a.in); -} - -void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) { - auto& a = m->getArg(idx); - a.value = m->cast(mat->getSharedPtr()); -} - -void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) { - auto& a = m->getArg(idx); - a.grad = m->cast(mat->getSharedPtr()); -} - -void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) { - auto& a = m->getArg(idx); - a.in = m->cast(mat->getSharedPtr()); -} - -void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) { - auto& a = m->getArg(idx); - auto& v = m->cast(vec->getSharedPtr()); - a.ids = v; -} - -template -static inline void doCopyFromSafely(std::shared_ptr& dest, - std::shared_ptr& src) { - if (src) { - if (dest) { - dest->copyFrom(*src); - } else { - dest = src; - } - } -} - -IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const - throw(RangeError) { - auto& a = m->getArg(idx); - if (a.sequenceStartPositions) { - return IVector::createByPaddleVectorPtr( - &a.sequenceStartPositions->getMutableVector(false)); - } else { - return nullptr; - } -} - -IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const - throw(RangeError) { - auto& a = m->getArg(idx); - if (a.subSequenceStartPositions) { - return IVector::createByPaddleVectorPtr( - &a.subSequenceStartPositions->getMutableVector(false)); - } else { - return nullptr; - } -} - -void Arguments::setSlotSequenceStartPositions(size_t idx, - IVector* vec) throw(RangeError) { - auto& a = m->getArg(idx); - auto& v = m->cast(vec->getSharedPtr()); - a.sequenceStartPositions = std::make_shared(v); -} - -void Arguments::setSlotSubSequenceStartPositions( - size_t idx, IVector* vec) throw(RangeError) { - auto& a = m->getArg(idx); - auto& v = m->cast(vec->getSharedPtr()); - a.subSequenceStartPositions = std::make_shared(v); -} - -IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims); -} - -void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { - auto& a = 
m->getArg(idx); - a.cpuSequenceDims = m->cast(vec->getSharedPtr()); -} - -float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } - -int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return a.getBatchSize(); -} - -void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) { - auto& a = m->getArg(idx); - a.setFrameHeight(h); -} - -void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) { - auto& a = m->getArg(idx); - a.setFrameWidth(w); -} - -size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return a.getFrameHeight(); -} - -size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) { - auto& a = m->getArg(idx); - return a.getFrameWidth(); -} - -void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; } diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp deleted file mode 100644 index d362a1e7cf3c8cd05b8c85cfaf8dbbee8b827d4b..0000000000000000000000000000000000000000 --- a/paddle/api/ConfigParser.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" -#include "paddle/trainer/Trainer.h" - -struct ParameterConfigPrivate { - paddle::ParameterPtr parameter; - paddle::ParameterConfig config; - - inline paddle::ParameterConfig* getConfigPtr() { - if (parameter != nullptr) { - auto& conf = parameter->getConfig(); - return const_cast(&conf); - } else { - return &config; - } - } -}; - -TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {} - -TrainerConfig::~TrainerConfig() { delete m; } - -TrainerConfig* TrainerConfig::createFromTrainerConfigFile( - const std::string& confPath) { - LOG(INFO) << "load trainer config from " << confPath; - auto conf = std::make_shared(confPath); - auto retv = new TrainerConfig(); - retv->m->conf = conf; - return retv; -} - -TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) { - auto retv = new TrainerConfig(); - paddle::TrainerConfig trainerConfigProto; - auto conf = std::make_shared(trainerConfigProto); - CHECK(conf->getMutableConfig().ParseFromString(str)); - retv->m->conf = conf; - return retv; -} - -ModelConfig::ModelConfig() : m(new ModelConfigPrivate()) {} - -ModelConfig::~ModelConfig() { delete m; } - -ModelConfig* TrainerConfig::getModelConfig() const { - auto retv = new ModelConfig(); - retv->m->conf = m->conf; - return retv; -} - -ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} - -ParameterConfig::~ParameterConfig() { delete m; } - -ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( - void* ptr) { - auto& p = *(paddle::ParameterPtr*)(ptr); - if (p != nullptr) { - auto conf = new ParameterConfig(); - conf->m->parameter = p; - return conf; - } else { - return nullptr; - } -} - -ParameterConfig* 
ParameterConfig::createParameterConfigFromParameterPtr( - void* ptr) { - auto& p = *(paddle::Parameter*)(ptr); - auto conf = new ParameterConfig(); - conf->m->config = p.getConfig(); - return conf; -} - -std::string ParameterConfig::toProtoString() const { - return m->getConfigPtr()->SerializeAsString(); -} - -void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } - -OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} - -OptimizationConfig::~OptimizationConfig() { delete m; } - -std::string OptimizationConfig::toProtoString() { - return m->getConfig().SerializeAsString(); -} - -OptimizationConfig* TrainerConfig::getOptimizationConfig() const { - auto opt_config = new OptimizationConfig(); - opt_config->m->trainer_config = m->conf; - return opt_config; -} - -OptimizationConfig* OptimizationConfig::createFromProtoString( - const std::string& str) { - auto conf = new OptimizationConfig(); - conf->m->config.ParseFromString(str); - return conf; -} diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp deleted file mode 100644 index 0d9ad30de9c1f3f8f58c856a748abdc050ff8740..0000000000000000000000000000000000000000 --- a/paddle/api/GradientMachine.cpp +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -#include "Internal.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" - -std::vector GradientMachine::defaultParamTypes = { - PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}; - -GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {} - -GradientMachine::~GradientMachine() { delete m; } - -GradientMachine* GradientMachine::createFromPaddleModelPtr( - const void* confPtr, - GradientMatchineCreateMode mode, - const std::vector& types) { - auto& conf = *(const paddle::ModelConfig*)(confPtr); - std::vector realTypes; - staticCastVector(&realTypes, types); - auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes); - auto machinePtr = std::shared_ptr(machineRawPtr); - if (machinePtr != nullptr) { - auto machine = new GradientMachine(); - machine->m->machine = machinePtr; - return machine; - } else { - return nullptr; - } -} - -GradientMachine* GradientMachine::createByConfigProtoStr( - const std::string& protoStr, - GradientMatchineCreateMode mode, - const std::vector& types) { - paddle::ModelConfig conf; - conf.ParseFromString(protoStr); - if (conf.IsInitialized()) { - return GradientMachine::createFromPaddleModelPtr(&conf, mode, types); - } else { - return nullptr; - } -} - -GradientMachine* GradientMachine::createByModelConfig( - ModelConfig* conf, - GradientMatchineCreateMode mode, - const std::vector& types) { - auto confPtr = &conf->m->conf->getModelConfig(); - return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); -} - -void GradientMachine::start() { m->machine->start(); } - -void GradientMachine::finish() { m->machine->finish(); } - -void GradientMachine::onPassEnd() { m->machine->onPassEnd(); } - -void GradientMachine::prefetch(const Arguments& inArgs) { - auto& in = - m->cast>(inArgs.getInternalArgumentsPtr()); - m->machine->prefetch(in); -} - -void GradientMachine::forward(const Arguments& inArgs, - Arguments* outArgs, - PassType passType) { - auto& in = - m->cast>(inArgs.getInternalArgumentsPtr()); - auto& out = m->cast>( - outArgs->getInternalArgumentsPtr()); - paddle::PassType pt = (paddle::PassType)(passType); - m->machine->forward(in, &out, pt); -} - -UpdateCallback::~UpdateCallback() {} - -void UpdateCallback::apply(Parameter* p) { - // UNUSED(p); -} - -class UpdateCallbackWrapper { - public: - explicit UpdateCallbackWrapper(const UpdateCallback& callback) - : callback(const_cast(callback)) {} - - void operator()(paddle::Parameter* param) { - auto p = Parameter::createFromRawPtr(¶m); - // @TODO Use Stack variable instead. 
- callback.apply(p); - delete p; - } - - private: - UpdateCallback& callback; -}; - -void GradientMachine::backward(const UpdateCallback& callback) { - m->machine->backward(UpdateCallbackWrapper(callback)); -} - -void GradientMachine::forwardBackward(const Arguments& inArgs, - Arguments* outArgs, - PassType passType, - const UpdateCallback& callback) { - auto& in = - m->cast>(inArgs.getInternalArgumentsPtr()); - auto& out = m->cast>( - outArgs->getInternalArgumentsPtr()); - paddle::PassType pt = (paddle::PassType)(passType); - m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback)); -} - -void GradientMachine::loadParameters(const std::string& path) { - m->machine->loadParameters(path); -} - -size_t GradientMachine::getParameterSize() const { - return m->machine->getParameters().size(); -} - -Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { - auto params = m->machine->getParameters(); - if (i < params.size()) { - return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]); - } else { - throw RangeError(); - } -} - -size_t GradientMachine::getNonStaticParameterSize() const { - return m->machine->getNonStaticParameters().size(); -} - -Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) { - auto params = m->machine->getNonStaticParameters(); - if (i < params.size()) { - return Parameter::createFromSharedPtr( - &m->machine->getNonStaticParameters()[i]); - } else { - throw RangeError(); - } -} - -void GradientMachine::randParameters() { m->machine->randParameters(); } - -Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const - throw(UnsupportError) { - auto nn = m->machine; - if (nn) { - auto arg = nn->getLayerOutput(layerName); - return Arguments::createByPaddleArgument(&arg); - } else { - throw UnsupportError(); - } -} - -SequenceGenerator* GradientMachine::asSequenceGenerator( - const std::vector& dict, - size_t begin_id, - size_t end_id, - size_t max_length, - size_t beam_size) { - SequenceGenerator* r = - SequenceGenerator::createByGradientMachineSharedPtr(&m->machine); - r->setDict(dict); - r->setBos(begin_id); - r->setEos(end_id); - r->setMaxLength(max_length); - r->setBeamSize(beam_size); - return r; -} - -Evaluator* GradientMachine::makeEvaluator() { - auto ev = new Evaluator(); - ev->m->rawPtr = m->machine->makeEvaluator(); - return ev; -} - -void GradientMachine::eval(Evaluator* evaluator) { - m->machine->eval(evaluator->m->rawPtr); -} diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp deleted file mode 100644 index 8282b4629dc08a7fcd9b52cbc3492ac10d8ed55c..0000000000000000000000000000000000000000 --- a/paddle/api/Matrix.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/math/Matrix.h" -#include -#include -#include "PaddleAPI.h" -#include "paddle/math/CpuSparseMatrix.h" -#include "paddle/math/SparseMatrix.h" - -struct MatrixPrivate { - std::shared_ptr mat; -}; - -Matrix::Matrix() : m(new MatrixPrivate()) {} - -Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) { - auto* mat = reinterpret_cast(sharedPtr); - if ((*mat) != nullptr) { - auto m = new Matrix(); - m->m->mat = *mat; - return m; - } else { - return nullptr; - } -} - -Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::create(height, width, useGpu); - m->m->mat->zero(); - return m; -} - -Matrix* Matrix::createDense(const std::vector& data, - size_t height, - size_t width, - bool useGpu) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::create(height, width, useGpu); - m->m->mat->copyFrom(data.data(), data.size()); - return m; -} - -Matrix* Matrix::createDenseFromNumpy(float* data, - int dim1, - int dim2, - bool copy, - bool useGpu) throw(UnsupportError) { - if (useGpu) { - /// Gpu mode only supports copy=True - if (!copy) { - throw UnsupportError("Gpu mode only supports copy=True"); - } - return Matrix::createGpuDenseFromNumpy(data, dim1, dim2); - } else { - return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy); - } -} - -Matrix* Matrix::createCpuDenseFromNumpy(float* data, - int dim1, - int dim2, - bool copy) { - auto m = new Matrix(); - if (copy) { - m->m->mat = paddle::Matrix::create(dim1, dim2); - m->m->mat->copyFrom(data, dim1 * dim2); - } else { - m->m->mat = paddle::Matrix::create(data, dim1, dim2, false); - } - return m; -} - -Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::create(dim1, dim2, false, true); - m->m->mat->copyFrom(data, dim1 * dim2); - return m; -} - -Matrix* Matrix::createSparse(size_t height, - size_t width, - size_t nnz, - bool isNonVal, - bool isTrans, - bool useGpu) { - auto m = new Matrix(); - m->m->mat = paddle::Matrix::createSparseMatrix( - height, - width, - nnz, - isNonVal ? 
paddle::NO_VALUE : paddle::FLOAT_VALUE, - isTrans, - useGpu); - return m; -} - -Matrix::~Matrix() { delete m; } - -size_t Matrix::getHeight() const { return m->mat->getHeight(); } - -size_t Matrix::getWidth() const { return m->mat->getWidth(); } - -float Matrix::get(size_t x, size_t y) const throw(RangeError) { - if (x > this->getWidth() || y > this->getHeight()) { - RangeError e; - throw e; - } - return m->mat->getElement(x, y); -} - -void Matrix::set(size_t x, size_t y, float val) throw(RangeError, - UnsupportError) { - if (x > this->getWidth() || y > this->getHeight()) { - RangeError e; - throw e; - } - auto rawMat = m->mat.get(); - if (auto cDenseMat = dynamic_cast(rawMat)) { - *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val; - } else { - UnsupportError e; - throw e; - } -} - -bool Matrix::isSparse() const { - auto raw_mat = m->mat.get(); - return dynamic_cast(raw_mat) != nullptr || - dynamic_cast(raw_mat) != nullptr; -} - -SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr) { - return (SparseValueType)cpuSparseMat->getValueType(); - } else { - auto gpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (gpuSparseMat != nullptr) { - return (SparseValueType)gpuSparseMat->getValueType(); - } else { - UnsupportError e; - throw e; - } - } -} - -SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr) { - return (SparseFormatType)cpuSparseMat->getFormat(); - } else { - auto gpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (gpuSparseMat != nullptr) { - return SPARSE_CSR; - } else { - UnsupportError e; - throw e; - } - } -} - -IntArray Matrix::getSparseRowCols(size_t i) const - throw(UnsupportError, RangeError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr && - cpuSparseMat->getFormat() == paddle::SPARSE_CSR) { - if (i < cpuSparseMat->getHeight()) { - // cpuSparseMat->print(std::cout); - size_t len = cpuSparseMat->getColNum(i); - return IntArray(cpuSparseMat->getRowCols(i), len); - } else { - RangeError e; - throw e; - } - } else { - UnsupportError e; - throw e; - } -} - -IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const - throw(UnsupportError, RangeError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr && - cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) { - if (i < cpuSparseMat->getHeight()) { - return IntWithFloatArray(cpuSparseMat->getRowValues(i), - cpuSparseMat->getRowCols(i), - cpuSparseMat->getColNum(i)); - } else { - RangeError e; - throw e; - } - } else { - UnsupportError e; - throw e; - } -} - -FloatArray Matrix::getData() const { - auto rawMat = m->mat.get(); - if (dynamic_cast(rawMat->getMemoryHandle().get())) { - // is gpu. 
then copy data - float* data = rawMat->getData(); - size_t len = rawMat->getElementCnt(); - float* cpuData = new float[len]; - hl_memcpy_device2host(cpuData, data, len * sizeof(float)); - FloatArray ret_val(cpuData, len); - ret_val.needFree = true; - return ret_val; - } else { - FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt()); - return ret_val; - } -} - -void Matrix::sparseCopyFrom( - const std::vector& rows, - const std::vector& cols, - const std::vector& vals) throw(UnsupportError) { - auto cpuSparseMat = - std::dynamic_pointer_cast(m->mat); - if (cpuSparseMat != nullptr) { - // LOG(INFO) <<"RowSize = "<isSparse()) { - throw UnsupportError(); - } else { - *dim1 = m->mat->getHeight(); - *dim2 = m->mat->getWidth(); - *view_m_data = new float[(*dim1) * (*dim2)]; - if (auto cpuMat = dynamic_cast(m->mat.get())) { - auto src = cpuMat->getData(); - auto dest = *view_m_data; - std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); - } else if (auto gpuMat = dynamic_cast(m->mat.get())) { - auto src = gpuMat->getData(); - auto dest = *view_m_data; - hl_memcpy_device2host( - dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); - } else { - LOG(WARNING) << "Unexpected Situation"; - throw UnsupportError(); - } - } -} - -void Matrix::copyFromNumpyMat(float* data, - int dim1, - int dim2) throw(UnsupportError, RangeError) { - if (isSparse()) { - throw UnsupportError(); - } else { - if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) { - if (m->mat->getData() != data) { - m->mat->copyFrom(data, dim1 * dim2); - } - } else { - throw RangeError(); - } - } -} - -bool Matrix::isGpu() const { - auto rawPtr = m->mat.get(); - return dynamic_cast(rawPtr) != nullptr || - dynamic_cast(rawPtr) != nullptr; -} diff --git a/paddle/api/Paddle.i b/paddle/api/Paddle.i deleted file mode 100644 index 3237e73745dca58bed923b20851f0f0039a3487c..0000000000000000000000000000000000000000 --- a/paddle/api/Paddle.i +++ /dev/null @@ -1,202 +0,0 @@ -%module(directors="1") swig_paddle -%include "std_string.i" -%{ -#define SWIG_FILE_WITH_INIT -#include "api/PaddleAPI.h" -%} - -%include "exception.i" -%typemap(throws) UnsupportError %{ - SWIG_exception(SWIG_RuntimeError, $1.what()); - SWIG_fail; -%} - -%include "std_vector.i" -%include "std_pair.i" -#ifdef SWIGPYTHON -%include "numpy.i" -#endif - -%init %{ -#ifdef SWIGPYTHON -import_array(); -#endif -%} - - -namespace std { -%template(vector_int) vector; -%template(vector_uint) vector; -%template(vector_float) vector; -%template(vector_string) vector; -%template(vector_vec_star) vector; -} -#ifdef SWIGPYTHON -%typemap(in) (int argc, char** argv) { - int i = 0; - if (!PyList_Check($input)) { - PyErr_SetString(PyExc_ValueError, "Expecting a list"); - return NULL; - } - $1 = PyList_Size($input); - $2 = (char **) malloc(($1+1)*sizeof(char *)); - for (i = 0; i < $1; i++) { - PyObject *s = PyList_GetItem($input,i); - if (!PyString_Check(s)) { - free($2); - PyErr_SetString(PyExc_ValueError, "List items must be strings"); - return NULL; - } - $2[i] = PyString_AsString(s); - } - $2[i] = 0; -} -%typemap(freearg) (int argc, char** argv) { - if ($2) free($2); -} - -%typemap(out) FloatArray { - $result = PyList_New($1.length); - for (size_t i=0; i<$1.length; ++i) { - PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i])); - } - if($1.needFree) { - delete [] $1.buf; - } -} - -%typemap(out) IntArray { - $result = PyList_New($1.length); - for (size_t i=0; i<$1.length; ++i) { - PyList_SetItem($result, i, PyInt_FromLong($1.buf[i])); - } - if 
($1.needFree) { - delete [] $1.buf; - } -} - -%typemap(out) IntWithFloatArray { - $result = PyList_New($1.length); - for (size_t i=0; i<$1.length; ++i) { - PyList_SetItem($result, i, PyTuple_Pack(2, - PyInt_FromLong($1.idxBuf[i]), - PyFloat_FromDouble($1.valBuf[i]) - )); - } - if ($1.needFree) { - delete [] $1.idxBuf; - delete [] $1.valBuf; - } -} - - -%rename(__getitem__) IVector::get; -%rename(__setitem__) IVector::set; -%rename(__len__) IVector::getSize; -%rename(__getitem__) Vector::get; -%rename(__setitem__) Vector::set; -%rename(__len__) Vector::getSize; -%rename(__len__) Parameter::getSize; -%rename(__call__) ParameterTraverseCallback::apply; -%rename(__repr__) Evaluator::toString; - -%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { - (float* data, int dim1, int dim2) -} - -%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { - (float** view_data, int* dim1, int* dim2) -} - -%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) { - (float** view_m_data, int* dim1, int* dim2) -} - -%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) { - (int** view_m_data, int* dim1) -} - -%apply (int* INPLACE_ARRAY1, int DIM1) { - (int* data, int dim) -} - -%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) { - (int** view_data, int* dim1) -} - -%apply (float* INPLACE_ARRAY1, int DIM1) { - (float* data, int dim) -} - -%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) { - (float** view_data, int* dim1) -} - -%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) { - (float** view_m_data, int* dim1) -} - -#endif -// The below functions internally create object by "new", so it should use -// use SWIG to handle gc. There are hints for SWIG to handle GC. -%newobject Matrix::createZero; -%newobject Matrix::createSparse; -%newobject Matrix::createDense; -%newobject Matrix::createDenseFromNumpy; -%newobject Matrix::createCpuDenseFromNumpy; -%newobject Matrix::createGpuDenseFromNumpy; -%newobject Vector::createZero; -%newobject Vector::create; -%newobject Vector::createVectorFromNumpy; -%newobject Vector::createCpuVectorFromNumpy; -%newobject Vector::createGpuVectorFromNumpy; -%newobject IVector::createZero; -%newobject IVector::create; -%newobject IVector::createVectorFromNumpy; -%newobject IVector::createCpuVectorFromNumpy; -%newobject IVector::createGpuVectorFromNumpy; -%newobject Trainer::createByCommandLine; -%newobject Trainer::getForwardOutput; -%newobject Trainer::getLayerOutput; -%newobject Arguments::getSlotValue; -%newobject Arguments::getSlotIds; -%newobject Arguments::getSlotIn; -%newobject Arguments::getSlotSequenceStartPositions; -%newobject Arguments::getSlotSequenceDim; -%newobject Arguments::createArguments; -%newobject GradientMachine::createByConfigProtoStr; -%newobject GradientMachine::createByModelConfig; -%newobject GradientMachine::asSequenceGenerator; -%newobject GradientMachine::getParameter; -%newobject GradientMachine::getLayerOutput; -%newobject GradientMachine::makeEvaluator; -%newobject TrainerConfig::createFromTrainerConfigFile; -%newobject TrainerConfig::getModelConfig; -%newobject TrainerConfig::getOptimizationConfig; -%newobject Parameter::getBuf; -%newobject Parameter::getConfig; -%newobject ParameterOptimizer::create; -%newobject ParameterOptimizer::needSpecialTraversal; -%newobject ParameterUpdater::createLocalUpdater; -%newobject ParameterUpdater::createRemoteUpdater; -%newobject ParameterUpdater::createNewRemoteUpdater; - -%feature("director") UpdateCallback; -%feature("autodoc", 1); // To generate method stub, for code hint in ide - -// Ignore many private class, and method 
cannot be handled by swig. -%ignore MatrixPrivate; -%ignore TrainerPrivate; -%ignore IVector::operator[]; -%ignore ArgumentsPrivate; -%ignore GradientMachinePrivate; -%ignore TrainerConfigPrivate; -%ignore ModelConfigPrivate; -%ignore ParameterPrivate; -%ignore SequenceGeneratorPrivate; -%ignore VectorPrivate; -%ignore ParameterConfigPrivate; -%ignore OptimizationConfigPrivate; -%ignore ParameterTraverseCallbackPrivate; -%include "utils/GlobalConstants.h" -%include "api/PaddleAPI.h" diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h deleted file mode 100644 index 7866122006a996cbe5201c661cab9c81aa82a219..0000000000000000000000000000000000000000 --- a/paddle/api/PaddleAPI.h +++ /dev/null @@ -1,1054 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/GlobalConstants.h" - -/// Import PaddlePaddle's enumeration into global namespace. -using namespace paddle::enumeration_wrapper; // NOLINT - -/** - * @brief Initialize paddle. - * - * In python, this method should be invoked as - * @code - * import sys - * import paddle - * paddle.initPaddle(sys.argv) - * or you can change arguments as any list of str. - * @endcode - */ -void initPaddle(int argc, char** argv); - -/// Return FLAGS_use_gpu -bool isUsingGpu(); - -/// Set the Flags_use_gpu to the given parameter -void setUseGpu(bool useGpu); - -/// Return true if this py_paddle is compiled in GPU Version -bool isGpuVersion(); - -/// Return FLAGS_trainer_count -int getTrainerCount(); - -/// The Error of IO Operation. Such as file not found, etc. -class IOError {}; - -/// Out of range error -class RangeError {}; - -/// Not support Error, such as access GPU memory directly, etc. -class UnsupportError : public std::runtime_error { - public: - UnsupportError() : std::runtime_error(" ") {} - explicit UnsupportError(const std::string& message) - : std::runtime_error(message) {} -}; - -/// This type will map to python's list of float. -struct FloatArray { - const float* buf; - const size_t length; - bool needFree; // true if the buf is dynamic alloced. - FloatArray(const float* b, const size_t l); -}; - -/// This type will map to python's list of int -struct IntArray { - const int* buf; - const size_t length; - bool needFree; - IntArray(const int* b, const size_t l, bool f = false); -}; - -/// This type will map to python's list of (int, float) -struct IntWithFloatArray { - const float* valBuf; - const int* idxBuf; - const size_t length; - bool needFree; - IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false); -}; - -enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 }; - -enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 }; - -/** - * In Python, -1UL is hard to write. So define a const value used by python - * side. 
- */ -const size_t NO_SPARSE_ID = -1UL; - -struct MatrixPrivate; -class Matrix { - Matrix(); // User Cannot Create Matrix. - DISABLE_COPY(Matrix); - static Matrix* createByPaddleMatrixPtr(void* sharedPtr); - - public: - virtual ~Matrix(); - - /** - * Create A Matrix with height,width, which is filled by zero. - */ - static Matrix* createZero(size_t height, - size_t width, - bool useGpu = isUsingGpu()); - - /** - * Create Sparse Matrix. - * - * After create sparse, sparseCopyFrom can be used to fill matrix. - * - * @param nnz Number of non zero values. - * - * @note the default sparse type is SPARSE_CSR. - */ - static Matrix* createSparse(size_t height, - size_t width, - size_t nnz, - bool isNonVal = true, - bool trans = false, - bool useGpu = isUsingGpu()); - - /** - * Create Dense Matrix. - * - * @param data list of float should be passed in python. - * @note the value will be copy into a new matrix. - */ - static Matrix* createDense(const std::vector& data, - size_t height, - size_t width, - bool useGpu = isUsingGpu()); - - static Matrix* createDenseFromNumpy( - float* data, - int dim1, - int dim2, - bool copy = true, - bool useGpu = isUsingGpu()) throw(UnsupportError); - - /** - * Create Cpu Dense Matrix from numpy matrix, dtype=float32 - * - * @param data a numpy matrix. - * @param dim1 dimension of data. - * @param dim2 dimension of data. - * @param copy true if copy into a new matrix, false will create - * matrix inplace. copy = false should be used with extreme - * care because Matrix will share the memory with the given - * numpy array. If the numpy array object is no longer valid, - * the memory space will not be usable. - */ - static Matrix* createCpuDenseFromNumpy(float* data, - int dim1, - int dim2, - bool copy = true); - - /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 - static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2); - - /** - * Cast to numpy matrix. - * - * @note This method take no parameter in python. - * @note This method in python will return a numpy matrix, not void. - * @note Only CpuDenseMatrix is supported. - * - * Example: - * @code - * import paddle - * m = paddle.Matrix.createZero(10,2) - * numpy_mat = m.toNumpyMat() - * @endcode - */ - void toNumpyMatInplace(float** view_data, - int* dim1, - int* dim2) throw(UnsupportError); - - /// Copy To numpy mat. - void copyToNumpyMat(float** view_m_data, - int* dim1, - int* dim2) throw(UnsupportError); - - /// Copy From Numpy Mat - void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, - RangeError); - - /// return true if this matrix is sparse. - bool isSparse() const; - - SparseValueType getSparseValueType() const throw(UnsupportError); - - SparseFormatType getSparseFormat() const throw(UnsupportError); - - IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError); - - IntWithFloatArray getSparseRowColsVal(size_t i) const - throw(UnsupportError, RangeError); - - size_t getHeight() const; - - size_t getWidth() const; - - float get(size_t x, size_t y) const throw(RangeError); - - void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError); - - /// return type is list of float - FloatArray getData() const; - - /** - * Copy from rows, cols, values. 
- * - * if sparse_nonvalue, the values should be [] - */ - void sparseCopyFrom(const std::vector& rows, - const std::vector& cols, - const std::vector& values = - std::vector()) throw(UnsupportError); - - bool isGpu() const; - - private: - void* getSharedPtr() const; - - MatrixPrivate* m; - friend class Trainer; - friend class GradientMachine; - friend class Arguments; -}; - -struct VectorPrivate; -class Vector { - DISABLE_COPY(Vector); - Vector(); - static Vector* createByPaddleVectorPtr(void* ptr); - - void* getSharedPtr(); - - public: - ~Vector(); - - /// Create Vector filled with zero. - static Vector* createZero(size_t sz, bool useGpu = isUsingGpu()); - - /** - * Create Vector from list of float. - * - * It will create a new vector, and copy data into it. - */ - static Vector* create(const std::vector& data, - bool useGpu = isUsingGpu()); - - static Vector* createVectorFromNumpy( - float* data, - int dim, - bool copy = true, - bool useGpu = isUsingGpu()) throw(UnsupportError); - /** - * Create Cpu Vector from numpy array, which dtype=float32 - * - * If copy is false, it will create vector inplace. - */ - static Vector* createCpuVectorFromNumpy(float* data, - int dim, - bool copy = true); - - /// Create Gpu Vector from numpy array, which dtype=float32 - static Vector* createGpuVectorFromNumpy(float* data, int dim); - - /** - * copy from another vector - * throw(RangeError) if size of src vector is different from size of this - * vector - */ - void copyFrom(Vector* src) throw(RangeError); - - /// Cast to numpy array inplace. - void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError); - - /// Copy to numpy array. - void copyToNumpyArray(float** view_m_data, int* dim1); - - /// Copy from numpy array. - void copyFromNumpyArray(float* data, int dim); - - /// __getitem__ in python - float get(const size_t idx) const throw(RangeError, UnsupportError); - - /// __setitem__ in python - void set(const size_t idx, float val) throw(RangeError, UnsupportError); - - /// Return is GPU vector or not. - bool isGpu() const; - - /// Return a list of float, the memory is alloced and copied. - FloatArray getData() const; - - /// __len__ in python - size_t getSize() const; - - private: - VectorPrivate* m; - - private: - friend class Parameter; - friend class ParameterOptimizer; - friend struct ParameterTraverseCallbackPrivate; -}; - -struct IVectorPrivate; -class IVector { - IVector(); - DISABLE_COPY(IVector); - static IVector* createByPaddleVectorPtr(void* ptr); - - public: - /// Create IVector filled with zero - static IVector* createZero(size_t sz, bool useGpu = isUsingGpu()); - - /** - * Create IVector from list of int. - * It will create a new vector, and copy data into it. - */ - static IVector* create(const std::vector& data, - bool useGpu = isUsingGpu()); - - static IVector* createVectorFromNumpy( - int* data, - int dim, - bool copy = true, - bool useGpu = isUsingGpu()) throw(UnsupportError); - - /** - * Create Cpu IVector from numpy array, which dtype=int32 - * - * If copy is false, it will create vector inplace - */ - static IVector* createCpuVectorFromNumpy(int* data, - int dim, - bool copy = true); - /** - * Create Gpu IVector from numpy array, which dtype=int32 - */ - static IVector* createGpuVectorFromNumpy(int* data, int dim); - - /// Cast to numpy array inplace. - void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError); - - /// Copy to numpy array. - void copyToNumpyArray(int** view_m_data, int* dim1); - - /// Copy from numpy array. 
- void copyFromNumpyArray(int* data, int dim); - - virtual ~IVector(); - - /// Return a list of int, the memory is alloced and copied. - IntArray getData() const; - - /// This method will map to python [] method. - int& operator[](const size_t idx) throw(RangeError, UnsupportError); - - const int& operator[](const size_t idx) const - throw(RangeError, UnsupportError); - - inline int get(const size_t idx) const throw(RangeError, UnsupportError) { - return (*this)[idx]; - } - - inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) { - (*this)[idx] = val; - } - - /// Return true if it is gpu vector. - bool isGpu() const; - - /// This method will map to python __len__(); - size_t getSize() const; - - private: - void* getSharedPtr() const; - - friend class Arguments; - IVectorPrivate* m; -}; - -struct ArgumentsPrivate; - -/// The Arguments is actual a std::vector in paddle. -class Arguments { - private: - Arguments(); // Internal Create. - DISABLE_COPY(Arguments); - - public: - /** - * Create a arguments with size. - * Note that it can be zero. - */ - static Arguments* createArguments(size_t slotNum); - - void resize(size_t slotNum); - - virtual ~Arguments(); - - /** - * Return the slot number that aguments contains. - * - * It is actually the vector's size - */ - size_t getSlotNum() const; - - /** - * The get functions of Arguments - * - * the param idx is the slot id - */ - Matrix* getSlotValue(size_t idx) const throw(RangeError); - Matrix* getSlotGrad(size_t idx) const throw(RangeError); - IVector* getSlotIds(size_t idx) const throw(RangeError); - Matrix* getSlotIn(size_t idx) const throw(RangeError); - IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError); - IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError); - IVector* getSlotSequenceDim(size_t idx) const throw(RangeError); - // End Of get functions of Arguments - - int64_t getBatchSize(size_t idx = 0) const throw(RangeError); - - /** - * The set functions of Arguments. - * - * The param idx is the slot id. - * The other param is the input Matrix or vector. - */ - void setSlotValue(size_t idx, Matrix* mat) throw(RangeError); - void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError); - void setSlotIn(size_t idx, Matrix* mat) throw(RangeError); - void setSlotIds(size_t idx, IVector* vec) throw(RangeError); - void setSlotSequenceStartPositions(size_t idx, - IVector* vec) throw(RangeError); - void setSlotSubSequenceStartPositions(size_t idx, - IVector* vec) throw(RangeError); - void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); - - /** - * Set the frame height of the idx-th Argument. - * - * @param ids The index of which Argument. - * @param h The height value. - */ - void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError); - - /** - * Set the frame height of the idx-th Argument. - * - * @param ids The index of which Argument. - * @param h The height value. 
- */ - void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError); - - size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError); - size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError); - - float sum() const; - - private: - static Arguments* createByPaddleArgumentVector(void* ptr); - static Arguments* createByPaddleArgument(const void* ptr); - void* getInternalArgumentsPtr() const; - - private: - ArgumentsPrivate* m; - friend class Trainer; - friend class GradientMachine; - friend class SequenceGenerator; -}; - -enum GradientMatchineCreateMode { - CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal, - CREATE_MODE_SGD_SPARSE_CPU_TRAINING = - paddle::GradientMachine::kSgdSparseCpuTraining, - CREATE_MODE_TESTING = paddle::GradientMachine::kTesting -}; - -struct ParameterConfigPrivate; -class ParameterConfig { - DISABLE_COPY(ParameterConfig); - ParameterConfig(); - - /** - * Internal methods - */ - static ParameterConfig* createParameterConfigFromParameterSharedPtr( - void* ptr); - static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr); - void* getRawPtr(); - - public: - ~ParameterConfig(); - - /** - * return proto buf string. - */ - std::string toProtoString() const; - - private: - ParameterConfigPrivate* m; - - private: - friend class Parameter; - friend class ParameterOptimizer; - friend struct ParameterTraverseCallbackPrivate; -}; - -struct OptimizationConfigPrivate; -class OptimizationConfig { - DISABLE_COPY(OptimizationConfig); - OptimizationConfig(); - - public: - static OptimizationConfig* createFromProtoString(const std::string& str); - ~OptimizationConfig(); - - /** - * return protobuf string. - */ - std::string toProtoString(); - - private: - OptimizationConfigPrivate* m; - - friend class TrainerConfig; - friend class ParameterOptimizer; - friend class ParameterUpdater; - friend class Trainer; -}; - -struct ParameterPrivate; -class Parameter { - private: - Parameter(); - DISABLE_COPY(Parameter); - - public: - virtual ~Parameter(); - - /** - * get parameter name - */ - std::string getName() const; - - /** - * get buf in Parameter - */ - Vector* getBuf(ParameterType type); - - /** - * get id - */ - size_t getID() const; - - ParameterConfig* getConfig(); - void setValueUpdated(); - - bool save(const std::string& filename) const; - - bool load(const std::string& filename) const; - - size_t getSize() const; - - private: - static Parameter* createFromRawPtr(void* ptr); - static Parameter* createFromSharedPtr(void* ptr); - - private: - ParameterPrivate* m; - friend class UpdateCallbackWrapper; - friend class GradientMachine; - friend class ParameterUpdater; -}; - -struct ModelConfigPrivate; -/** - * You can only get model config from TrainerConfig. - * - * It is used by GradientMachine. - */ -class ModelConfig { - private: - ModelConfig(); - DISABLE_COPY(ModelConfig); - - public: - virtual ~ModelConfig(); - - private: - ModelConfigPrivate* m; - friend class TrainerConfig; - friend struct TrainerConfigPrivate; - friend class GradientMachine; -}; - -struct TrainerConfigPrivate; -/** - * To get TrainerConfig from file. - * - * It is used by GradientMachine. 
- */ -class TrainerConfig { - private: - TrainerConfig(); - DISABLE_COPY(TrainerConfig); - - public: - virtual ~TrainerConfig(); - - static TrainerConfig* createFromTrainerConfigFile( - const std::string& configPath); - static TrainerConfig* createFromProtoString(const std::string& str); - - ModelConfig* getModelConfig() const; - - OptimizationConfig* getOptimizationConfig() const; - - private: - TrainerConfigPrivate* m; - friend class Trainer; -}; - -/** - * The callback in backword. - * - * You can inherit this class in python. - * - * @code - * class UpdateCallbackInPython(paddle.UpdateCallback): - * def __init__(self): - * paddle.UpdateCallback.__init__(self) - * - * def apply(self, param): - * assert isinstance(param, paddle.Parameter) - * @endcode - */ -class UpdateCallback { - public: - virtual ~UpdateCallback(); - virtual void apply(Parameter* p); -}; - -struct ParameterTraverseCallbackPrivate; -class ParameterTraverseCallback { - DISABLE_COPY(ParameterTraverseCallback); - ParameterTraverseCallback(); - - public: - ~ParameterTraverseCallback(); - - void apply(const std::vector& vecs, - const ParameterConfig& config, - size_t sparseId); - - private: - ParameterTraverseCallbackPrivate* m; - friend class ParameterOptimizer; -}; - -/** - * The ParameterOptimizer Wrapper Class. - * - * Basically same as common/ParameterOptimizer.h - */ -struct ParameterOptimizerPrivate; -class ParameterOptimizer { - DISABLE_COPY(ParameterOptimizer); - ParameterOptimizer(); - - public: - static ParameterOptimizer* create(OptimizationConfig* config); - - ~ParameterOptimizer(); - - void init(size_t numRows, const ParameterConfig* config); - - void startPass(); - - void finishPass(); - - void startBatch(size_t numSamplesProcessed); - - void finishBatch(); - - void update(const std::vector& vecs, - const ParameterConfig& conf, - size_t sparseId = NO_SPARSE_ID); - - std::vector getParameterTypes() const; - - ParameterTraverseCallback* needSpecialTraversal( - const ParameterConfig& config) const; - - private: - ParameterOptimizerPrivate* m; -}; - -class SequenceGenerator; -class Evaluator; -struct GradientMachinePrivate; -class GradientMachine { - private: - GradientMachine(); - DISABLE_COPY(GradientMachine); - - public: - virtual ~GradientMachine(); - - /** - * Create By ProtoStr. - * - * The ProtoStr can be generate by python's protobuf code. - */ - static GradientMachine* createByConfigProtoStr( - const std::string& protoStr, - GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, - const std::vector& parameterTypes = defaultParamTypes); - - /** - * Create by ModelConfig object. - * - * To get ModelConfig, you can get TrainerConfig from config file, then get - * model config by TrainerConfig - */ - static GradientMachine* createByModelConfig( - ModelConfig* conf, - GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, - const std::vector& parameterTypes = defaultParamTypes); - - /** - * @brief finish - */ - void finish(); - - void start(); - - /** - * Prefetch row ids of sparse parameter. - */ - void prefetch(const Arguments& inArgs); - - /** - * Do some thing when train pass ended. - */ - void onPassEnd(); - - /** - * The forward stage of GradientMachine. - * - * @note the outArgs could be zero length arguemnts. - * @note THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL. - */ - void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType); - - /** - * The backward stage of GradientMachine. 
- * - * @note Currently the ParameterUpdater is not wrapped in SWIG, so backward - * cannot actually train a network. But you can write a update callback to - * change the parameter or implement a ParameterUpdater in python side. - */ - void backward(const UpdateCallback& callback = UpdateCallback()); - - /** - * Combine forward/backward - */ - void forwardBackward(const Arguments& inArgs, - Arguments* outArgs, - PassType passType, - const UpdateCallback& callback = UpdateCallback()); - - void loadParameters(const std::string& path); - - size_t getParameterSize() const; - Parameter* getParameter(size_t i) throw(RangeError); - - size_t getNonStaticParameterSize() const; - Parameter* getNonStaticParameter(size_t i) throw(RangeError); - - void randParameters(); - - Arguments* getLayerOutput(const std::string& layerName) const - throw(UnsupportError); - - /** - * Create a sequence generator. - * - * @note It just like a paddle_gen_sequence. - */ - SequenceGenerator* asSequenceGenerator( - const std::vector& dict = std::vector(), - size_t begin_id = 0UL, - size_t end_id = 0UL, - size_t max_length = 100UL, - size_t beam_size = -1UL); - - Evaluator* makeEvaluator(); - - void eval(Evaluator* evaluator); - - private: - GradientMachinePrivate* m; - - static GradientMachine* createFromPaddleModelPtr( - const void* confPtr, - GradientMatchineCreateMode mode, - const std::vector& types); - - // Not to use c++ 11 init-list, so we use static var as function default arg. - static std::vector defaultParamTypes; - friend class Trainer; - friend class ParameterUpdater; -}; - -struct ParameterUpdaterPrivate; -class ParameterUpdater { - private: - ParameterUpdater(); - - public: - static ParameterUpdater* createLocalUpdater(OptimizationConfig* config); - static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config, - int passCount, - bool useSparseUpdater); - static ParameterUpdater* createNewRemoteUpdater( - OptimizationConfig* config, - const std::string pserverSpec, - const bool useEtcd) throw(UnsupportError); - ~ParameterUpdater(); - - /** - * @brief initialize Parameter Updater by GradientMachine. - * @param gm - */ - void init(const GradientMachine& gm); - - /** - * @brief begin of a training/testing of one pass. - */ - void startPass(); - - /** - * @brief end of a traning/testing of one pass. - */ - void finishPass(); - - /** - * @brief begin of a training/testing of one batch. - * @param data batch's size - * @return PassType, mostly will be training. - */ - PassType startBatch(size_t batchSize); - - /** - * @brief end of a traning/testing of one batch - * @param cost current batch cost. - */ - void finishBatch(float cost); - - /** - * @brief update a parameter (by local optimizer or by cluster pserver) - * @param param - */ - void update(Parameter* param); - - /** - * @breif only get required sparse rows by default. - * @param fullSize: get full matrix parameter if *fullSize* set - * @param apply: get PARAMETER_APPLY on pserver if *apply* set - */ - void getParametersRemote(bool fullSize = false, bool apply = false); - - /** - * @brief restore the average parameter. - * @note It is only used in AverageOptimizer. Restore will get the current - * PARAMETER_VALUE back. - */ - void restore(); - - /** - * @brief apply. Store the average parameter. - * @note It is only used in AverageOptimizer. Apply will store the current - * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save - * it to PARAMETER_VALUE. 
- */ - void apply(); - - /** - * @brief catchUpWith The Regularization will be delayed in many situations( - * pserver, local sparse). Catch Up means catch the regularization up, apply - * regularization to all params. - */ - void catchUpWith(); - - private: - ParameterUpdaterPrivate* m; -}; - -struct EvaluatorPrivate; -class Evaluator { - private: - Evaluator(); - DISABLE_COPY(Evaluator); - - public: - ~Evaluator(); - - /** - * @brief begin an evaluate stage. - */ - void start(); - - /** - * @brief end an evaluate stage. - */ - void finish(); - - /** - * @brief toString will get a evaluate result. - * - * __repr__ method in python - */ - std::string toString(); - - std::vector getNames() const; - - double getValue(const std::string name) const; - - private: - EvaluatorPrivate* m; - - friend class GradientMachine; -}; - -struct TrainerPrivate; -class Trainer { - private: - TrainerPrivate* m; - Trainer(); - Trainer(TrainerConfig* optConfig, GradientMachine* gm); - DISABLE_COPY(Trainer); - - public: - virtual ~Trainer(); - - /// Create A Trainer By TrainerConfig. using paddle command line. - static Trainer* createByCommandLine() throw(IOError); - - static Trainer* create(TrainerConfig* optConfig, - GradientMachine* gm) throw(IOError); - - /// Start training - void startTrain(); - - /// Finish training - void finishTrain(); - - /// Start a pass. - void startTrainPass(); - - /// Finish a pass - void finishTrainPass(); - - /** - * Train one batch, - * - * @return true if all batch finished. - */ - bool trainOneBatch(size_t batchSize); - - void trainOneDataBatch(size_t batchSize, const Arguments& args); - - void startTestPeriod(); - void testOneDataBatch(size_t batchSize, const Arguments& args); - void finishTestPeriod(); - - void forwardOneBatch(size_t batchSize); - - Arguments* getForwardOutput(); - - Arguments* getLayerOutput(const std::string& layerName) const; -}; - -/// the N-Best results generated from one input sequence. -class ISequenceResults { - public: - virtual ~ISequenceResults(); - - /// Number of result. - virtual size_t getSize() const = 0; - - /** - * Get sentence from dictionary. - * - * @param id the index of result. - * @param split if true, the return sentence will be splited with ' ' by - * each word. Default is false. - */ - virtual std::string getSentence(size_t id, bool split = false) const - throw(RangeError) = 0; - virtual std::vector getSequence(size_t id) const throw(RangeError) = 0; - virtual float getScore(size_t id) const throw(RangeError) = 0; -}; - -struct SequenceGeneratorPrivate; -class SequenceGenerator { - DISABLE_COPY(SequenceGenerator); - SequenceGenerator(); - - public: - virtual ~SequenceGenerator(); - - /** - * Generate Sequence by input. - * - * @note The inArgs is just one sequence of data. - * @note The return will get a N-best generate result by inArgs. - * Sort by score. 
- */ - ISequenceResults* generateSequence(const Arguments& inArgs) const; - - void setDict(const std::vector& dict); - void setBos(size_t bos); - void setEos(size_t eos); - void setMaxLength(size_t maxlength); - void setBeamSize(size_t beamSize); - - private: - static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr); - friend class GradientMachine; - - private: - SequenceGeneratorPrivate* m; -}; diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h deleted file mode 100644 index e141fcd761d7db2d3836a6343700ac4a7ca80c16..0000000000000000000000000000000000000000 --- a/paddle/api/PaddleAPIPrivate.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once -#include -#include "PaddleAPI.h" -#include "paddle/gserver/evaluators/Evaluator.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/parameter/ParameterUpdaterBase.h" -#include "paddle/trainer/TrainerConfigHelper.h" - -struct GradientMachinePrivate { - std::shared_ptr machine; - - template - inline T& cast(void* ptr) { - return *(T*)(ptr); - } -}; - -struct OptimizationConfigPrivate { - std::shared_ptr trainer_config; - paddle::OptimizationConfig config; - - const paddle::OptimizationConfig& getConfig() { - if (trainer_config != nullptr) { - return trainer_config->getOptConfig(); - } else { - return config; - } - } -}; - -struct TrainerConfigPrivate { - std::shared_ptr conf; - TrainerConfigPrivate() {} -}; - -struct ModelConfigPrivate { - std::shared_ptr conf; -}; - -struct ArgumentsPrivate { - std::vector outputs; - - inline paddle::Argument& getArg(size_t idx) throw(RangeError) { - if (idx < outputs.size()) { - return outputs[idx]; - } else { - RangeError e; - throw e; - } - } - - template - std::shared_ptr& cast(void* rawPtr) const { - return *(std::shared_ptr*)(rawPtr); - } -}; - -struct ParameterUpdaterPrivate { - std::unique_ptr updater; -}; - -struct ParameterPrivate { - std::shared_ptr sharedPtr; - paddle::Parameter* rawPtr; // rawPtr only used in ParameterUpdater, - // in other situation sharedPtr should - // contains value. - - ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {} - - paddle::Parameter* getPtr() { - if (sharedPtr) { - return sharedPtr.get(); - } else { - return rawPtr; - } - } -}; - -struct EvaluatorPrivate { - paddle::Evaluator* rawPtr; - - EvaluatorPrivate() : rawPtr(nullptr) {} - ~EvaluatorPrivate() { delete rawPtr; } -}; diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp deleted file mode 100644 index 589d22e74e742de2595a9efd17412ddc55159230..0000000000000000000000000000000000000000 --- a/paddle/api/Parameter.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/parameter/Parameter.h" -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -Parameter::Parameter() : m(new ParameterPrivate()) {} - -Parameter::~Parameter() { delete m; } - -Parameter* Parameter::createFromRawPtr(void* ptr) { - auto p = new Parameter(); - p->m->rawPtr = *static_cast(ptr); - return p; -} - -Parameter* Parameter::createFromSharedPtr(void* ptr) { - auto& p = *(paddle::ParameterPtr*)(ptr); - if (p == nullptr) { - return nullptr; - } else { - auto retParam = new Parameter(); - retParam->m->sharedPtr = p; - return retParam; - } -} - -std::string Parameter::getName() const { return m->getPtr()->getName(); } - -Vector* Parameter::getBuf(ParameterType type) { - auto buf = m->getPtr()->getBuf(type); - return Vector::createByPaddleVectorPtr(&buf); -} - -ParameterConfig* Parameter::getConfig() { - if (m->sharedPtr) { - return ParameterConfig::createParameterConfigFromParameterSharedPtr( - &m->sharedPtr); - } else { - return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr); - } -} - -size_t Parameter::getID() const { return m->getPtr()->getID(); } - -void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); } - -bool Parameter::save(const std::string& filename) const { - return m->getPtr()->save(filename); -} - -bool Parameter::load(const std::string& filename) const { - return m->getPtr()->load(filename); -} - -size_t Parameter::getSize() const { return m->getPtr()->getSize(); } diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp deleted file mode 100644 index d4620be3e6f26cdd4caffffac712e4ef936b222a..0000000000000000000000000000000000000000 --- a/paddle/api/ParameterOptimizer.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/parameter/ParameterOptimizer.h" -#include <algorithm> -#include "Internal.h" -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -struct ParameterOptimizerPrivate { - std::unique_ptr<paddle::ParameterOptimizer> optimizer; -}; - -struct ParameterTraverseCallbackPrivate { - paddle::ParameterOptimizer::TraverseCallback callback; - - ParameterTraverseCallbackPrivate() {} - - ParameterTraverseCallbackPrivate( - const paddle::ParameterOptimizer::TraverseCallback& callback) - : callback(callback) {} - - void apply(const std::vector<Vector*>& vecs, - const ParameterConfig& conf, - size_t sparseId) { - std::vector<paddle::VectorPtr> real_vecs; - real_vecs.resize(vecs.size()); - std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) { - if (v) { - return *(paddle::VectorPtr*)(v->getSharedPtr()); - } else { - return paddle::VectorPtr(); - } - }); - - paddle::ParameterConfig& real_conf = - *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf) - .getRawPtr()); - callback(real_vecs.data(), real_conf, sparseId); - } -}; - -ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} - -ParameterOptimizer::~ParameterOptimizer() { delete m; } - -ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { - CHECK(config != nullptr); - auto retOptimizer = new ParameterOptimizer(); - retOptimizer->m->optimizer.reset( - paddle::ParameterOptimizer::create(config->m->getConfig(), false)); - return retOptimizer; -} - -void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) { - auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config) - ->getRawPtr()); - m->optimizer->init(numRows, &conf); -} - -void ParameterOptimizer::startPass() { m->optimizer->startPass(); } - -void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); } - -void ParameterOptimizer::startBatch(size_t numSamplesProcessed) { - constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1); - CHECK_EQ(numSamplesProcessed & high_1, 0UL); // Safely cast. 
- m->optimizer->startBatch((int64_t)numSamplesProcessed); -} - -void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); } - -void ParameterOptimizer::update(const std::vector& vecs, - const ParameterConfig& conf, - size_t sparseId) { - ParameterTraverseCallbackPrivate invoker( - [&](const paddle::VectorPtr _vecs[], - const paddle::ParameterConfig& config, - size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); - invoker.apply(vecs, conf, sparseId); -} - -std::vector ParameterOptimizer::getParameterTypes() const { - std::vector returnValue; - staticCastVector(&returnValue, m->optimizer->getParameterTypes()); - return returnValue; -} - -ParameterTraverseCallback::ParameterTraverseCallback() - : m(new ParameterTraverseCallbackPrivate()) {} - -ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } - -void ParameterTraverseCallback::apply(const std::vector& vecs, - const ParameterConfig& conf, - size_t sparseId) { - m->apply(vecs, conf, sparseId); -} - -ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal( - const ParameterConfig& config) const { - auto& param_config = - *(paddle::ParameterConfig*)const_cast(config) - .getRawPtr(); - auto callback = m->optimizer->needSpecialTraversal(param_config); - if (callback) { - auto retCallback = new ParameterTraverseCallback(); - retCallback->m->callback = callback; - return retCallback; - } else { - return nullptr; - } -} diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp deleted file mode 100644 index 63c000c959f67dc682190b73bac24640ca8d0682..0000000000000000000000000000000000000000 --- a/paddle/api/ParameterUpdater.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PaddleAPI.h" - -#include "PaddleAPIPrivate.h" -#ifndef PADDLE_WITHOUT_GOLANG -#include "paddle/trainer/NewRemoteParameterUpdater.h" -#endif -#include "paddle/trainer/RemoteParameterUpdater.h" -#include "paddle/trainer/ThreadParameterUpdater.h" - -ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {} - -ParameterUpdater *ParameterUpdater::createLocalUpdater( - OptimizationConfig *config) { - auto updater = new ParameterUpdater(); - updater->m->updater.reset( - new paddle::SgdThreadUpdater(config->m->getConfig())); - return updater; -} - -ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( - OptimizationConfig *config, - const std::string pserverSpec, - const bool useEtcd) throw(UnsupportError) { -#ifndef PADDLE_WITHOUT_GOLANG - auto updater = new ParameterUpdater(); - updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( - config->m->getConfig(), pserverSpec, useEtcd)); - return updater; -#else - throw UnsupportError("not compiled with WITH_GOLANG"); -#endif -} - -ParameterUpdater *ParameterUpdater::createRemoteUpdater( - OptimizationConfig *config, int passCount, bool useSparseUpdater) { - auto updater = new ParameterUpdater(); - auto remoteUpdater = new paddle::RemoteParameterUpdater( - config->m->getConfig(), passCount, nullptr); - if (useSparseUpdater) { - std::unique_ptr remoteUpdaterPtr(remoteUpdater); - auto sparseRemoteUpdater = - new paddle::SparseRemoteParameterUpdaterComposite( - config->m->getConfig(), - passCount, - false, - std::move(remoteUpdaterPtr)); - updater->m->updater.reset(sparseRemoteUpdater); - } else { - updater->m->updater.reset(remoteUpdater); - } - return updater; -} - -ParameterUpdater::~ParameterUpdater() { delete m; } - -void ParameterUpdater::init(const GradientMachine &gm) { - m->updater->init(gm.m->machine->getNonStaticParameters()); -} - -void ParameterUpdater::startPass() { m->updater->startPass(); } - -void ParameterUpdater::finishPass() { m->updater->finishPass(); } - -PassType ParameterUpdater::startBatch(size_t batchSize) { - return m->updater->startBatch((int64_t)batchSize); -} - -void ParameterUpdater::finishBatch(float cost) { - m->updater->finishBatch(cost); -} - -void ParameterUpdater::update(Parameter *param) { - auto paddleParam = param->m->getPtr(); - m->updater->update(paddleParam); -} - -void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) { - m->updater->getParametersRemote(fullSize, apply); -} - -void ParameterUpdater::restore() { m->updater->restore(); } - -void ParameterUpdater::apply() { m->updater->apply(); } - -void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); } diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp deleted file mode 100644 index 1446c3084238859a759669f3a32c7efde67dcc2b..0000000000000000000000000000000000000000 --- a/paddle/api/SequenceGenerator.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include "PaddleAPI.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/parameter/Argument.h" -#include "paddle/utils/Flags.h" - -// used to represent partial sequence -struct Path { - std::vector ids; - float logProb; - paddle::MachineState machineState; - - Path() { logProb = 0; } - - Path(std::vector& ids, float logProb, paddle::MachineState& machineState) - : ids(ids), logProb(logProb), machineState(machineState) {} - - bool operator<(const Path& other) const { return (logProb > other.logProb); } -}; - -// Return top k (k == beam_size) optimal paths using beam search. The last -// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer -// as output and outArgs thus stores top k labels and their probabilities per -// position -static void findNBest(paddle::GradientMachine* gradMachine, - std::vector& inArgs, - std::vector& finalPaths, - size_t bos_id, - size_t eos_id, - size_t max_length) { - std::vector paths; - Path emptyPath; - paths.push_back(emptyPath); - finalPaths.clear(); - gradMachine->resetState(); - paddle::Argument feedback = inArgs.back(); - feedback.ids->setElement(0, (int)(bos_id)); - float minFinalPathLogProb = 0; - size_t beam = 0; - int id; - std::vector outArgs; - while (true) { // iterate over each generated word - std::vector newPaths; - paddle::MachineState machineState; - for (size_t j = 0; j < paths.size(); j++) { - Path& path = paths[j]; - if (path.machineState.size() > 0) { - gradMachine->setState(path.machineState); - feedback.ids->setElement(0, path.ids.back()); - } - gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST); - gradMachine->getState(machineState); - beam = outArgs[0].ids->getSize(); - for (size_t k = 0; k < beam; k++) { - id = outArgs[0].ids->getElement(k); - float prob = outArgs[0].in->getElement(0, k); - std::vector nids(path.ids); - nids.push_back(id); - float newLogProb = path.logProb + log(prob); - Path newPath(nids, newLogProb, machineState); - if (id == (int)eos_id || nids.size() >= max_length) { - finalPaths.push_back(newPath); - if (minFinalPathLogProb > newPath.logProb) { - minFinalPathLogProb = newPath.logProb; - } - } else { - newPaths.push_back(newPath); - } - } - } - - if (newPaths.size() == 0) { - break; - } - std::nth_element(newPaths.begin(), - newPaths.begin() + std::min(beam, newPaths.size()), - newPaths.end()); - if (newPaths.size() > beam) { - newPaths.resize(beam); - } - // pathA < pathB means pathA.logProb > pathB.logProb - float maxPathLogProb = - std::min_element(newPaths.begin(), newPaths.end())->logProb; - if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) { - break; - } - paths = newPaths; - } // end while - - std::partial_sort(finalPaths.begin(), - finalPaths.begin() + std::min(beam, finalPaths.size()), - finalPaths.end()); - if (finalPaths.size() > beam) { - finalPaths.resize(beam); - } -} - -struct SequenceGeneratorPrivate { - std::shared_ptr machine; - std::shared_ptr> dict; - size_t beginPos; - size_t endPos; - size_t maxLength; - - paddle::Argument feedback; - - template - inline T& cast(void* ptr) { - return *(T*)(ptr); - } - - inline void findNBest(std::vector& inArgs, - std::vector& path) { - ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength); - } - - SequenceGeneratorPrivate() - : dict(std::make_shared>()), - beginPos(0UL), - endPos(0UL), - maxLength(0UL), - feedback(__create_feedback__()) {} - - private: - static paddle::Argument __create_feedback__() { - 
paddle::Argument feedback; - feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu); - - feedback.sequenceStartPositions = - paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false); - feedback.sequenceStartPositions->getMutableData(false)[0] = 0; - feedback.sequenceStartPositions->getMutableData(false)[1] = 1; - return feedback; - } -}; - -SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {} - -SequenceGenerator::~SequenceGenerator() { delete m; } - -class PathSequenceResults : public ISequenceResults { - // ISequenceResults interface - public: - PathSequenceResults(const std::shared_ptr>& path, - const std::shared_ptr>& dict) - : path_(path), dict_(dict) {} - - size_t getSize() const { return path_->size(); } - std::string getSentence(size_t id, bool split) const throw(RangeError) { - if (id < getSize()) { - Path& p = (*path_)[id]; - std::ostringstream sout; - std::transform(p.ids.begin(), - p.ids.end(), - std::ostream_iterator(sout, split ? " " : ""), - [&](int id) { return (*dict_)[id]; }); - return sout.str(); - } else { - RangeError e; - throw e; - } - } - std::vector getSequence(size_t id) const throw(RangeError) { - if (id < getSize()) { - Path& p = (*path_)[id]; - return p.ids; - } else { - RangeError e; - throw e; - } - } - float getScore(size_t id) const throw(RangeError) { - if (id < getSize()) { - Path& p = (*path_)[id]; - return p.logProb; - } else { - RangeError e; - throw e; - } - } - - private: - std::shared_ptr> path_; - std::shared_ptr> dict_; -}; - -ISequenceResults* SequenceGenerator::generateSequence( - const Arguments& inArgs) const { - auto& in_args = - m->cast>(inArgs.getInternalArgumentsPtr()); - for (auto& arg : in_args) { - arg.sequenceStartPositions = m->feedback.sequenceStartPositions; - } - in_args.push_back(m->feedback); - auto path = std::make_shared>(); - m->findNBest(in_args, *path); - return new PathSequenceResults(path, m->dict); -} - -SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr( - void* ptr) { - SequenceGenerator* r = new SequenceGenerator(); - r->m->machine = r->m->cast>(ptr); - return r; -} - -void SequenceGenerator::setDict(const std::vector& dict) { - *m->dict = dict; -} - -void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; } - -void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; } - -void SequenceGenerator::setMaxLength(size_t maxLength) { - m->maxLength = maxLength; -} - -void SequenceGenerator::setBeamSize(size_t beamSize) { - if (beamSize != -1UL) { - FLAGS_beam_size = beamSize; - } -} - -ISequenceResults::~ISequenceResults() {} diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp deleted file mode 100644 index 795460b65051b4ec0d9772d2503f123c4a6ea3d0..0000000000000000000000000000000000000000 --- a/paddle/api/Trainer.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PaddleAPI.h" -#include "PaddleAPIPrivate.h" - -#include -#include -#include - -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/trainer/ParamUtil.h" -#include "paddle/trainer/Trainer.h" -#include "paddle/trainer/TrainerInternal.h" -#include "paddle/utils/Flags.h" - -using paddle::real; - -DECLARE_string(config); -DECLARE_string(init_model_path); -DECLARE_int32(start_pass); - -struct TrainerPrivate : public paddle::Trainer { - bool _trainOneBatch(size_t batchSize); - bool forwardOneBatch(size_t batchSize); - void forwardOneDataBatch(const std::vector& inArgs); - void setBatchSize(size_t batchSize); - std::vector& getForwardOutput(); - - void startTestPeriod(); - void finishTestPeriod(); - void testOneDataBatch(const paddle::DataBatch& dataBatch); - TrainerPrivate() : paddle::Trainer() {} -}; - -Trainer::Trainer() : m(new TrainerPrivate()) { - auto conf = paddle::TrainerConfigHelper::createFromFlags(); - if (conf != nullptr) { - m->init(conf); - } -} - -Trainer::~Trainer() { delete m; } - -Trainer* Trainer::createByCommandLine() throw(IOError) { - auto retv = new Trainer(); - if (retv->m->getConfig().IsInitialized()) { - return retv; - } else { - throw IOError(); - } -} - -Trainer::Trainer(TrainerConfig* config, GradientMachine* gm) - : m(new TrainerPrivate()) { - m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr); -} - -Trainer* Trainer::create(TrainerConfig* config, - GradientMachine* gm) throw(IOError) { - auto retv = new Trainer(config, gm); - if (retv->m->getConfig().IsInitialized()) { - return retv; - } else { - retv->m->getConfig().CheckInitialized(); - throw IOError(); - } -} - -void Trainer::startTrain() { m->startTrain(); } - -void Trainer::finishTrain() { m->finishTrain(); } - -void Trainer::startTrainPass() { m->startTrainPass(); } - -void Trainer::finishTrainPass() { m->finishTrainPass(); } - -void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) { - paddle::DataBatch dataBatch; - dataBatch.getStreams() = inArgs.m->outputs; - dataBatch.setSize(batchSize); - m->trainOneDataBatch(dataBatch); -} - -bool Trainer::trainOneBatch(size_t batchSize) { - return m->_trainOneBatch(batchSize); -} - -bool TrainerPrivate::_trainOneBatch(size_t batchSize) { - paddle::DataBatch dataBatch; - CHECK(dataProvider_) << "data_provider is not specified"; - int num = dataProvider_->getNextBatch(batchSize, &dataBatch); - if (num == 0) { - return false; - } - trainOneDataBatch(dataBatch); - return false; -} - -void TrainerPrivate::startTestPeriod() { - if (!tester_) { - createTester(); - } - tester_->startTestPeriod(); -} - -void Trainer::startTestPeriod() { m->startTestPeriod(); } - -void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) { - tester_->testOneDataBatch(dataBatch, &forwardOutput_); -} - -void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) { - paddle::DataBatch dataBatch; - dataBatch.getStreams() = args.m->outputs; - dataBatch.setSize(batchSize); - m->testOneDataBatch(dataBatch); -} - -void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); } -void Trainer::finishTestPeriod() { m->finishTestPeriod(); } - -Arguments* Trainer::getLayerOutput(const std::string& layerName) const { - auto nn = this->m->getGradientMachine(); - CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork"; - auto arg = nn->getLayerOutput(layerName); - return Arguments::createByPaddleArgument(&arg); -} - -void Trainer::forwardOneBatch(size_t batchSize) { - 
m->forwardOneBatch(batchSize); -} - -bool TrainerPrivate::forwardOneBatch(size_t batchSize) { - CHECK(dataProvider_) << "data_provider is not specified"; - paddle::DataBatch dataBatch; - int num = dataProvider_->getNextBatch(batchSize, &dataBatch); - if (num == 0) { - return false; - } - - forwardOneDataBatch(dataBatch.getStreams()); - return true; -} - -void TrainerPrivate::forwardOneDataBatch( - const std::vector& inArgs) { - std::vector& outArgs = forwardOutput_; - - if (config_->getOptConfig().use_sparse_remote_updater()) { - trainerInternal_.getGradientMachine()->prefetch(inArgs); - trainerInternal_.getParameterUpdater()->getParametersRemote(); - } - trainerInternal_.getGradientMachine()->forward( - inArgs, &outArgs, paddle::PASS_TEST); -} - -Arguments* Trainer::getForwardOutput() { - return Arguments::createByPaddleArgumentVector(&m->getForwardOutput()); -} - -std::vector& TrainerPrivate::getForwardOutput() { - return forwardOutput_; -} diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp deleted file mode 100644 index 618e87e96459674302d8b468c3ac410e8d3af6a8..0000000000000000000000000000000000000000 --- a/paddle/api/Util.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PaddleAPI.h" - -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Util.h" - -#include -#include -#include - -void initPaddle(int argc, char** argv) { - paddle::initMain(argc, argv); - paddle::initPython(argc, argv); - feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); -} - -FloatArray::FloatArray(const float* b, const size_t l) - : buf(b), length(l), needFree(false) {} - -IntArray::IntArray(const int* b, const size_t l, bool f) - : buf(b), length(l), needFree(f) {} - -IntWithFloatArray::IntWithFloatArray(const float* v, - const int* i, - size_t l, - bool f) - : valBuf(v), idxBuf(i), length(l), needFree(f) {} - -bool isUsingGpu() { return FLAGS_use_gpu; } - -void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; } - -bool isGpuVersion() { -#ifndef PADDLE_WITH_CUDA - return false; -#else - return true; -#endif -} - -int getTrainerCount() { return FLAGS_trainer_count; } - -static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES, - "The Parameter Type should be same in core/api and core/common"); diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp deleted file mode 100644 index e2a7b974ca78ae3e6e0e66c206a40c8811126b53..0000000000000000000000000000000000000000 --- a/paddle/api/Vector.cpp +++ /dev/null @@ -1,304 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PaddleAPI.h" - -#include "paddle/math/Vector.h" - -#include - -struct IVectorPrivate { - paddle::IVectorPtr vec; -}; - -IVector::IVector() : m(new IVectorPrivate()) {} - -IVector* IVector::createZero(size_t sz, bool useGpu) { - auto v = new IVector(); - v->m->vec = paddle::IVector::create(sz, useGpu); - v->m->vec->zeroMem(); - return v; -} - -IVector* IVector::create(const std::vector& data, bool useGpu) { - auto v = new IVector(); - v->m->vec = paddle::IVector::create(data.size(), useGpu); - v->m->vec->copyFrom(data.data(), data.size()); - return v; -} - -IVector* IVector::createVectorFromNumpy(int* data, - int dim, - bool copy, - bool useGpu) throw(UnsupportError) { - if (useGpu) { - /// if use gpu only copy=true is supported - if (!copy) { - throw UnsupportError("Gpu mode only supports copy=True"); - } - return IVector::createGpuVectorFromNumpy(data, dim); - } else { - return IVector::createCpuVectorFromNumpy(data, dim, copy); - } -} - -IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) { - auto v = new IVector(); - if (copy) { - v->m->vec = paddle::IVector::create(dim, false); - v->m->vec->copyFrom(data, dim); - } else { - v->m->vec = paddle::IVector::create(data, dim, false); - } - return v; -} - -IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) { - auto v = new IVector(); - v->m->vec = paddle::IVector::create(dim, true); - v->m->vec->copyFrom(data, dim); - return v; -} - -bool IVector::isGpu() const { - return dynamic_cast(m->vec.get()) != nullptr; -} - -IntArray IVector::getData() const { - if (this->isGpu()) { - int* src = m->vec->getData(); - size_t len = m->vec->getSize(); - int* dest = new int[len]; - hl_memcpy_device2host(dest, src, len * sizeof(int)); - return IntArray(dest, len, true); - } else { - return IntArray(m->vec->getData(), m->vec->getSize()); - } -} - -int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) { - if (this->isGpu()) { - UnsupportError e; - throw e; - } else { - if (idx >= m->vec->getSize()) { - RangeError e; - throw e; - } - } - return m->vec->getData()[idx]; -} - -const int& IVector::operator[](const size_t idx) const - throw(RangeError, UnsupportError) { - return (*const_cast(this))[idx]; -} - -IVector* IVector::createByPaddleVectorPtr(void* ptr) { - auto* p = (paddle::IVectorPtr*)ptr; - if ((*p) != nullptr) { - IVector* vec = new IVector(); - vec->m->vec = *p; - return vec; - } else { - return nullptr; - } -} - -IVector::~IVector() { delete m; } - -void* IVector::getSharedPtr() const { return &m->vec; } - -size_t IVector::getSize() const { return m->vec->getSize(); } - -void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) { - auto v = std::dynamic_pointer_cast(m->vec); - if (v) { - *data = v->getData(); - *dim1 = v->getSize(); - } else { - throw UnsupportError(); - } -} - -void IVector::copyToNumpyArray(int** view_m_data, int* dim1) { - *dim1 = m->vec->getSize(); - *view_m_data = new int[*dim1]; - if (auto cpuVec = dynamic_cast(m->vec.get())) { - std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1)); - } else if (auto gpuVec = 
dynamic_cast(m->vec.get())) { - hl_memcpy_device2host( - *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1)); - } else { - LOG(INFO) << "Unexpected situation"; - } -} - -void IVector::copyFromNumpyArray(int* data, int dim) { - m->vec->resize(dim); - m->vec->copyFrom(data, dim); -} - -struct VectorPrivate { - paddle::VectorPtr vec; - - void safeAccessData(const size_t idx, - const std::function& func) const - throw(RangeError, UnsupportError) { - auto cpuVec = std::dynamic_pointer_cast(vec); - if (cpuVec != nullptr) { - if (idx < vec->getSize()) { - func(vec->getData()[idx]); - } else { - throw RangeError(); - } - } else { - throw UnsupportError(); - } - } -}; - -Vector::Vector() : m(new VectorPrivate()) {} - -Vector::~Vector() { delete m; } - -Vector* Vector::createZero(size_t sz, bool useGpu) { - auto retVec = new Vector(); - retVec->m->vec = paddle::Vector::create(sz, useGpu); - retVec->m->vec->zero(); - return retVec; -} - -Vector* Vector::create(const std::vector& data, bool useGpu) { - auto retVec = new Vector(); - retVec->m->vec = paddle::Vector::create(data.size(), useGpu); - retVec->m->vec->copyFrom(data.data(), data.size()); - return retVec; -} - -Vector* Vector::createByPaddleVectorPtr(void* ptr) { - auto& v = *(paddle::VectorPtr*)(ptr); - if (v == nullptr) { - return nullptr; - } else { - auto retVec = new Vector(); - retVec->m->vec = v; - return retVec; - } -} - -Vector* Vector::createVectorFromNumpy(float* data, - int dim, - bool copy, - bool useGpu) throw(UnsupportError) { - if (useGpu) { - /// if use gpu only copy=True is supported - if (!copy) { - throw UnsupportError("Gpu mode only supports copy=True"); - } - return Vector::createGpuVectorFromNumpy(data, dim); - } else { - return Vector::createCpuVectorFromNumpy(data, dim, copy); - } -} - -Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { - CHECK_GT(dim, 0); - auto retVec = new Vector(); - if (copy) { - retVec->m->vec = paddle::Vector::create((size_t)dim, false); - retVec->m->vec->copyFrom(data, dim); - } else { - retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false); - } - return retVec; -} - -Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) { - CHECK_GT(dim, 0); - auto retVec = new Vector(); - retVec->m->vec = paddle::Vector::create((size_t)dim, true); - retVec->m->vec->copyFrom(data, (size_t)dim); - return retVec; -} - -void Vector::toNumpyArrayInplace(float** view_data, - int* dim1) throw(UnsupportError) { - auto v = std::dynamic_pointer_cast(m->vec); - if (v != nullptr) { - *view_data = v->getData(); - *dim1 = (int)v->getSize(); - } else { - throw UnsupportError(); - } -} - -void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { - *dim1 = m->vec->getSize(); - *view_m_data = new float[*dim1]; - if (auto cpuVec = dynamic_cast(m->vec.get())) { - std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); - } else if (auto gpuVec = dynamic_cast(m->vec.get())) { - hl_memcpy_device2host( - *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1)); - } else { - LOG(INFO) << "Unexpected situation"; - } -} - -void Vector::copyFromNumpyArray(float* data, int dim) { - m->vec->resize(dim); - m->vec->copyFrom(data, dim); -} - -FloatArray Vector::getData() const { - if (this->isGpu()) { - float* src = m->vec->getData(); - size_t len = m->vec->getSize(); - float* dest = new float[len]; - hl_memcpy_device2host(dest, src, len * sizeof(float)); - FloatArray ret_val(dest, len); - ret_val.needFree = true; - return ret_val; - } else { - FloatArray 
ret_val(m->vec->getData(), m->vec->getSize()); - return ret_val; - } -} - -void Vector::copyFrom(Vector* src) throw(RangeError) { - if (src->m->vec->getSize() != m->vec->getSize()) { - throw RangeError(); - } - m->vec->copyFrom(*src->m->vec); -} - -bool Vector::isGpu() const { - return std::dynamic_pointer_cast(m->vec) != nullptr; -} - -float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { - float r; - m->safeAccessData(idx, [&](float& o) { r = o; }); - return r; -} - -void Vector::set(const size_t idx, float val) throw(RangeError, - UnsupportError) { - m->safeAccessData(idx, [&](float& o) { o = val; }); -} - -size_t Vector::getSize() const { return m->vec->getSize(); } - -void* Vector::getSharedPtr() { return &m->vec; } diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp deleted file mode 100644 index 0a289dede65406facf1f1cba584f4330f2569214..0000000000000000000000000000000000000000 --- a/paddle/capi/Main.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "capi_private.h" -#include "main.h" -#include "paddle/trainer/TrainerConfigHelper.h" -#include "paddle/utils/Excepts.h" -#include "paddle/utils/PythonUtil.h" - -static void initPaddle(int argc, char** argv) { - paddle::initMain(argc, argv); - paddle::initPython(argc, argv); -} - -extern "C" { -paddle_error paddle_init(int argc, char** argv) { - static bool isInit = false; - if (isInit) return kPD_NO_ERROR; - - std::vector realArgv; - realArgv.reserve(argc + 1); - realArgv.push_back(strdup("")); - for (int i = 0; i < argc; ++i) { - realArgv.push_back(argv[i]); - } - initPaddle(argc + 1, realArgv.data()); - free(realArgv[0]); - isInit = true; - return kPD_NO_ERROR; -} - -paddle_error paddle_init_thread() { - if (FLAGS_use_gpu) { - hl_init(FLAGS_gpu_id); - } - return kPD_NO_ERROR; -} -} diff --git a/paddle/capi/capi_private.h b/paddle/capi/capi_private.h deleted file mode 100644 index 3332f42a4a6e57fed6ddb20cf7d759d67e7240b5..0000000000000000000000000000000000000000 --- a/paddle/capi/capi_private.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "capi.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Argument.h" -#pragma once - -namespace paddle { -namespace capi { - -enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE }; - -#define STRUCT_HEADER CType type; - -struct CHeader { - STRUCT_HEADER -}; - -struct CIVector { - STRUCT_HEADER - IVectorPtr vec; - - CIVector() : type(kIVECTOR) {} -}; - -struct CMatrix { - STRUCT_HEADER - MatrixPtr mat; - - CMatrix() : type(kMATRIX) {} -}; - -struct CArguments { - STRUCT_HEADER - std::vector args; - - CArguments() : type(kARGUMENTS) {} - - template - paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) { - if (ID >= args.size()) return kPD_OUT_OF_RANGE; - switch (nestedLevel) { - case 0: - callback(args[ID].sequenceStartPositions); - break; - case 1: - callback(args[ID].subSequenceStartPositions); - break; - default: - return kPD_OUT_OF_RANGE; - } - return kPD_NO_ERROR; - } -}; - -struct CGradientMachine { - STRUCT_HEADER - paddle::GradientMachinePtr machine; - - CGradientMachine() : type(kGRADIENT_MACHINE) {} -}; - -template -inline T* cast(void* ptr) { - return reinterpret_cast(ptr); -} -} // namespace capi -} // namespace paddle diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp deleted file mode 100644 index 8c3f504e5a2d807c0cc664af486ebab4a82ddec3..0000000000000000000000000000000000000000 --- a/paddle/capi/gradient_machine.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "gradient_machine.h" -#include "capi_private.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" - -#define cast(v) paddle::capi::cast(v) - -enum GradientMatchineCreateMode { - CREATE_MODE_NORMAL = 0, - CREATE_MODE_TESTING = 4 -}; - -namespace paddle { - -class MyNeuralNetwork : public NeuralNetwork { - public: - MyNeuralNetwork(const std::string& name, NeuralNetwork* network) - : NeuralNetwork(name, network) {} -}; - -NeuralNetwork* newCustomNerualNetwork(const std::string& name, - NeuralNetwork* network) { - return new MyNeuralNetwork(name, network); -} -} // namespace paddle - -extern "C" { -paddle_error paddle_gradient_machine_create_for_inference( - paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) { - if (modelConfigProtobuf == nullptr) return kPD_NULLPTR; - paddle::ModelConfig config; - if (!config.ParseFromArray(modelConfigProtobuf, size) || - !config.IsInitialized()) { - return kPD_PROTOBUF_ERROR; - } - - auto ptr = new paddle::capi::CGradientMachine(); - ptr->machine.reset(paddle::GradientMachine::create( - config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); - *machine = ptr; - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_create_for_inference_with_parameters( - paddle_gradient_machine* machine, void* mergedModel, uint64_t size) { - if (mergedModel == nullptr) return kPD_NULLPTR; - std::istringstream is(std::string(static_cast(mergedModel), size)); - int64_t modelConfigSize = 0; - is.read((char*)(&modelConfigSize), sizeof(modelConfigSize)); - std::string modelConfigProtobuf; - modelConfigProtobuf.resize(modelConfigSize); - is.read(&modelConfigProtobuf[0], modelConfigSize); - paddle::TrainerConfig config; - paddle::ModelConfig modelConfig; - if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) { - if (!modelConfig.ParseFromString(modelConfigProtobuf) || - !modelConfig.IsInitialized()) { - return kPD_PROTOBUF_ERROR; - } - } else { - modelConfig = config.model_config(); - } - auto ptr = new paddle::capi::CGradientMachine(); - ptr->machine.reset(paddle::GradientMachine::create( - modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); - std::vector& parameters = ptr->machine->getParameters(); - for (auto& para : parameters) { - para->load(is); - } - - *machine = ptr; - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) { - delete cast(machine); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_load_parameter_from_disk( - paddle_gradient_machine machine, const char* path) { - auto m = cast(machine); - if (m == nullptr || path == nullptr || m->machine == nullptr) - return kPD_NULLPTR; - m->machine->loadParameters(path); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine, - paddle_arguments inArgs, - paddle_arguments outArgs, - bool isTrain) { - auto m = cast(machine); - auto in = paddle::capi::cast(inArgs); - auto out = paddle::capi::cast(outArgs); - if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr) - return kPD_NULLPTR; - m->machine->forward( - in->args, &out->args, isTrain ? 
paddle::PASS_TRAIN : paddle::PASS_TEST); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_create_shared_param( - paddle_gradient_machine origin, - void* modelConfigProtobuf, - int size, - paddle_gradient_machine* slave) { - auto o = cast(origin); - if (origin == nullptr || slave == nullptr || o->machine == nullptr) { - return kPD_NULLPTR; - } - paddle::ModelConfig config; - if (!config.ParseFromArray(modelConfigProtobuf, size) || - !config.IsInitialized()) { - return kPD_PROTOBUF_ERROR; - } - - std::unique_ptr ptr( - new paddle::capi::CGradientMachine()); - auto nn = paddle::NeuralNetwork::create(config); - nn->init(config, - [&o](int paramId, paddle::Parameter* param) { - auto p = o->machine->getParameters()[paramId]; - param->enableSharedType(paddle::PARAMETER_VALUE, - p->getBuf(paddle::PARAMETER_VALUE)); - }, - {paddle::PARAMETER_VALUE}, - false); - ptr->machine.reset(nn); - *slave = ptr.release(); - return kPD_NO_ERROR; -} -} - -paddle_error paddle_gradient_machine_randomize_param( - paddle_gradient_machine machine) { - auto m = cast(machine); - if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR; - m->machine->randParameters(); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_get_layer_output( - paddle_gradient_machine machine, - const char* layerName, - paddle_arguments args) { - auto m = cast(machine); - auto out = paddle::capi::cast(args); - if (m == nullptr || layerName == nullptr || out == nullptr || - m->machine == nullptr) { - return kPD_NULLPTR; - } - - auto layerOutput = m->machine->getLayerOutput(layerName); - out->args.push_back(layerOutput); - return kPD_NO_ERROR; -} - -paddle_error paddle_gradient_machine_release_layer_output( - paddle_gradient_machine machine) { - auto m = cast(machine); - if (m == nullptr || m->machine == nullptr) { - return kPD_NULLPTR; - } - m->machine->releaseOutput(); - return kPD_NO_ERROR; -} diff --git a/paddle/capi/tests/test_Arguments.cpp b/paddle/capi/tests/test_Arguments.cpp deleted file mode 100644 index bb08adf716bfd6e3c88747616e538e9da89a0e25..0000000000000000000000000000000000000000 --- a/paddle/capi/tests/test_Arguments.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "capi.h" -#include "gtest/gtest.h" -#include "paddle/utils/ThreadLocal.h" - -static std::vector randomBuffer(size_t bufSize) { - auto& eng = paddle::ThreadLocalRandomEngine::get(); - std::uniform_real_distribution dist(-1.0, 1.0); - std::vector retv; - retv.reserve(bufSize); - for (size_t i = 0; i < bufSize; ++i) { - retv.push_back(dist(eng)); - } - return retv; -} - -TEST(CAPIArguments, create) { - //! TODO(yuyang18): Test GPU Code. 
- paddle_arguments args = paddle_arguments_create_none(); - uint64_t size; - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size)); - ASSERT_EQ(0UL, size); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); -} - -TEST(CAPIArguments, value) { - paddle_arguments args = paddle_arguments_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1)); - - paddle_matrix mat = paddle_matrix_create(128, 64, false); - for (size_t i = 0; i < 128; ++i) { - std::vector sampleBuf = randomBuffer(64); - paddle_matrix_set_row(mat, i, sampleBuf.data()); - } - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat)); - - paddle_matrix val = paddle_matrix_create_none(); - - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val)); - - for (size_t i = 0; i < 128; ++i) { - paddle_real* row1; - paddle_real* row2; - - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1)); - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2)); - ASSERT_EQ(row1, row2); - } - - paddle_ivector ivec = paddle_ivector_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec)); - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val)); - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); -} - -TEST(CAPIArguments, ids) { - paddle_arguments args = paddle_arguments_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1)); - - paddle_ivector ivec; - int array[3] = {1, 2, 3}; - ivec = paddle_ivector_create(array, 3, true, false); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec)); - - paddle_ivector val = paddle_ivector_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val)); - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec)); - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); -} - -template -void testSequenceHelper(T1 setter, T2 getter) { - paddle_arguments args = paddle_arguments_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1)); - - paddle_ivector ivec; - int array[3] = {1, 2, 3}; - ivec = paddle_ivector_create(array, 3, true, false); - ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec)); - - paddle_ivector val = paddle_ivector_create_none(); - ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val)); - uint64_t size; - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size)); - - int* rawBuf; - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf)); - for (size_t i = 0; i < size; ++i) { - ASSERT_EQ(array[i], rawBuf[i]); - } - - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec)); - ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); -} - -TEST(CAPIArguments, Sequence) { - auto testSequence = [](uint32_t nestedLevel) { - testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos, - std::placeholders::_1, - std::placeholders::_2, - nestedLevel, - std::placeholders::_3), - std::bind(paddle_arguments_get_sequence_start_pos, - std::placeholders::_1, - std::placeholders::_2, - nestedLevel, - std::placeholders::_3)); - }; - for (uint32_t i = 0; i < 2; ++i) { // test seq and sub-seq. 
- testSequence(i); - } -} diff --git a/paddle/capi/tests/test_GradientMachine.cpp b/paddle/capi/tests/test_GradientMachine.cpp deleted file mode 100644 index 73b9e477b2a2749250e878cf2174dcf4cc599be1..0000000000000000000000000000000000000000 --- a/paddle/capi/tests/test_GradientMachine.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include "capi.h" -#include "paddle/utils/ThreadLocal.h" - -static std::vector randomBuffer(size_t bufSize) { - auto& eng = paddle::ThreadLocalRandomEngine::get(); - std::uniform_real_distribution dist(-1.0, 1.0); - std::vector retv; - retv.reserve(bufSize); - for (size_t i = 0; i < bufSize; ++i) { - retv.push_back(dist(eng)); - } - return retv; -} - -TEST(GradientMachine, testPredict) { - //! TODO(yuyang18): Test GPU Code. - paddle::TrainerConfigHelper config("./test_predict_network.py"); - std::string buffer; - ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer)); - paddle_gradient_machine machine; - - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_create_for_inference( - &machine, &buffer[0], (int)buffer.size())); - std::unique_ptr gm( - paddle::GradientMachine::create(config.getModelConfig())); - ASSERT_NE(nullptr, gm); - gm->randParameters(); - gm->saveParameters("./"); - - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_load_parameter_from_disk(machine, "./")); - - paddle_gradient_machine machineSlave; - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_create_shared_param( - machine, &buffer[0], (int)buffer.size(), &machineSlave)); - std::swap(machineSlave, machine); - paddle_arguments outArgs = paddle_arguments_create_none(); - - paddle_arguments inArgs = paddle_arguments_create_none(); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1)); - paddle_matrix mat = paddle_matrix_create(1, 100, false); - static_assert(std::is_same::value, ""); - - auto data = randomBuffer(100); - paddle_real* rowPtr; - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr)); - memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real)); - - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat)); - ASSERT_EQ(kPD_NO_ERROR, - paddle_gradient_machine_forward(machine, inArgs, outArgs, false)); - - uint64_t sz; - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz)); - ASSERT_EQ(1UL, sz); - - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat)); - std::vector paddleInArgs; - std::vector paddleOutArgs; - paddleInArgs.resize(1); - paddleInArgs[0].value = - paddle::Matrix::create(data.data(), 1, 100, false, false); - - gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST); - - auto matPaddle = paddleOutArgs[0].value; - - uint64_t height, width; - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width)); - ASSERT_EQ(matPaddle->getHeight(), height); - ASSERT_EQ(matPaddle->getWidth(), width); - - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, 
&rowPtr)); - for (size_t i = 0; i < width; ++i) { - ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5); - } - - ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs)); - ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs)); - std::swap(machineSlave, machine); - ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave)); - ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine)); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - std::vector argvs; - argvs.push_back(strdup("--use_gpu=false")); - paddle_init((int)argvs.size(), argvs.data()); - for (auto each : argvs) { - free(each); - } - return RUN_ALL_TESTS(); -} diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt index 70e3a0583d8ecf9db19a85c0978aae0ce0625570..4b19256ef4533a09162edf907f6cd51146517e46 100644 --- a/paddle/contrib/CMakeLists.txt +++ b/paddle/contrib/CMakeLists.txt @@ -14,4 +14,3 @@ # add_subdirectory(inference) -add_subdirectory(tape) diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 0f56d648b1939e1d6af3368bb2423477a3b638fc..c30eff5010748685838feb984c9c817ffcf14c11 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -19,6 +19,9 @@ endif(APPLE) set(inference_deps paddle_inference_api paddle_fluid_api) +if(WITH_GPU AND TENSORRT_FOUND) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) +endif() function(inference_api_test TARGET_NAME) if (WITH_TESTING) @@ -43,6 +46,15 @@ cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +# Here the shared library doesn't depend on other fluid libraries, or double free will occur. +cc_library(paddle_inference_api_shared SHARED + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc) +set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api) +if(NOT APPLE) + set(LINK_FLAGS "-fPIC -fvisibility=hidden") + set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +endif() + cc_test(test_paddle_inference_api SRCS test_paddle_inference_api.cc DEPS paddle_inference_api) @@ -50,17 +62,30 @@ cc_test(test_paddle_inference_api inference_api_test(test_paddle_inference_api_impl ARGS test_word2vec test_image_classification) -if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI +if(WITH_GPU AND TENSORRT_FOUND) +cc_library(paddle_inference_tensorrt_subgraph_engine + SRCS paddle_inference_api_tensorrt_subgraph_engine.cc + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api) + +inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec) +endif() + +if (WITH_ANAKIN) # only needed in CI # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's, # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to # compile the libinference_anakin_api.a and compile with anakin.so. 
- nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) + nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc) target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) target_link_libraries(inference_anakin_api anakin anakin_saber_common) - cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc + target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) + if (WITH_TESTING) + cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin DEPS inference_anakin_api) - target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endif(WITH_TESTING) endif() if(WITH_TESTING)
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt index 7b0fa77ad13c19f177e5b2446bcda6551471e45f..ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36 100644 --- a/paddle/contrib/inference/demo/CMakeLists.txt +++ b/paddle/contrib/inference/demo/CMakeLists.txt @@ -14,3 +14,48 @@ # inference_api_test(simple_on_word2vec ARGS test_word2vec) + +option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF) +if(NOT WITH_INFERENCE_DEMO) + return() +endif() + +set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo") +set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F) + +function(inference_download_test_demo TARGET) + if (NOT WITH_TESTING) + return() + endif() + set(options "") + set(oneValueArgs URL) + set(multiValueArgs SRCS) + cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}") + message(STATUS "inference demo ${test_dir}") + + if(NOT EXISTS "${test_dir}") + message(STATUS "Download ${TARGET} model from ${tests_URL}") + execute_process(COMMAND bash -c "mkdir -p ${test_dir}") + execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}") + execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz") + endif() + + cc_test(${TARGET} SRCS "${tests_SRCS}" + DEPS paddle_inference_api paddle_fluid + ARGS --data=${test_dir}/data.txt + --modeldir=${test_dir}/model + --refer=${test_dir}/result.txt) +endfunction() + +# disable mobilenet test +#inference_download_test_demo(mobilenet_inference_demo +# SRCS vis_demo.cc +# URL ${URL_ROOT}mobilenet.tar.gz) +inference_download_test_demo(se_resnext50_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}se_resnext50.tar.gz) +inference_download_test_demo(ocr_inference_demo + SRCS vis_demo.cc + URL ${URL_ROOT}ocr.tar.gz)
diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f1d256660299a68dc5d9d73dbe4a401a0e7d9680 --- /dev/null +++ b/paddle/contrib/inference/demo/README.md @@ -0,0 +1,36 @@ +# Inference Demos + +Input data format: + +- Each line contains a single record +- Each record's format is + +``` +<space-separated floats as data>\t<space-separated ints as shape> +``` + +Follow the C++ codes in `vis_demo.cc`. 
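To make the record layout concrete, the following small sketch shows how one such line splits into data and shape. It mirrors what `ProcessALine` in `vis_demo.cc` does; the helper name `ParseRecord` and the sample 1x4 record are illustrative only and not part of the demo.

```c++
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Illustrative stand-in for ProcessALine in vis_demo.cc: everything before
// the tab is the flattened float data, everything after it is the int shape.
struct Record {
  std::vector<float> data;
  std::vector<int> shape;
};

Record ParseRecord(const std::string& line) {
  Record record;
  const size_t tab = line.find('\t');
  std::istringstream data_part(line.substr(0, tab));
  std::istringstream shape_part(line.substr(tab + 1));
  for (float f; data_part >> f;) record.data.push_back(f);
  for (int i; shape_part >> i;) record.shape.push_back(i);
  return record;
}

int main() {
  // One record holding four floats with shape [1, 4].
  Record r = ParseRecord("0.1 0.2 0.3 0.4\t1 4");
  std::cout << r.data.size() << " values, " << r.shape.size() << " dims\n";
  return 0;
}
```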
+ +## MobileNet + +To execute the demo, simply run + +```sh +./mobilenet_inference_demo --modeldir --data +``` + +## SE-ResNeXt-50 + +To execute the demo, simply run + +```sh +./se_resnext50_inference_demo --modeldir --data +``` + +## OCR + +To execute the demo, simply run + +```sh +./ocr_inference_demo --modeldir --data +``` diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc index 192a6414260ce06048b8c765402d89882cabc51b..c253014642f39a042430992548a285cc7078a959 100644 --- a/paddle/contrib/inference/demo/simple_on_word2vec.cc +++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include "paddle/contrib/inference/paddle_inference_api.h" + namespace paddle { namespace demo { @@ -40,10 +41,9 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::INT64}; // For simplicity, we set all the slots with the same data. @@ -55,14 +55,12 @@ void Main(bool use_gpu) { //# 4. Get output. ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length; - const size_t num_elements = outputs.front().data.length / sizeof(float); + LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data)[i]; + LOG(INFO) << static_cast(outputs.front().data.data())[i]; } - // TODO(Superjomn): this is should be free automatically - free(outputs[0].data.data); } } @@ -86,10 +84,9 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "", .shape = std::vector({4, 1}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::INT64}; std::vector inputs(4, tensor); std::vector outputs; @@ -99,13 +96,13 @@ void MainThreads(int num_threads, bool use_gpu) { // 4. Get output. ASSERT_EQ(outputs.size(), 1UL); LOG(INFO) << "TID: " << tid << ", " - << "output buffer size: " << outputs.front().data.length; - const size_t num_elements = outputs.front().data.length / sizeof(float); + << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = + outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data)[i]; + LOG(INFO) << static_cast(outputs.front().data.data())[i]; } - free(outputs[0].data.data); } }); } diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/contrib/inference/demo/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1 --- /dev/null +++ b/paddle/contrib/inference/demo/utils.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { +namespace demo { + +static void split(const std::string& str, + char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +/* + * Get a summary of a PaddleTensor content. + */ +static std::string SummaryTensor(const PaddleTensor& tensor) { + std::stringstream ss; + int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype); + + ss << "data[:10]\t"; + switch (tensor.dtype) { + case PaddleDType::INT64: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + case PaddleDType::FLOAT32: + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + return ss.str(); +} + +} // namespace demo +} // namespace paddle diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/contrib/inference/demo/vis_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..45575f9a862de430236ae20cf498e542a45b1f4b --- /dev/null +++ b/paddle/contrib/inference/demo/vis_demo.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo for mobilenet, se-resnext50 and ocr. + */ + +#include +#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. 
+#include +#include +#include +#include "paddle/contrib/inference/demo/utils.h" +#include "paddle/contrib/inference/paddle_inference_api.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif + +namespace paddle { +namespace demo { + +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, + "", + "path of data; each line is a record, format is " + "'\t data; + std::vector shape; +}; + +void split(const std::string& str, char sep, std::vector* pieces); + +Record ProcessALine(const std::string& line) { + LOG(INFO) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + LOG(INFO) << "data size " << record.data.size(); + LOG(INFO) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + LOG(INFO) << "predictor output numel " << numel; + LOG(INFO) << "reference output numel " << refer.data.size(); + EXPECT_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + EXPECT_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + EXPECT_NEAR( + static_cast(output.data.data())[i], refer.data[i], 1e-5); + } + break; + } +} + +/* + * Use the native fluid engine to inference the demo. + */ +void Main(bool use_gpu) { + NativeConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = use_gpu; + config.device = 0; +#ifdef PADDLE_WITH_CUDA + config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use; +#endif + + LOG(INFO) << "init predictor"; + auto predictor = + CreatePaddlePredictor(config); + + LOG(INFO) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. 
+ PaddleTensor input{ + .name = "xx", + .shape = record.shape, + .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), + .dtype = PaddleDType::FLOAT32}; + + LOG(INFO) << "run executor"; + std::vector output; + predictor->Run({input}, &output); + + LOG(INFO) << "output.size " << output.size(); + auto& tensor = output.front(); + LOG(INFO) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); } +#ifdef PADDLE_WITH_CUDA +TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); } +#endif +} // namespace demo +} // namespace paddle
diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/contrib/inference/high_level_api.md new file mode 100644 index 0000000000000000000000000000000000000000..eb92885052a453d8c837bbf6f6e984efb509332a --- /dev/null +++ b/paddle/contrib/inference/high_level_api.md @@ -0,0 +1,60 @@ +# Inference High-level APIs +This document describes the high-level inference APIs; one can use them to deploy a Paddle model for an application quickly. + +The APIs are described in `paddle_inference_api.h`, just one header file, and two libraries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment. + +## PaddleTensor +We provide the `PaddleTensor` data structure to give a general tensor interface. + +The definition is + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector<int> shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +The data is stored in a contiguous memory block, `PaddleBuf`, and a `PaddleDType` specifies the tensor's data type. +The `name` field is used to specify the name of an input variable, +which is important when there are multiple inputs and one needs to distinguish which variable to set. + +## engine +The inference APIs have two different underlying engines: + +- the native engine, which consists of the native operators and framework, +- the Anakin engine, which has an Anakin library embedded. + +The native engine takes a native Paddle model as input and supports any model trained by Paddle; +the Anakin engine is faster for some models, +but it can only take an Anakin model as input (the user needs to convert the format manually first), and currently not all Paddle models are supported. + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. +}; +``` + +## PaddlePredictor and how to create one +The main interface is `PaddlePredictor`, which has the following methods: + +- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)` + - takes `inputs` and fills `output_data`. +- `Clone` to clone a predictor from an existing one, with the model parameters shared. + +There is a factory method to help create a predictor, and the user takes ownership of the returned object. + +```c++ +template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative> +std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); +``` + +By specifying the engine kind and config, one can get a specific implementation. 
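Putting the pieces together, a minimal end-to-end sketch with the native engine could look like the following. It follows the same pattern as `demo/simple_on_word2vec.cc`; the model directory `some/model/dir` and the input shape are placeholders, so treat this as an illustration rather than a drop-in program.

```c++
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"

int main() {
  // Configure the native engine; the model directory is a placeholder.
  paddle::NativeConfig config;
  config.model_dir = "some/model/dir";
  config.use_gpu = false;

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                    paddle::PaddleEngineKind::kNative>(config);

  // Wrap existing host memory in a PaddleBuf; the tensor does not own it.
  int64_t data[4] = {1, 2, 3, 4};
  paddle::PaddleTensor tensor;
  tensor.shape = std::vector<int>({4, 1});
  tensor.data = paddle::PaddleBuf(data, sizeof(data));
  tensor.dtype = paddle::PaddleDType::INT64;

  std::vector<paddle::PaddleTensor> inputs(1, tensor);
  std::vector<paddle::PaddleTensor> outputs;  // buffers are filled by Run.
  predictor->Run(inputs, &outputs);
  return 0;
}
```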
+ +## Reference + +- [paddle_inference_api.h](./paddle_inference_api.h) +- [some demos](./demo) diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/contrib/inference/high_level_api_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..a57f015a4e44d43ee4e475cf606faa6f05e095fa --- /dev/null +++ b/paddle/contrib/inference/high_level_api_cn.md @@ -0,0 +1,87 @@ +# Paddle 预测 API + +为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。 + +预测库包含: + +- 头文件 `paddle_inference_api.h` 定义了所有的接口 +- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a` +- 库文件 `libpaddle_inference_api.so` 或 `libpaddle_inference_api.a` + +下面是详细的一些 API 概念介绍 + +## PaddleTensor + +PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是 + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +- `name` 用于指定输入数据对应的 模型中variable 的名字 (暂时没有用,但会在后续支持任意 target 时启用) +- `shape` 表示一个 Tensor 的 shape +- `data` 数据以连续内存的方式存储在`PaddleBuf` 中,`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存,详细可以参考头文件中相关定义。 +- `dtype` 表示 Tensor 的数据类型 + +## engine + +高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine + +- 原生 engine,由 paddle 原生的 forward operator 组成,可以天然支持所有paddle 训练出的模型, +- Anakin engine,封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle 模型, +- TensorRT mixed engine,用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ,支持所有paddle 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) + +其实现为 + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops. +}; +``` + +## 预测部署过程 + +总体上分为以下步骤 + +1. 用合适的配置创建 `PaddlePredictor` +2. 创建输入用的 `PaddleTensor`,传入到 `PaddlePredictor` 中 +3. 获取输出的 `PaddleTensor` ,将结果取出 + +下面完整演示一个简单的模型,部分细节代码隐去 + +```c++ +#include "paddle_inference_api.h" + +// 创建一个 config,并修改相关设置 +paddle::NativeConfig config; +config.model_dir = "xxx"; +config.use_gpu = false; +// 创建一个原生的 PaddlePredictor +auto predictor = + paddle::CreatePaddlePredictor(config); +// 创建输入 tensor +int64_t data[4] = {1, 2, 3, 4}; +paddle::PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; +// 创建输出 tensor,输出 tensor 的内存可以复用 +std::vector outputs; +// 执行预测 +CHECK(predictor->Run(slots, &outputs)); +// 获取 outputs ... +``` + +编译时,联编 `libpaddle_fluid.a/.so` 和 `libpaddle_inference_api.a/.so` 便可。 + +## 详细代码参考 + +- [inference demos](./demo) +- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc) diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc index d67e1e7667800d6dd00cb8915b0d6dc7c664970b..4fe198ad7d4a752882965e9e7fc460741de53d22 100644 --- a/paddle/contrib/inference/paddle_inference_api.cc +++ b/paddle/contrib/inference/paddle_inference_api.cc @@ -13,3 +13,65 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { + +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + default: + assert(false); + return -1; + } +} + +PaddleBuf::PaddleBuf(PaddleBuf&& other) + : data_(other.data_), + length_(other.length_), + memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } + +PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + // only the buffer with external memory can be copied + assert(!other.memory_owned_); + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + return *this; +} + +void PaddleBuf::Resize(size_t length) { + // Only the owned memory can be reset, the external memory can't be changed. + if (length_ == length) return; + assert(memory_owned_); + Free(); + data_ = new char[length]; + length_ = length; + memory_owned_ = true; +} + +void PaddleBuf::Reset(void* data, size_t length) { + Free(); + memory_owned_ = false; + data_ = data; + length_ = length; +} + +void PaddleBuf::Free() { + if (memory_owned_ && data_) { + assert(length_ > 0); + delete static_cast(data_); + data_ = nullptr; + length_ = 0; + } +} + +} // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h index 77e2d77b6b7fe3eeed865c8de0818d059cfa6c6e..b8ba2d14a5c161d491d838888ea14b776f769f23 100644 --- a/paddle/contrib/inference/paddle_inference_api.h +++ b/paddle/contrib/inference/paddle_inference_api.h @@ -15,12 +15,13 @@ limitations under the License. */ /* * This file contains the definition of a simple Inference API for Paddle. * - * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we * might release another API. */ #pragma once +#include #include #include #include @@ -32,12 +33,38 @@ enum PaddleDType { INT64, }; -struct PaddleBuf { - void* data; // pointer to the data memory. - size_t length; // number of memory bytes. +class PaddleBuf { + public: + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + PaddleBuf& operator=(const PaddleBuf&); + // Do not own the memory. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Own memory. + PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Resize to `length` bytes. + void Resize(size_t length); + // Reset to external memory. + void Reset(void* data, size_t length); + bool empty() const { return length_ == 0; } + void* data() const { return data_; } + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; }; struct PaddleTensor { + PaddleTensor() = default; std::string name; // variable name. std::vector shape; // TODO(Superjomn) for LoD support, add a vector> field if needed. @@ -46,12 +73,12 @@ struct PaddleTensor { }; enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. 
+ kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. // TODO(Superjomn) support following engines latter. // kTensorRT, // Use TensorRT for inference. // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. - // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. }; /* @@ -67,8 +94,9 @@ class PaddlePredictor { // Predict an record. // The caller should be responsible for allocating and releasing the memory of - // `inputs`. `inputs` should be alive until Run returns. caller should be - // responsible for releasing the memory of `output_data`. + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. virtual bool Run(const std::vector& inputs, std::vector* output_data) = 0; @@ -81,8 +109,7 @@ class PaddlePredictor { // The common configs for all the predictors. struct Config { - std::string model_dir; // path to the model directory. - bool enable_engine{false}; // Enable to execute (part of) the model on + std::string model_dir; // path to the model directory. }; }; @@ -103,6 +130,11 @@ struct AnakinConfig : public PaddlePredictor::Config { int max_batch_size{-1}; }; +struct TensorRTConfig : public NativeConfig { + // Determine whether a subgraph will be executed by TRT. + int min_subgraph_size{1}; +}; + // A factory to help create different predictors. // // FOR EXTENSION DEVELOPER: @@ -113,4 +145,7 @@ struct AnakinConfig : public PaddlePredictor::Config { // Similarly, each engine kind should map to a unique predictor implementation. template std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + } // namespace paddle diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc index 5bafc58fa53f7d99de571f66b6224f0f2de66e32..ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc @@ -48,7 +48,7 @@ bool PaddleInferenceAnakinPredictor::Run( auto d_tensor_in_p = executor_.get_in(input.name); float *d_data_p = d_tensor_in_p->mutable_data(); if (cudaMemcpy(d_data_p, - static_cast(input.data.data), + static_cast(input.data.data()), d_tensor_in_p->valid_size() * sizeof(float), cudaMemcpyHostToDevice) != 0) { LOG(ERROR) << "copy data from CPU to GPU error"; @@ -65,8 +65,11 @@ bool PaddleInferenceAnakinPredictor::Run( for (auto &output : *output_data) { auto *tensor = executor_.get_out(output.name); output.shape = tensor->shape(); + if (output.data.length() < tensor->valid_size() * sizeof(float)) { + output.data.Resize(tensor->valid_size() * sizeof(float)); + } // Copy data from GPU -> CPU - if (cudaMemcpy(output.data.data, + if (cudaMemcpy(output.data.data(), tensor->mutable_data(), tensor->valid_size() * sizeof(float), cudaMemcpyDeviceToHost) != 0) { diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc index 1d41a5c73e75723f8614d810eae09ed8cdc8cf2b..f92e9d4190412f5847e353ef1dc0324cad668c9a 100644 --- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc +++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc @@ -37,28 +37,26 @@ TEST(inference, anakin) { float data[1 * 3 * 
224 * 224] = {1.0f}; - PaddleBuf buf{.data = data, .length = sizeof(data)}; PaddleTensor tensor{.name = "input_0", .shape = std::vector({1, 3, 224, 224}), - .data = buf, + .data = PaddleBuf(data, sizeof(data)), .dtype = PaddleDType::FLOAT32}; // For simplicity, we set all the slots with the same data. - std::vector paddle_tensor_feeds(1, tensor); + std::vector paddle_tensor_feeds; + paddle_tensor_feeds.emplace_back(std::move(tensor)); - float data_out[1000]; - - PaddleBuf buf_out{.data = data_out, .length = sizeof(data)}; PaddleTensor tensor_out{.name = "prob_out", .shape = std::vector({1000, 1}), - .data = buf_out, + .data = PaddleBuf(), .dtype = PaddleDType::FLOAT32}; - std::vector outputs(1, tensor_out); + std::vector outputs; + outputs.emplace_back(std::move(tensor_out)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); - float* data_o = static_cast(outputs[0].data.data); + float* data_o = static_cast(outputs[0].data.data()); for (size_t j = 0; j < 1000; ++j) { LOG(INFO) << "output[" << j << "]: " << data_o[j]; } diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc index bda2981a14482e2c4a29773d37b074506cc344b1..b1e5b875981e0142f6970cf6864b7b598743654b 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/paddle_inference_api_impl.cc @@ -89,6 +89,7 @@ bool NativePaddlePredictor::Init( LOG(ERROR) << "fail to load inference model."; return false; } + ctx_ = executor_->Prepare(*inference_program_, 0); executor_->CreateVariables( *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); @@ -119,6 +120,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return false; } for (size_t i = 0; i < feed_target_names_.size(); ++i) { + VLOG(4) << "setting " << i << "-th target"; feed_targets[feed_target_names_[i]] = &feeds[i]; } // get fetch variable @@ -130,14 +132,16 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, } // Run the inference program // if share variables, we need not create variables + VLOG(4) << "Run prepared context"; executor_->RunPreparedContext( ctx_.get(), sub_scope_ != nullptr ? sub_scope_ : scope_.get(), &feed_targets, &fetch_targets, false /* don't create variable eatch time */); + VLOG(4) << "Finish prepared context"; if (!GetFetch(fetchs, output_data)) { - LOG(ERROR) << "fail to get fetchs"; + LOG(ERROR) << "fail to get fetches"; return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; @@ -178,8 +182,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), - inputs[i].data.data, - inputs[i].data.length); + inputs[i].data.data(), + inputs[i].data.length()); feeds->push_back(input); } return true; @@ -241,10 +245,11 @@ bool NativePaddlePredictor::GetFetch( } outputs->at(i).shape = shape; - outputs->at(i).data.length = sizeof(float) * data.size(); - outputs->at(i).data.data = malloc(outputs->at(i).data.length); - std::memcpy( - outputs->at(i).data.data, data.data(), outputs->at(i).data.length); + auto &buffer = outputs->at(i).data; + if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) { + buffer.Resize(sizeof(float) * data.size()); + } + std::memcpy(buffer.data(), data.data(), buffer.length()); outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. 
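Because `GetFetch` above only `Resize()`s an output buffer when it is too small, the same output vector can now be passed to `Run()` repeatedly without any manual `free()`. A small sketch of the intended usage, assuming a predictor and input slots set up as in the earlier examples:

```c++
#include <vector>
#include "paddle/contrib/inference/paddle_inference_api.h"

// Hedged sketch: reuse one output vector across repeated Run() calls.
void RunRepeatedly(paddle::PaddlePredictor* predictor,
                   const std::vector<paddle::PaddleTensor>& slots) {
  std::vector<paddle::PaddleTensor> outputs;  // empty PaddleBufs at first
  for (int step = 0; step < 10; ++step) {
    CHECK(predictor->Run(slots, &outputs));
    // GetFetch() only resizes outputs[i].data when it is too small, so after
    // the first iteration the owned buffers are reused rather than reallocated.
  }
  // No manual free() is needed; each PaddleBuf frees its owned memory in
  // its destructor.
}
```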
} diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h index 86d1db7bcc7567e104cd20c9f767ed4513f611f5..f9ec6f55449fc46b4a44b9563980cb5f8e80a951 100644 --- a/paddle/contrib/inference/paddle_inference_api_impl.h +++ b/paddle/contrib/inference/paddle_inference_api_impl.h @@ -22,9 +22,9 @@ #include "paddle/contrib/inference/paddle_inference_api.h" #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -44,7 +44,7 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; - private: + protected: bool SetFeed(const std::vector &input_datas, std::vector *feeds); bool GetFetch(const std::vector &fetchs, diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..a11396cee91a758e86af2efd9e58b9da68442590 --- /dev/null +++ b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/contrib/inference/paddle_inference_api.h" +#include "paddle/contrib/inference/paddle_inference_api_impl.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { + +using inference::analysis::Argument; +using inference::Singleton; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; + +class TensorRTSubgraphPredictor : public NativePaddlePredictor { + public: + explicit TensorRTSubgraphPredictor(const TensorRTConfig& config) + : NativePaddlePredictor(config), config_(config) {} + + bool Init(const std::shared_ptr& parent_scope) { + VLOG(3) << "Predictor::init()"; + + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. 
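The `Init()` logic above accepts either a model directory or an explicit program/parameter file pair. The sketch below shows the two configuration styles; the field names follow the branches checked in the code (`model_dir` vs. `prog_file` plus `param_file`), and the paths are placeholders.

```c++
#include "paddle/contrib/inference/paddle_inference_api.h"

// Hedged sketch of the two ways Init() can locate a model.
paddle::TensorRTConfig MakeDirConfig() {
  paddle::TensorRTConfig config;
  config.model_dir = "/path/to/model_dir";  // parameters saved as separate files
  return config;
}

paddle::TensorRTConfig MakeFileConfig() {
  paddle::TensorRTConfig config;
  config.prog_file = "/path/to/prog_file";    // the serialized program
  config.param_file = "/path/to/param_file";  // all parameters in one file
  // File names should match what fluid.io.save_inference_model produced.
  return config;
}
```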
+ inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model."; + return false; + } + + // Analyze inference_program + Argument argument; + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(5) << "to prepare executor"; + *inference_program_->Proto() = *argument.transformed_program_desc; + ctx_ = executor_->Prepare(*inference_program_, 0); + + VLOG(5) << "to create variables"; + executor_->CreateVariables( + *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); + + // Get the feed_target_names and fetch_target_names + feed_target_names_ = inference_program_->GetFeedTargetNames(); + fetch_target_names_ = inference_program_->GetFetchTargetNames(); + return true; + } + + private: + TensorRTConfig config_; +}; + +template <> +std::unique_ptr +CreatePaddlePredictor( + const TensorRTConfig& config) { + VLOG(3) << "create TensorRTSubgraphPredictor"; + if (config.use_gpu) { + // 1. GPU memeroy + PADDLE_ENFORCE_GT( + config.fraction_of_gpu_memory, + 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor( + new TensorRTSubgraphPredictor(config)); + if (!dynamic_cast(predictor.get()) + ->Init(nullptr)) { + return nullptr; + } + return std::move(predictor); +} + +} // namespace paddle diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 5d843010e02b09087e6b328428e80fb40eb5bb97..c3649dcb96c77f449d876bef34c4aea7afb31daa 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -27,13 +27,12 @@ namespace paddle { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; - pt.data.data = t->data(); if (t->type() == typeid(int64_t)) { - pt.data.length = t->numel() * sizeof(int64_t); + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; } else if (t->type() == typeid(float)) { - pt.data.length = t->numel() * sizeof(float); + pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { LOG(FATAL) << "unsupported type."; @@ -79,8 +78,8 @@ void MainWord2Vec(bool use_gpu) { std::vector outputs; ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_EQ(outputs.size(), 1UL); - size_t len = outputs[0].data.length; - float* data = static_cast(outputs[0].data.data); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); for (size_t j = 0; j < len / sizeof(float); ++j) { ASSERT_LT(data[j], 1.0); ASSERT_GT(data[j], -1.0); @@ -103,8 +102,6 @@ void MainWord2Vec(bool use_gpu) { EXPECT_LT(lod_data[i] - data[i], 1e-3); EXPECT_GT(lod_data[i] - data[i], -1e-3); } - - free(outputs[0].data.data); } void 
MainImageClassification(bool use_gpu) { @@ -143,13 +140,12 @@ void MainImageClassification(bool use_gpu) { std::vector outputs; ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); ASSERT_EQ(outputs.size(), 1UL); - size_t len = outputs[0].data.length; - float* data = static_cast(outputs[0].data.data); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); float* lod_data = output1.data(); for (size_t j = 0; j < len / sizeof(float); ++j) { EXPECT_NEAR(lod_data[j], data[j], 1e-3); } - free(data); } void MainThreadsWord2Vec(bool use_gpu) { @@ -192,8 +188,8 @@ void MainThreadsWord2Vec(bool use_gpu) { // check outputs range ASSERT_EQ(local_outputs.size(), 1UL); - const size_t len = local_outputs[0].data.length; - float* data = static_cast(local_outputs[0].data.data); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); for (size_t j = 0; j < len / sizeof(float); ++j) { ASSERT_LT(data[j], 1.0); ASSERT_GT(data[j], -1.0); @@ -205,7 +201,6 @@ void MainThreadsWord2Vec(bool use_gpu) { for (int i = 0; i < refs[tid].numel(); ++i) { EXPECT_NEAR(ref_data[i], data[i], 1e-3); } - free(data); }); } for (int i = 0; i < num_jobs; ++i) { @@ -251,14 +246,13 @@ void MainThreadsImageClassification(bool use_gpu) { // check outputs correctness ASSERT_EQ(local_outputs.size(), 1UL); - const size_t len = local_outputs[0].data.length; - float* data = static_cast(local_outputs[0].data.data); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); float* ref_data = refs[tid].data(); - EXPECT_EQ(refs[tid].numel(), len / sizeof(float)); + EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); for (int i = 0; i < refs[tid].numel(); ++i) { EXPECT_NEAR(ref_data[i], data[i], 1e-3); } - free(data); }); } for (int i = 0; i < num_jobs; ++i) { diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..b100630dbe412ca811f1a8f2b8191356f5ebec2f --- /dev/null +++ b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/contrib/inference/paddle_inference_api.h" + +namespace paddle { + +DEFINE_string(dirname, "", "Directory of the inference model."); + +void Main(bool use_gpu) { + //# 1. Create PaddlePredictor with a config. + TensorRTConfig config; + config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto predictor = + CreatePaddlePredictor(config); + + for (int batch_id = 0; batch_id < 3; batch_id++) { + //# 2. Prepare input. 
+ int64_t data[4] = {1, 2, 3, 4}; + + PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = PaddleBuf(data, sizeof(data)), + .dtype = PaddleDType::INT64}; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + + //# 3. Run + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + //# 4. Get output. + ASSERT_EQ(outputs.size(), 1UL); + LOG(INFO) << "output buffer size: " << outputs.front().data.length(); + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << static_cast(outputs.front().data.data())[i]; + } + } +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } + +} // namespace paddle \ No newline at end of file diff --git a/paddle/contrib/tape/CMakeLists.txt b/paddle/contrib/tape/CMakeLists.txt deleted file mode 100644 index 5450359d859de93ca19c56422f1243c7f445aff7..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -if(APPLE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") -endif(APPLE) - -cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES} device_context framework_proto proto_desc operator) -cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable) - -cc_test(test_tape - SRCS test_tape.cc - DEPS tape tape_variable) diff --git a/paddle/contrib/tape/README.md b/paddle/contrib/tape/README.md deleted file mode 100644 index 16c22a45d59664e44c83923371c0f0d957a8ca7f..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/README.md +++ /dev/null @@ -1,252 +0,0 @@ -# Dynamic Graph on Fluid - -PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very -challenging and we are still way from there. DyNet and PyTorch provide a good design -idea, the *tape*, that significantly eases the challenge. Also, DyNet provides -a C++ API that is as convenient as Python but with higher efficiency and could -conveniently integrate with industrial/production systems. This package, `tape`, -combines the good of - -1. tape from PyTorch and DyNet -2. C++ API and core from DyNet -3. rich set of operators from PaddlePaddle - -## Overview - -We can implement Dynet-like Tape(See this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md)) -by wrapping Paddle Fluid's `Operator` and `Variable`. - -The user API is straight forward since - -1. it is imperative. And it uses host language's control flow logic. -1. it avoids extra concepts such as `Scope` and `Executor`. - -All of these benefits come at the cost of just adding one line `reset_global_tape` -at every iteration. - -## Code Structure - -In short, the `Tape` contains a vector of `OpHandle`s. 
And an `OpHandle` contains its -`type`, the pointers to the `Variable`s, and necessary attributes. - -```c++ -class Variable { -public: - VriableHandle Grad(); // returns its gradient variable -private: - framework::VarDesc desc_; // compile time infershape, necessary for lazy execution - framework::Variable var_; // run time variable, holds data memory -}; - -using VariableHandle = shared_ptr; - -struct OpHandle { - string type_; - map> inputs_; - map> outputs_; - AttributeMap attrs_; -}; - -class Tape { -public: - void AddOp(OpHandle); // add op - void Forward(); // execute the tape_ - void Backward(); // execute the backward of the tape_ -private: - vector tape_; -}; -``` - -We uses `Function` to indicate layers. It takes care of parameter -initialization and `AddOp` to the Tape when it is called. - -```c++ -class Linear { - public: - Linear(int in_dim, int out_dim, const std::string &act) - : w_(new Variable("LinearWeight")), - b_(new Variable("LinearBias")), - act_(act) { - Tape init_tape; - - std::string initializer = "fill_constant"; - framework::AttributeMap attrs; - attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{in_dim, out_dim}; - attrs["value"] = 1.0f; - init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs); - - attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{out_dim}; - attrs["value"] = 1.0f; - init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs); - - init_tape.Forward(); - } - - VariableHandle operator()(VariableHandle input) { - VariableHandle pre_bias(new Variable("linear")); - get_global_tape().AddOp("mul", - {{"X", {input}}, {"Y", {w_}}}, - {{"Out", {pre_bias}}}, - {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}}); - VariableHandle pre_act(new Variable("linear")); - get_global_tape().AddOp("elementwise_add", - {{"X", {pre_bias}}, {"Y", {b_}}}, - {{"Out", {pre_act}}}, - {{"axis", 1}}); - VariableHandle post_act(new Variable("linear")); - get_global_tape().AddOp(act_, - {{"X", {pre_act}}}, - {{"Out", {post_act}}}, - {}); - return post_act; - } - - std::vector Params() { return {w_, b_}; } - - private: - VariableHandle w_; - VariableHandle b_; - std::string act_; -}; -``` - -## User API - -```c++ -// Model function -paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias -paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias -paddle::tape::Mean mean; - -// Optimizer -paddle::tape::SGD sgd(0.001); - -// Data Feeder -paddle::tape::Fill data_feeder(...); -VariableHandle input(new paddle::tape::Variable("input")); -VariableHandle label(new paddle::tape::Variable("label")); - -for (int i = 0; i < 2; ++i) { - reset_global_tape(); - - data_feeder(input, label); - - auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType - LOG(INFO) << loss.value(); // Run forward up to loss - - // Run backward, store gradient of w at w->Grad() - get_global_tape.Backward(loss); - - // Update w - sgd(linear1.Params()); - sgd(linear2.Params()); -} -``` - -
- -digraph G { - - subgraph cluster_0 { - node [shape=record,style=filled]; - style=filled; - color=lightgrey; - linear1 [label="{type: mul | {input | {X: before_mul1 | Y: weight1}} | {output | Out: before_bias1}}"]; - elementwise_add1 [label="{type: elementwise_add | {input | {X: before_bias1 | Y: bias1}} | {output | Out: before_act1}}"]; - relu1 [label="{type: relu | {input | {X: before_act1 }} | {output | Out: after_act1}}"]; - - linear1 -> elementwise_add1->relu1; - label = "forward tape"; - } - - linear1:before_mul1->before_mul1 - linear1:weight1->weight1 - linear1:before_bias1->before_bias1 - - elementwise_add1:bias1->bias1 - elementwise_add1:before_bias1->before_bias1 - elementwise_add1:before_act1->before_act1 - - relu1:before_act1->before_act1 - relu1:after_act1->after_act1 - - subgraph cluster_1 { - node [shape=record,style=filled]; - style=filled; - color=lightgrey; - linear1_grad [label="{type: mul_grad | {input | {X: before_mul1 | Y: weight1| Out_grad: before_bias1_grad}} | {output |{X_grad: before_mul1_grad | Y_grad: weight1_grad}}}"]; - - elementwise_add1_grad [label="{type: elementwise_add_grad | {input | Out_grad: before_act1_grad} | {output |{X_grad: before_bias1_grad | Y_grad: bias1_grad}}}"]; - - relu1_grad [label="{type: relu_grad | {input | Out_grad: after_act1_grad} | {ouput | {X_grad: before_act1_grad }}}"]; - - linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back]; - label = "backward tape"; - } - - relu1_grad:after_act1_grad->after_act1_grad - relu1_grad:before_act1_grad->before_act1_grad - - elementwise_add1_grad:before_act1_grad->before_act1_grad - elementwise_add1_grad:before_bias1_grad->before_bias1_grad - elementwise_add1_grad:bias1_grad->bias1_grad - - linear1_grad:before_mul1->before_mul1 - linear1_grad:weight1->weight1 - linear1_grad:before_bias1_grad->before_bias1_grad - linear1_grad:before_mul1_grad->before_mul1_grad - linear1_grad:weight1_grad->weight1_grad - - - subgraph cluster_2 { - node [shape=record]; - label = "Linear1"; - weight1 - bias1 - } - - weight1 -> weight1_grad [ label="Grad()", style="dashed" ]; - bias1 -> bias1_grad [ label="Grad()", style="dashed"]; - - - -} -
- -![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png) - -## Code Reuse - -We want to stay close to Paddle Fluid as much as possible. - -### Reuse All Operators - -As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function` -is about 10 lines of code, similar to expose an operator to Python. - -### Reuse Compile Time InferShape and InferVarType - -Note that all the symbolic information is stored at `tape::Varaible::desc_`, instead -of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and -`InferVarType` every time we `AddOp` to the tape. - -### Reuse Operator::Run - -We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary -`Scope` for every `Operator::Run()`. - -## Possible Feature - -### Release Memory on Backward - -We can release memory aggressively. During backward, we can delete the OpHandle once -we have finished its backward. Since all the variable is managed by smart pointer, the -memory is automatically released when its `ref_count` goes to 0. - -### Kernel Fusion - -As a symbolic representation of the Tape is constructed first before the actual -execution, it would be possible to perform graph optimization. One use case is kernel -fusion. diff --git a/paddle/contrib/tape/computation_graph.png b/paddle/contrib/tape/computation_graph.png deleted file mode 100644 index 6cf5ead735d5d18b204b079771e53d44483cf016..0000000000000000000000000000000000000000 Binary files a/paddle/contrib/tape/computation_graph.png and /dev/null differ diff --git a/paddle/contrib/tape/function.h b/paddle/contrib/tape/function.h deleted file mode 100644 index 8c9694d9a21b5948361164eab60a663ec4fd3803..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/function.h +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "paddle/contrib/tape/tape.h" -#include "paddle/contrib/tape/variable.h" -#include "paddle/fluid/framework/type_defs.h" - -namespace paddle { -namespace tape { - -class Function {}; - -class Fill { - public: - Fill(const std::string &initializer, const framework::AttributeMap &attrs) - : initializer_(initializer), attrs_(attrs) {} - - void operator()(VariableHandle var) { - get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_); - } - - private: - const std::string initializer_; - const framework::AttributeMap attrs_; -}; - -class Mean { - public: - VariableHandle operator()(VariableHandle var) { - VariableHandle out(new Variable("mean")); - get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {}); - return out; - } -}; - -class Linear { - public: - Linear(int in_dim, int out_dim, const std::string &act) - : w_(new Variable("LinearWeight")), - b_(new Variable("LinearBias")), - act_(act) { - Tape init_tape; - - std::string initializer = "fill_constant"; - framework::AttributeMap attrs; - attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{in_dim, out_dim}; - attrs["value"] = 1.0f; - init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs); - - attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{out_dim}; - attrs["value"] = 1.0f; - init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs); - - init_tape.Forward(); - } - - VariableHandle operator()(VariableHandle input) { - VariableHandle pre_bias(new Variable("linear")); - get_global_tape().AddOp("mul", - {{"X", {input}}, {"Y", {w_}}}, - {{"Out", {pre_bias}}}, - {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}}); - VariableHandle pre_act(new Variable("linear")); - get_global_tape().AddOp("elementwise_add", - {{"X", {pre_bias}}, {"Y", {b_}}}, - {{"Out", {pre_act}}}, - {{"axis", 1}}); - VariableHandle post_act(new Variable("linear")); - get_global_tape().AddOp( - act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {}); - return post_act; - } - - std::vector Params() { return {w_, b_}; } - - private: - VariableHandle w_; - VariableHandle b_; - std::string act_; -}; - -class SGD { - public: - SGD(float learning_rate) : learning_rate_(new Variable("sgd")) { - Tape init_tape; - - std::string initializer = "fill_constant"; - framework::AttributeMap attrs; - attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{1}; - attrs["value"] = learning_rate; - init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs); - - init_tape.Forward(); - } - - void operator()(VariableHandle input) { - PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(), - "optimization must happen after the backward"); - Tape temp_tape; - temp_tape.AddOp("sgd", - {{"Param", {input}}, - {"LearningRate", {learning_rate_}}, - {"Grad", {input->Grad()}}}, - {{"ParamOut", {input}}}, - {}); - temp_tape.Forward(); - } - - private: - VariableHandle learning_rate_; -}; -} -} diff --git a/paddle/contrib/tape/tape.cc b/paddle/contrib/tape/tape.cc deleted file mode 100644 index 531499b6fe02abf200b7d4401494fd6350646622..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/tape.cc +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/contrib/tape/tape.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/pybind/pybind.h" - -namespace paddle { -namespace tape { - -// borrowed from -// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c -inline bool ends_with(std::string const &value, std::string const &ending) { - if (ending.size() > value.size()) return false; - return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); -} - -std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) { - os << var_desc.Name(); - os << "[" << var_desc.GetType() << "]"; - os << "[" << var_desc.GetDataType() << "]"; - os << "{"; - for (auto &i : var_desc.GetShape()) { - os << i << ","; - } - os << "}"; - return os; -} - -std::string to_string(const std::string &type, - const VariableHandleMap &in_vars, - const VariableHandleMap &out_vars, - const framework::AttributeMap &attrs) { - std::stringstream ss; - ss << type << " "; - for (auto ¶m_name : in_vars) { - for (auto &var : param_name.second) { - ss << param_name.first << ":(" << var->Desc() << ") "; - } - } - for (auto ¶m_name : out_vars) { - for (auto &var : param_name.second) { - ss << param_name.first << ":(" << var->Desc() << ") "; - } - } - return ss.str(); -} - -framework::OpDesc CreateOpDesc(const std::string &type, - const VariableHandleMap &in_vars, - const VariableHandleMap &out_vars, - const framework::AttributeMap &attrs) { - framework::VariableNameMap inputs; - for (auto ¶m_name : in_vars) { - for (auto &var : param_name.second) { - inputs[param_name.first].emplace_back(var->Name()); - } - } - framework::VariableNameMap outputs; - for (auto ¶m_name : out_vars) { - for (auto &var : param_name.second) { - outputs[param_name.first].emplace_back(var->Name()); - } - } - return framework::OpDesc(type, inputs, outputs, attrs); -} - -void InferShapeAndVarType(const std::string &type, - const VariableHandleMap &in_vars, - VariableHandleMap *out_vars, - const framework::AttributeMap &attrs) { - framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs); - - // Create a temporary block for compile-time - framework::ProgramDesc program_desc; - framework::BlockDesc *block_desc = program_desc.MutableBlock(0); - PADDLE_ENFORCE(block_desc); - - for (auto ¶m_name : in_vars) { - for (auto &var : param_name.second) { - *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto(); - } - } - for (auto ¶m_name : *out_vars) { - for (auto &var : param_name.second) { - *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto(); - } - } - - LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs); - op_desc.InferShape(*block_desc); - op_desc.InferVarType(block_desc); - for (auto ¶m_name : *out_vars) { - for (auto &var : param_name.second) { - 
*var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto(); - } - } - LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs); -} - -void Tape::AddOp(const std::string &type, - const VariableHandleMap &in_vars, - VariableHandleMap out_vars, - const framework::AttributeMap &attrs) { - InferShapeAndVarType(type, in_vars, &out_vars, attrs); - tape_.emplace_back(type, in_vars, out_vars, attrs); -} - -// Temporary Scope for Operator::Run() -class ScopeWrapper : public framework::Scope { - public: - ScopeWrapper(const VariableHandleMap &in_vars, - const VariableHandleMap &out_vars) { - for (auto &v : in_vars) { - for (auto &vv : v.second) { - if (!vars_.count(vv->Name())) { - vars_[vv->Name()].reset(vv->Var()); - } - } - } - for (auto &v : out_vars) { - for (auto &vv : v.second) { - if (!vars_.count(vv->Name())) { - vars_[vv->Name()].reset(vv->Var()); - } - } - } - } - - ~ScopeWrapper() { - for (auto &pair : vars_) { - pair.second.release(); - } - } -}; - -void Tape::Forward() { - LOG(INFO) << "Starting forward -------------------------"; - PADDLE_ENFORCE(!has_been_backwarded_); - while (current_position_ < tape_.size()) { - OpHandle &op = tape_[current_position_]; - - // Create Output Tensor, this is only necessary for OpWithKernel - for (auto ¶m2var : op.outputs_) { - for (auto &var : param2var.second) { - var->InitializeVariable(); - } - } - - framework::OpDesc op_desc = - CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_); - ScopeWrapper scope(op.inputs_, op.outputs_); - framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace()); - current_position_++; - } - - LOG(INFO) << "Finishing forward -------------------------"; -} - -void Tape::Backward(VariableHandle target) { - PADDLE_ENFORCE(!has_been_backwarded_); - - Forward(); - - // TODO(tonyyang-svail): check output of last op is target - backward_tape_.reset(new Tape()); - - framework::AttributeMap attrs; - - // FIXME(tonyyang-svail): Need to infer_data_type - attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{1}; - attrs["value"] = 1.0f; - backward_tape_->AddOp( - "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs); - - for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) { - framework::OpDesc op_desc = - CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_); - std::unordered_map grad_to_var; - std::vector> grad_op_descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, {}, &grad_to_var, {}); - - for (auto &op_desc : grad_op_descs) { - std::unordered_map name2var; - for (auto ¶m2vars : it->inputs_) { - for (auto &a : param2vars.second) { - name2var[a->Name()] = a; - } - } - for (auto ¶m2vars : it->outputs_) { - for (auto &a : param2vars.second) { - name2var[a->Name()] = a; - } - } - - VariableHandleMap in_vars; - VariableHandleMap out_vars; - std::map - loop_over{{&op_desc->Inputs(), &in_vars}, - {&op_desc->Outputs(), &out_vars}}; - for (auto &each : loop_over) { - auto &vmp = *each.first; - auto &vhm = *each.second; - for (auto &p2a : vmp) { - for (auto &argu : p2a.second) { - if (name2var.count(argu)) { - vhm[p2a.first].push_back(name2var[argu]); - } else { - PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix), - argu.c_str()); - std::string name = argu.substr( - 0, argu.size() - std::strlen(framework::kGradVarSuffix)); - PADDLE_ENFORCE(name2var.count(name), name.c_str()); - vhm[p2a.first].push_back(name2var[name]->Grad()); - } - } - } - } - - backward_tape_->AddOp( - op_desc->Type(), 
in_vars, out_vars, op_desc->GetAttrMap()); - } - - // TODO(tonyyang-svail): how to fill empty grad? - // TODO(tonyyang-svail): Sum var grad is necessary - } - - backward_tape_->Forward(); - has_been_backwarded_ = true; -} - -Tape &get_global_tape() { - static Tape T; - return T; -} - -void reset_global_tape() { get_global_tape() = Tape(); } -} -} diff --git a/paddle/contrib/tape/tape.h b/paddle/contrib/tape/tape.h deleted file mode 100644 index ed79de17a7fca58a2c542831560f0dd5ad34f960..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/tape.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include -#include -#include - -#include "paddle/contrib/tape/variable.h" - -namespace paddle { -namespace tape { - -using VariableHandleMap = std::map>; - -struct OpHandle { - OpHandle(const std::string &type, - const VariableHandleMap &in_vars, - const VariableHandleMap &out_vars, - const framework::AttributeMap &attrs) - : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {} - - std::string type_; - VariableHandleMap inputs_; - VariableHandleMap outputs_; - framework::AttributeMap attrs_; -}; - -class Tape { - public: - void AddOp(const std::string &type, - const VariableHandleMap &in_vars, - VariableHandleMap out_vars, - const framework::AttributeMap &attrs); - void Forward(); - void Backward(VariableHandle target); - - bool HasBeenBackwarded() { return has_been_backwarded_; } - - private: - bool has_been_backwarded_ = false; - size_t current_position_ = 0; - - std::vector tape_; - std::shared_ptr backward_tape_; -}; - -Tape &get_global_tape(); - -void reset_global_tape(); -} -} diff --git a/paddle/contrib/tape/test_tape.cc b/paddle/contrib/tape/test_tape.cc deleted file mode 100644 index e9bfd21a7189c5867a52d2b25db09a462d5c7ba7..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/test_tape.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "gtest/gtest.h" -#include "paddle/contrib/tape/function.h" - -using namespace paddle::tape; - -TEST(Tape, TestMLP) { - LOG(INFO) << "TestMLP"; - Linear linear1(3, 3, "relu"); - Linear linear2(3, 3, "relu"); - Mean mean; - - SGD sgd(0.001); - - std::string initializer = "fill_constant"; - paddle::framework::AttributeMap attrs; - attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32; - attrs["shape"] = std::vector{3, 3}; - attrs["value"] = 1.0f; - Fill filler(initializer, attrs); - - for (int i = 0; i < 2; ++i) { - reset_global_tape(); - - VariableHandle input(new Variable("input")); - filler(input); - - auto loss = mean(linear2(linear1(input))); - - get_global_tape().Backward(loss); - - for (auto w : linear1.Params()) { - sgd(w); - } - for (auto w : linear2.Params()) { - sgd(w); - } - } -} - -int main(int argc, char** argv) { - std::vector places; - places.emplace_back(paddle::platform::CPUPlace()); - paddle::platform::DeviceContextPool::Init(places); - - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/contrib/tape/variable.cc b/paddle/contrib/tape/variable.cc deleted file mode 100644 index 5ec1612909503f666bca0fce3246002879854156..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/variable.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/contrib/tape/variable.h" - -namespace paddle { -namespace tape { - -void Variable::InitializeVariable() { - LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType(); - framework::proto::VarType::Type var_type = desc_.GetType(); - if (var_type == framework::proto::VarType::LOD_TENSOR) { - var_.GetMutable(); - } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { - var_.GetMutable(); - } else { - PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]", - var_type); - } -} -} -} diff --git a/paddle/contrib/tape/variable.h b/paddle/contrib/tape/variable.h deleted file mode 100644 index 35c328e69c9ebe25e907a59e4d67b999aff1d876..0000000000000000000000000000000000000000 --- a/paddle/contrib/tape/variable.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#pragma once - -#include - -#include "paddle/fluid/framework/operator.h" // framework::kGradVarSuffix -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/variable.h" - -namespace paddle { -namespace tape { - -class Variable; -using VariableHandle = std::shared_ptr; - -/* - * Combination of - * framework::VarDesc desc_; - * framework::Variable var_; - */ -class Variable { - public: - Variable(const std::string pre_fix) - : desc_(pre_fix + std::to_string(count())) {} - - Variable(const std::string pre_fix, bool is_grad) - : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix - : std::to_string(count()))) {} - - ~Variable() { LOG(INFO) << "Deleting " << Name(); } - - // Instantiate LoDTensor/SelectedRow - void InitializeVariable(); - - VariableHandle Grad() { - if (grad_.expired()) { - VariableHandle new_grad(new Variable(desc_.Name(), true)); - grad_ = new_grad; - return new_grad; - } else { - return VariableHandle(grad_); - } - } - - // Stochastic Gradient Descent with Momentum - // VariableHandle Momentum (); - - // void init(const std::string& initializer, - // const framework::AttributeMap& attrs); - - // void value() {}; - - const framework::VarDesc& Desc() const { return desc_; } - framework::VarDesc* MutableDesc() { return &desc_; } - - // TODO(tonyyang-svail): No need to expose name - std::string Name() const { return desc_.Name(); } - - framework::Variable* Var() { return &var_; } - - private: - int count() { - static int counter = 0; - return counter++; - } - - framework::VarDesc desc_; - framework::Variable var_; - - std::weak_ptr grad_; -}; -} -} diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h deleted file mode 100644 index 77f5d82dbe2cad183491033736bac85961b6d320..0000000000000000000000000000000000000000 --- a/paddle/cuda/include/hl_base.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#ifdef PADDLE_TYPE_DOUBLE -#define HL_FLOAT_MAX 3.40282347e+38F -#define HL_FLOAT_MIN 1.17549435e-38F -using real = double; -#else -#define HL_FLOAT_MAX 1.7976931348623157e+308 -#define HL_FLOAT_MIN 2.2250738585072014e-308 -using real = float; -#endif - -/** - * The maximum input value for exp, used to avoid overflow problem. - * currently only used for tanh function. - */ -#define EXP_MAX_INPUT 40.0 - -/** - * @brief DIVUP(x, y) is similar to ceil(x / y). - * @note For CUDA, DIVUP will be used to specify - * the size of blockDim. - */ -#ifndef DIVUP -#define DIVUP(x, y) (((x) + (y)-1) / (y)) -#endif - -/** - * HPPL is an internal high performance parallel computing library - * for high-level neural network routines, which can support many - * heterogeneous compute architectures, such as GPU, FPGA, etc. - */ - -/** - * @brief HPPL CUDA Stream. - * - * @note Each thread can use HPPL_STREAM_* after calling hl_init. - * HPPL_STREAM_DEFAULT is HPPL default stream. 
- */ -typedef enum { - HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/ - HPPL_STREAM_1 = 1, - HPPL_STREAM_2 = 2, - HPPL_STREAM_3 = 3, - HPPL_STREAM_4 = 4, - HPPL_THREAD_STREAM_1 = 5, - HPPL_THREAD_STREAM_2 = 6, - HPPL_THREAD_STREAM_3 = 7, - HPPL_THREAD_STREAM_4 = 8, - HPPL_STREAM_END -} hl_stream_t; - -/** - * @brief HPPL activation mode. - */ -typedef enum { - HL_ACTIVATION_SIGMOID = 0, - HL_ACTIVATION_RELU = 1, - HL_ACTIVATION_TANH = 2, - HL_ACTIVATION_LINEAR = 3, - HL_ACTIVATION_END -} hl_activation_mode_t; - -/** - * @brief Transpose type. - */ -typedef enum { - HPPL_OP_N = 0, /* transpose */ - HPPL_OP_T = 1, /* non transpose */ - HPPL_OP_END -} hl_trans_op_t; - -/** - * @brief Lstm value. - * - * @param gateValue input value. - * @param prevStateValue previous state value. - * @param stateValue state value. - * @param stateActiveValue state active value. - * @param outputValue output value. - */ -typedef struct { - real *gateValue; - real *prevStateValue; - real *stateValue; - real *stateActiveValue; - real *outputValue; - real *checkIg; - real *checkFg; - real *checkOg; -} hl_lstm_value; - -/** - * @brief Lstm gradient. - * - * @param gateGrad input gradient. - * @param prevStateGrad previous state gradient. - * @param stateGrad state gradient. - * @param stateActiveGrad state active gradient. - * @param outputGrad output gradient. - */ -typedef struct { - real *gateGrad; - real *prevStateGrad; - real *stateGrad; - real *stateActiveGrad; - real *outputGrad; - real *checkIgGrad; - real *checkFgGrad; - real *checkOgGrad; -} hl_lstm_grad; - -/** - * @brief Gru value. - * - * @param gateWeight gate weight (updateGate + resetGate). - * @param stateWeight frame state weight. - * @param gateValue gate value results. - * @param resetOutputValue resetOutput value. - * @param outputValue output value. - * @param prevOutValue previous output value. - * - */ -typedef struct { - real *gateWeight; - real *stateWeight; - real *gateValue; - real *resetOutputValue; - real *outputValue; - real *prevOutValue; -} hl_gru_value; - -/** - * @brief Gru gradient. - * - * @param gateWeightGrad gate weight gradient. - * @param stateWeightGrad frame state weight gradient. - * @param gateGrad gate gradient results. - * @param resetOutputGrad resetOutput gradient. - * @param outputGrad output gradient. - * @param prevOutGrad previous output gradient. - */ -typedef struct { - real *gateWeightGrad; - real *stateWeightGrad; - real *gateGrad; - real *resetOutputGrad; - real *outputGrad; - real *prevOutGrad; -} hl_gru_grad; - -/** - * @brief Sparse matrix value type. - */ -typedef enum { - HL_NO_VALUE = 0, /* matrix values only 0 or 1 */ - HL_FLOAT_VALUE = 1, - HL_VALUE_END -} hl_matrix_value_t; - -/** - * @brief HPPL matrix format. - */ -typedef enum { - HL_SPARSE_CSR = 0, - HL_SPARSE_CSC = 1, - HL_SPARSE_END -} hl_matrix_format_t; - -typedef struct _hl_matrix_s *hl_matrix_s; - -/** - * @brief HPPL sparse matrix. - * - * @param matrix sparse matrix. - * @param format matrix format. - * @param type the type of matrix values. - * @param rows matrix rows. - * @param cols matrix columns. - * @param nnz nonzero values of sparse matrix. 
- */ -typedef struct { - hl_matrix_s matrix; - hl_matrix_format_t format; - hl_matrix_value_t type; - int rows; - int cols; - size_t nnz; -} _hl_sparse_matrix_s, *hl_sparse_matrix_s; - -#ifdef __NVCC__ - -#include -#include "paddle/cuda/include/hl_cuda.h" -#include "paddle/utils/Logging.h" - -extern __thread bool g_sync_flag; -extern __thread cudaStream_t default_stream; -#define STREAM_DEFAULT default_stream - -/** - * @brief Check cuda kernel execution. - * @param msg error string - */ -#define CHECK_SYNC(msg) \ - if (true == g_sync_flag) { \ - hl_stream_synchronize(HPPL_STREAM_DEFAULT); \ - cudaError_t err = (cudaError_t)hl_get_device_last_error(); \ - CHECK_EQ(cudaSuccess, err) \ - << "[" << msg << "] " \ - << "CUDA error: " << hl_get_device_error_string((size_t)err); \ - } - -// __shfl has been deprecated as of CUDA 9.0. -#if CUDA_VERSION < 9000 -template -__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { - return __shfl_down(val, delta); -} - -template -__forceinline__ __device__ T -__shfl_sync(unsigned, T val, int src_line, int width) { - return __shfl(val, src_line, width); -} - -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif - -#endif // __NVCC__ diff --git a/paddle/cuda/include/hl_gpu_gru.cuh b/paddle/cuda/include/hl_gpu_gru.cuh deleted file mode 100644 index 9fcad2c3bc2fa255e3d7cd3e7940a32fd286751b..0000000000000000000000000000000000000000 --- a/paddle/cuda/include/hl_gpu_gru.cuh +++ /dev/null @@ -1,393 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - - -#ifndef HL_GPU_GRU_CUH_ -#define HL_GPU_GRU_CUH_ - -#ifdef __NVCC__ - -#include "paddle/utils/Logging.h" - -/* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) - */ -template -__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, - real *gateValue, - real *resetOutputValue, - real *prevOutputValue, - int frameSize, - int batchSize, - hl_activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - resetOutputValue += batchIdx * frameSize; - } - - real rPrevOut = 0; - real rValueResetOutput; - real rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - real rValueResetGate = gateValue[frameIdx + frameSize * 1]; - - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; - } - - opResetOutput(rValueUpdateGate, - rValueResetGate, - rPrevOut, - rValueResetOutput, - hppl::gpu::forward[active_gate]); - - gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; - gateValue[frameIdx + frameSize * 1] = rValueResetGate; - resetOutputValue[frameIdx] = rValueResetOutput; -} - -/* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) - */ -template -__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, - real *gateValue, - real *prevOutputValue, - real *outputValue, - int frameSize, - int batchSize, - hl_activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - outputValue += batchIdx * frameSize; - } - - real rOutput; - real rPrevOut = 0; - real rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - real rValueFrameState = gateValue[frameIdx + frameSize * 2]; - - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; - } - - opFinalOutput(rValueUpdateGate, - rValueFrameState, - rPrevOut, - rOutput, - hppl::gpu::forward[active_node]); - - gateValue[frameIdx + frameSize * 2] = rValueFrameState; - outputValue[frameIdx] = rOutput; -} - -template -void hl_gpu_gru_forward(OpResetOutput opResetOutput, - OpFinalOutput opFinalOutput, - hl_gru_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) { - dim3 threads; - dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? 
frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); - } else { - threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); - } - - if (value.prevOutValue) { - hl_matrix_mul(value.prevOutValue, HPPL_OP_N, - value.gateWeight, HPPL_OP_N, - value.gateValue, - batchSize, 2*frameSize, frameSize, - /*alpha = */ 1, /*beta = */ 1, - frameSize, 2* frameSize, 3*frameSize); - } - - if (batchSize == 1) { - KeGruForwardResetOutput - <<>>(opResetOutput, - value.gateValue, value.resetOutputValue, value.prevOutValue, - frameSize, batchSize, active_gate); - } else { - KeGruForwardResetOutput - <<>>(opResetOutput, - value.gateValue, value.resetOutputValue, value.prevOutValue, - frameSize, batchSize, active_gate); - } - - if (value.prevOutValue) { - hl_matrix_mul(value.resetOutputValue, HPPL_OP_N, - value.stateWeight, HPPL_OP_N, - value.gateValue + 2*frameSize, - batchSize, frameSize, frameSize, - /*alpha = */ 1, /*beta = */ 1, - frameSize, frameSize, 3*frameSize); - } - - if (batchSize == 1) { - KeGruForwardFinalOutput - <<>>(opFinalOutput, - value.gateValue, value.prevOutValue, value.outputValue, - frameSize, batchSize, active_node); - } else { - KeGruForwardFinalOutput - <<>>(opFinalOutput, - value.gateValue, value.prevOutValue, value.outputValue, - frameSize, batchSize, active_node); - } - - CHECK_SYNC("hl_gpu_gru_forward failed"); -} - -/* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) - */ -template -__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, - real *gateValue, - real *gateGrad, - real *prevOutValue, - real *prevOutGrad, - real *outputGrad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - outputGrad += batchIdx * frameSize; - } - - real rUpdateGateGrad; - real rFrameStateGrad; - real rPrevOutValue = 0; - real rPrevOutGrad = 0; - real rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - real rFrameStateValue = gateValue[frameIdx + frameSize * 2]; - real rOutGrad = outputGrad[frameIdx]; - - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; - - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutGrad = prevOutGrad[frameIdx]; - } - - opStateGrad(rUpdateGateValue, - rUpdateGateGrad, - rFrameStateValue, - rFrameStateGrad, - rPrevOutValue, - rPrevOutGrad, - rOutGrad, - hppl::gpu::backward[active_node]); - - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; - } -} - -/* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) - */ -template -__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, - real *gateValue, - real *gateGrad, - real *prevOutValue, - real *prevOutGrad, - real *resetOutputGrad, - int frameSize, - int batchSize, - hl_activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) 
return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - resetOutputGrad += batchIdx * frameSize; - } - - real rResetGateGrad; - real rPrevOutValue = 0; - real rPrevOutGrad = 0; - real rResetOutputGrad = 0; - real rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - real rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; - real rResetGateValue = gateValue[frameIdx + frameSize * 1]; - - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; - rPrevOutGrad = prevOutGrad[frameIdx]; - rResetOutputGrad = resetOutputGrad[frameIdx]; - } - - opResetGrad(rUpdateGateValue, - rUpdateGateGrad, - rResetGateValue, - rResetGateGrad, - rPrevOutValue, - rPrevOutGrad, - rResetOutputGrad, - hppl::gpu::backward[active_gate]); - - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; - } -} - -template -void hl_gpu_gru_backward(OpStateGrad opStateGrad, - OpResetGrad opResetGrad, - hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) { - dim3 threads; - dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); - } else { - threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); - } - - if (batchSize == 1) { - KeGruBackwardStateGrad - <<>>(opStateGrad, - value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, - grad.outputGrad, frameSize, batchSize, active_node); - } else { - KeGruBackwardStateGrad - <<>>(opStateGrad, - value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, - grad.outputGrad, frameSize, batchSize, active_node); - } - - if (value.prevOutValue && grad.prevOutGrad) { - hl_matrix_mul(grad.gateGrad + 2*frameSize, HPPL_OP_N, - value.stateWeight, HPPL_OP_T, - grad.resetOutputGrad, - batchSize, frameSize, frameSize, - /*alpha = */ 1, /*beta = */ 0, - 3*frameSize, frameSize, frameSize); - if (grad.stateWeightGrad) { - hl_matrix_mul(value.resetOutputValue, HPPL_OP_T, - grad.gateGrad + 2*frameSize, HPPL_OP_N, - grad.stateWeightGrad, - frameSize, frameSize, batchSize, - /*alpha = */ 1, /*beta = */ 1, - frameSize, 3*frameSize, frameSize); - } - } - - if (batchSize == 1) { - KeGruBackwardResetGrad - <<>>(opResetGrad, - value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, - grad.resetOutputGrad, frameSize, batchSize, active_gate); - } else { - KeGruBackwardResetGrad - <<>>(opResetGrad, - value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, - grad.resetOutputGrad, frameSize, batchSize, active_gate); - } - - if (grad.prevOutGrad && value.prevOutValue) { - hl_matrix_mul(grad.gateGrad, HPPL_OP_N, - value.gateWeight, HPPL_OP_T, - grad.prevOutGrad, - batchSize, frameSize, 2*frameSize, - /*alpha = */ 1, /*beta = */ 1, - 3*frameSize, 2*frameSize, frameSize); - if (grad.gateWeightGrad) { - hl_matrix_mul(value.prevOutValue, HPPL_OP_T, - grad.gateGrad, HPPL_OP_N, - grad.gateWeightGrad, - frameSize, 2*frameSize, batchSize, - /*alpha = */ 1, /*beta = */ 1, - frameSize, 3*frameSize, 2*frameSize); - } - } - - CHECK_SYNC("hl_gpu_gru_backward failed"); -} - -#else - -template -void 
hl_gpu_gru_forward(OpResetOutput opResetOutput, - OpFinalOutput opFinalOutput, - hl_gru_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) {} - -template -void hl_gpu_gru_backward(OpStateGrad opStateGrad, - OpResetGrad opResetGrad, - hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate) {} - -#endif - -#endif /* HL_GPU_GRU_CUH_ */ diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh deleted file mode 100644 index 92517a44d2353a42d905708fc9aa98727a13a9e9..0000000000000000000000000000000000000000 --- a/paddle/cuda/include/hl_gpu_lstm.cuh +++ /dev/null @@ -1,300 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - - -#ifndef HL_GPU_LSTM_CUH_ -#define HL_GPU_LSTM_CUH_ - -#ifdef __NVCC__ - -#include "paddle/utils/Logging.h" -#include "hl_device_functions.cuh" - -/* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) - */ -template -__global__ void KeLstmForward(Op op, - hl_lstm_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - value.gateValue += batchIdx * frameSize * 4; - value.outputValue += batchIdx * frameSize; - value.stateValue += batchIdx * frameSize; - value.stateActiveValue += batchIdx * frameSize; - } - - real rState; - real rPrevState = 0; - real rStateAtv; - real rOut; - real rValueIn; - real rValueIg; - real rValueFg; - real rValueOg; - real rCheckI = value.checkIg[frameIdx]; - real rCheckF = value.checkFg[frameIdx]; - real rCheckO = value.checkOg[frameIdx]; - - rValueIn = value.gateValue[frameIdx]; - rValueIg = value.gateValue[frameIdx + frameSize]; - rValueFg = value.gateValue[frameIdx + frameSize * 2]; - rValueOg = value.gateValue[frameIdx + frameSize * 3]; - - if (value.prevStateValue) { - if (isBatch) value.prevStateValue += batchIdx * frameSize; - rPrevState = value.prevStateValue[frameIdx]; - } - - op(rValueIn, - rValueIg, - rValueFg, - rValueOg, - rPrevState, - rState, - rStateAtv, - rOut, - rCheckI, - rCheckF, - rCheckO, - hppl::gpu::forward[active_node], - hppl::gpu::forward[active_gate], - hppl::gpu::forward[active_state]); - - value.gateValue[frameIdx] = rValueIn; - value.gateValue[frameIdx + frameSize] = rValueIg; - value.gateValue[frameIdx + frameSize * 2] = rValueFg; - value.gateValue[frameIdx + frameSize * 3] = rValueOg; - - value.stateValue[frameIdx] = rState; - value.stateActiveValue[frameIdx] = rStateAtv; - value.outputValue[frameIdx] = rOut; -} - -/* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) - */ -template -__global__ void 
KeLstmBackward(Op op, - hl_lstm_value value, - hl_lstm_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - value.gateValue += batchIdx * frameSize * 4; - value.stateValue += batchIdx * frameSize; - value.stateActiveValue += batchIdx * frameSize; - grad.gateGrad += batchIdx * frameSize * 4; - grad.stateGrad += batchIdx * frameSize; - grad.outputGrad += batchIdx * frameSize; - } - - real rValueIn; - real rValueIg; - real rValueFg; - real rValueOg; - real rGradIn; - real rGradIg; - real rGradFg; - real rGradOg; - real rPrevState = 0; - real rPrevStateGrad; - real rState; - real rStateGrad; - real rStateAtv; - real rOutputGrad; - real rCheckI = value.checkIg[frameIdx]; - real rCheckF = value.checkFg[frameIdx]; - real rCheckO = value.checkOg[frameIdx]; - real rCheckIGrad; - real rCheckFGrad; - real rCheckOGrad; - - rValueIn = value.gateValue[frameIdx]; - rValueIg = value.gateValue[frameIdx + frameSize]; - rValueFg = value.gateValue[frameIdx + frameSize * 2]; - rValueOg = value.gateValue[frameIdx + frameSize * 3]; - rState = value.stateValue[frameIdx]; - rStateAtv = value.stateActiveValue[frameIdx]; - rOutputGrad = grad.outputGrad[frameIdx]; - rStateGrad = grad.stateGrad[frameIdx]; - - if (value.prevStateValue) { - if (isBatch) value.prevStateValue += batchIdx * frameSize; - rPrevState = value.prevStateValue[frameIdx]; - } - - op(rValueIn, - rValueIg, - rValueFg, - rValueOg, - rGradIn, - rGradIg, - rGradFg, - rGradOg, - rPrevState, - rPrevStateGrad, - rState, - rStateGrad, - rStateAtv, - rOutputGrad, - rCheckI, - rCheckF, - rCheckO, - rCheckIGrad, - rCheckFGrad, - rCheckOGrad, - hppl::gpu::backward[active_node], - hppl::gpu::backward[active_gate], - hppl::gpu::backward[active_state]); - - grad.gateGrad[frameIdx] = rGradIn; - grad.gateGrad[frameIdx + frameSize ] = rGradIg; - grad.gateGrad[frameIdx + frameSize * 2] = rGradFg; - grad.gateGrad[frameIdx + frameSize * 3] = rGradOg; - grad.stateGrad[frameIdx] = rStateGrad; - if (grad.prevStateGrad) { - if (isBatch) grad.prevStateGrad += batchIdx * frameSize; - grad.prevStateGrad[frameIdx] = rPrevStateGrad; - } - - if (isBatch) { - if (value.prevStateValue) { - if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad); - if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad); - } - if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad); - } else { - if (value.prevStateValue) { - if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; - if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad; - } - if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad; - } -} - -template -void hl_gpu_lstm_forward(Op op, - hl_lstm_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - dim3 threads; - dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? 
frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); - } else { - /* framePerBlock = 32 batchPerBlock = 32 */ - threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); - } - - if (batchSize == 1) { - KeLstmForward - <<>>(op, value, - frameSize, batchSize, active_node, active_gate, active_state); - } else { - KeLstmForward - <<>>(op, value, - frameSize, batchSize, active_node, active_gate, active_state); - } - - CHECK_SYNC("hl_gpu_lstm_forward failed"); -} - -template -void hl_gpu_lstm_backward(Op op, - hl_lstm_value value, - hl_lstm_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - dim3 threads; - dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); - } else { - /* framePerBlock = 32 batchPerBlock = 32 */ - threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); - } - - if (batchSize == 1) { - KeLstmBackward - <<>>(op, value, grad, - frameSize, batchSize, active_node, active_gate, active_state); - } else { - KeLstmBackward - <<>>(op, value, grad, - frameSize, batchSize, active_node, active_gate, active_state); - } - - CHECK_SYNC("hl_gpu_lstm_backward failed"); -} - -#else - -template -void hl_gpu_lstm_forward(Op op, - hl_lstm_value value, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) {} - -template -void hl_gpu_lstm_backward(Op op, - hl_lstm_value value, - hl_lstm_grad grad, - int frameSize, - int batchSize, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) {} - -#endif - -#endif /* HL_GPU_LSTM_CUH_ */ diff --git a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/cuda/include/hl_gpu_matrix_kernel.cuh deleted file mode 100644 index 0db023ce3745f95ced8b3a33a1d6bcb20066b2ef..0000000000000000000000000000000000000000 --- a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh +++ /dev/null @@ -1,629 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
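Both the GRU and LSTM host wrappers above pick their launch geometry the same way: a single sequence (batchSize == 1) gets a 1-D grid over frames, while a batch is tiled 32x32 over (frame, batch). A standalone sketch of that choice, under a hypothetical helper name:

// Hypothetical helper mirroring the block/grid selection used by
// hl_gpu_gru_forward/backward and hl_gpu_lstm_forward/backward above.
inline void pickFrameBatchLaunch(int frameSize, int batchSize,
                                 dim3 *threads, dim3 *grid) {
  if (batchSize == 1) {
    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
    int frameBlocks = (frameSize + 1024 - 1) / 1024;
    *threads = dim3(framePerBlock, 1);
    *grid = dim3(frameBlocks, 1);
  } else {
    *threads = dim3(32, 32);  // 32 frames x 32 batch rows per block
    *grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
  }
}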
*/ - - - -#ifndef HL_GPU_MATRIX_KERNEL_CUH_ -#define HL_GPU_MATRIX_KERNEL_CUH_ - -#include -#include "paddle/utils/Logging.h" -#include "hl_base.h" - -#ifdef __NVCC__ -/* gpu apply interface */ - -template -__global__ void KeEltWiseUnaryOp(T* A_d, const int border, Op op) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < border) { - op.gpuOperator(A_d[idx]); - } -} - -template -__global__ void KeEltWiseUnaryOp(T* A_d, - int dimM, - int dimN, - int lda, - Op op) { - const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; - const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; - for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { - for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { - op.gpuOperator(A_d[i * lda + j]); - } - } -} - -template -__global__ void KeEltWiseBinaryOp(T* A_d, T *B_d, const int border, Op op) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < border) { - op.gpuOperator(A_d[idx], B_d[idx]); - } -} - -template -__global__ void KeEltWiseBinaryOp(T *A_d, - T *B_d, - int dimM, - int dimN, - int lda, - int ldb, - Op op) { - const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; - const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; - for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { - for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { - if (BAsRowVector == 0 && BAsColVector == 0) { - op.gpuOperator(A_d[i * lda + j], B_d[i * ldb + j]); - } else if (BAsRowVector == 1 && BAsColVector == 0) { - op.gpuOperator(A_d[i * lda + j], B_d[j]); - } else if (BAsRowVector == 0 && BAsColVector == 1) { - op.gpuOperator(A_d[i * lda + j], B_d[i * ldb]); - } else { - op.gpuOperator(A_d[i * lda + j], B_d[0]); - } - } - } -} - -template -__global__ void KeEltWiseTernaryOp(T* A_d, - T *B_d, - T *C_d, - const int border, - Op op) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < border) { - op.gpuOperator(A_d[idx], B_d[idx], C_d[idx]); - } -} - -template -__global__ void KeEltWiseTernaryOp(T* A_d, - T* B_d, - T* C_d, - int dimM, - int dimN, - int lda, - int ldb, - int ldc, - Op op) { - const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; - const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; - for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { - for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { - if (CAsRowVector == 0 && CAsColVector == 0) { - op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc + j]); - } else if (CAsRowVector == 1 && CAsColVector == 0) { - op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[j]); - } else if (CAsRowVector == 0 && CAsColVector == 1) { - op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc]); - } else { - op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[0]); - } - } - } -} - -template -__global__ void KeEltWiseQuaternaryOp(T* A_d, - T* B_d, - T* C_d, - T* D_d, - const int border, - Op op) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < border) { - op.gpuOperator(A_d[idx], B_d[idx], C_d[idx], D_d[idx]); - } -} - -template -__global__ void KeEltWiseQuaternaryOp(T* A_d, - T* B_d, - T* C_d, - T* D_d, - int dimM, - int dimN, - int lda, - int ldb, - int ldc, - int ldd, - Op op) { - const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; - const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; - for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { - for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { - op.gpuOperator(A_d[i*lda + j], - B_d[i*ldb + j], C_d[i*ldc + j], 
D_d[i*ldd + j]); - } - } -} - -/** - * @brief gpu element wise unary operator. - */ -template -void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) { - CHECK_NOTNULL(A_d); - - if (dimM == 1 || dimN == lda) { - int size = dimM * dimN; - int blockSize = size <= 1024 ? size : 1024; - int gridSize = (size + 1024 - 1) / 1024; - KeEltWiseUnaryOp<<>> - (A_d, size, op); - } else { - int blockSizeY = std::min(32, dimM); - int blockSizeX = (32 / blockSizeY) * 32; - int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); - int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); - dim3 threads(blockSizeX, blockSizeY); - dim3 grid(gridSizeX, gridSizeY); - KeEltWiseUnaryOp<<>> - (A_d, dimM, dimN, lda, op); - } - - CHECK_SYNC("hl_gpu_apply_unary_op failed"); -} - -/** - * @brief gpu element wise binary operator. - */ -template -void hl_gpu_apply_binary_op(Op op, - T* A_d, - T* B_d, - int dimM, - int dimN, - int lda, - int ldb) { - CHECK_NOTNULL(A_d); - - if ((BAsRowVector == 0 && BAsColVector == 0) && - ((dimM == 1) || (dimN == lda && dimN == ldb))) { - int size = dimM * dimN; - int blockSize = size <= 1024 ? size : 1024; - int gridSize = (size + 1024 - 1) / 1024; - KeEltWiseBinaryOp<<>> - (A_d, B_d, size, op); - } else { - int blockSizeY = std::min(32, dimM); - int blockSizeX = (32 / blockSizeY) * 32; - int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); - int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); - dim3 threads(blockSizeX, blockSizeY); - dim3 grid(gridSizeX, gridSizeY); - KeEltWiseBinaryOp - <<>> - (A_d, B_d, dimM, dimN, lda, ldb, op); - } - - CHECK_SYNC("hl_gpu_apply_binary_op failed"); -} - -/** - * @brief gpu element wise ternary operator. - */ -template -void hl_gpu_apply_ternary_op(Op op, - T* A_d, - T* B_d, - T* C_d, - int dimM, - int dimN, - int lda, - int ldb, - int ldc) { - CHECK_NOTNULL(A_d); - - if ((CAsRowVector == 0 && CAsColVector == 0) && - ((dimM == 1) || (dimN == lda && dimN == ldb && dimN == ldc))) { - int size = dimM * dimN; - int blockSize = size <= 1024 ? size : 1024; - int gridSize = (size + 1024 - 1) / 1024; - KeEltWiseTernaryOp<<>> - (A_d, B_d, C_d, size, op); - } else { - int blockSizeY = std::min(32, dimM); - int blockSizeX = (32 / blockSizeY) * 32; - int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); - int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); - dim3 threads(blockSizeX, blockSizeY); - dim3 grid(gridSizeX, gridSizeY); - KeEltWiseTernaryOp - <<>> - (A_d, B_d, C_d, dimM, dimN, lda, ldb, ldc, op); - } - - CHECK_SYNC("hl_gpu_apply_ternary_op failed"); -} - - -/** - * @brief gpu element wise quaternary operator. - */ -template -void hl_gpu_apply_quaternary_op(Op op, - T* A_d, - T* B_d, - T* C_d, - T* D_d, - int dimM, - int dimN, - int lda, - int ldb, - int ldc, - int ldd) { - CHECK_NOTNULL(A_d); - - if ((dimM == 1) || - (dimN == lda && dimN == ldb && dimN == ldc && dimN == ldd)) { - int size = dimM * dimN; - int blockSize = size <= 1024 ? 
size : 1024; - int gridSize = (size + 1024 - 1) / 1024; - KeEltWiseQuaternaryOp<<>> - (A_d, B_d, C_d, D_d, size, op); - } else { - int blockSizeY = std::min(32, dimM); - int blockSizeX = (32 / blockSizeY) * 32; - int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); - int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); - dim3 threads(blockSizeX, blockSizeY); - dim3 grid(gridSizeX, gridSizeY); - KeEltWiseQuaternaryOp<<>> - (A_d, B_d, C_d, D_d, dimM, dimN, lda, ldb, ldc, ldd, op); - } - - CHECK_SYNC("hl_gpu_apply_quaternary_op failed"); -} - -#else - -template -void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {} - -template -void hl_gpu_apply_binary_op(Op op, - T* A_d, - T* B_d, - int dimM, - int dimN, - int lda, - int ldb) {} - -template -void hl_gpu_apply_ternary_op(Op op, - T* A_d, - T* B_d, - T* C_d, - int dimM, - int dimN, - int lda, - int ldb, - int ldc) {} - -template -void hl_gpu_apply_quaternary_op(Op op, - T* A_d, - T* B_d, - T* C_d, - T* D_d, - int dimM, - int dimN, - int lda, - int ldb, - int ldc, - int ldd) {} -#endif - -#ifdef __NVCC__ -/** - * @brief matrix row operator. - */ - -template -__device__ __inline__ real sumRow(Agg agg, Op op, - int idx, int blockSize, - int dimN, real *A) { - real tmp = agg.init(); - int cnt = (dimN + blockSize -1) / blockSize; - for (int i = 0; i < cnt && idx < dimN; i++) { - tmp = agg(tmp, op(A[idx])); - idx += blockSize; - } - return tmp; -} - -template -__device__ __inline__ real sumRow(Agg agg, Op op, - int idx, int blockSize, - int dimN, real *A, real *B) { - real tmp = agg.init(); - int cnt = (dimN + blockSize -1) / blockSize; - for (int i = 0; i < cnt && idx < dimN; i++) { - tmp = agg(tmp, op(A[idx], B[idx])); - idx += blockSize; - } - return tmp; -} - -template -__device__ __inline__ void aggRow(Agg agg, real *row, int size, int tid) { - for (int stride = size/2; stride > 0; stride = stride/2) { - if (tid < stride) { - row[tid] = agg(row[tid], row[tid + stride]); - } - __syncthreads(); - } -} - -template -__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv, - int dimN, - real *dst, int ld, - real *A, int lda) { - __shared__ real row_s[blockSize]; - int rowId = blockIdx.x + blockIdx.y*gridDim.x; - int tid = threadIdx.x; - - A += rowId*lda; - row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A); - __syncthreads(); - - aggRow(agg, row_s, blockSize, tid); - __syncthreads(); - - if (tid == 0) { - dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]); - } -} - -template -__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv, - int dimN, - real *dst, int ld, - real *A, int lda, - real *B, int ldb) { - __shared__ real row_s[blockSize]; - int rowId = blockIdx.x + blockIdx.y*gridDim.x; - int tid = threadIdx.x; - - A += rowId*lda; - B += rowId*ldb; - row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A, B); - __syncthreads(); - - aggRow(agg, row_s, blockSize, tid); - __syncthreads(); - - if (tid == 0) { - dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]); - } -} - -/** - * @brief matrix column operator. 
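The element-wise kernels and hl_gpu_apply_* wrappers above are generic over an Op functor; all they require is a device-side gpuOperator() that updates its first argument in place. A hypothetical functor in that style (not one of the operators shipped with the library):

// Hypothetical binary operator for hl_gpu_apply_binary_op: computes
// A[i][j] = A[i][j] * scale + B[i][j] element by element.
struct ScaleThenAddOp {
  real scale;
  __device__ inline void gpuOperator(real &a, real &b) { a = a * scale + b; }
};
// Passed as the op argument of hl_gpu_apply_binary_op; the row/column
// broadcast template flags are omitted in this sketch.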
- */ -template -__device__ __inline__ real sumCol(Agg agg, Op op, - int index, int stride, - int dimM, real *A, int lda) { - real tmp = agg.init(); - for (; index < dimM;) { - tmp = agg(tmp, op(A[index*lda])); - index += stride; - } - return tmp; -} - -template -__device__ __inline__ real sumCol(Agg agg, Op op, - int index, int stride, int dimM, - real *A, int lda, real *B, int ldb) { - real tmp = agg.init(); - for (; index < dimM;) { - tmp = agg(tmp, op(A[index*lda], B[index*ldb])); - index += stride; - } - return tmp; -} - -template -__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, - real *A, int lda) { - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (rowIdx < dimN) { - A += rowIdx; - real tmp = sumCol(agg, op, 0, 1, dimM, A, lda); - dst[rowIdx] = sv(dst[rowIdx], tmp); - } -} - -template -__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, - real *A, int lda) { - __shared__ real col_s[blockDimX*blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - - if (rowIdx < dimN) { - A += rowIdx; - real tmp = sumCol(agg, op, threadIdx.y, blockDimY, dimM, A, lda); - col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp; - } - __syncthreads(); - - if (rowIdx < dimN) { - if (threadIdx.y ==0) { - real tmp = agg.init(); - for (int i=0; i < blockDimY; i++) { - tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]); - } - dst[rowIdx] = sv(dst[rowIdx], tmp); - } - } -} - -template -__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, - real *A, int lda, - real *B, int ldb) { - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (rowIdx < dimN) { - A += rowIdx; - B += rowIdx; - real tmp = sumCol(agg, op, 0, 1, dimM, A, lda, B, ldb); - dst[rowIdx] = sv(dst[rowIdx], tmp); - } -} - -template -__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, - real *A, int lda, - real *B, int ldb) { - __shared__ real col_s[blockDimX*blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - - if (rowIdx < dimN) { - A += rowIdx; - B += rowIdx; - real tmp = sumCol(agg, op, - threadIdx.y, blockDimY, dimM, A, lda, B, ldb); - col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp; - } - __syncthreads(); - - if (rowIdx < dimN) { - if (threadIdx.y ==0) { - real tmp = agg.init(); - for (int i=0; i < blockDimY; i++) { - tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]); - } - dst[rowIdx] = sv(dst[rowIdx], tmp); - } - } -} - -#endif - -template -void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, int ld, - real *A, int lda) { -#ifdef __NVCC__ - CHECK_NOTNULL(dst); - CHECK_NOTNULL(A); - - int blocksX = dimM; - int blocksY = 1; - dim3 threads(128, 1); - dim3 grid(blocksX, blocksY); - KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, op, sv, dimN, dst, ld, A, lda); - - CHECK_SYNC("hl_matrix_row_op failed"); -#endif -} - -template -void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, int ld, - real *A, int lda, - real *B, int ldb) { -#ifdef __NVCC__ - CHECK_NOTNULL(dst); - CHECK_NOTNULL(A); - - int blocksX = dimM; - int blocksY = 1; - dim3 threads(128, 1); - dim3 grid(blocksX, blocksY); - KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, op, sv, dimN, dst, ld, A, lda, B, ldb); - - CHECK_SYNC("hl_matrix_row_op failed"); -#endif -} - -template -void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, - real *A, int lda) { 
-#ifdef __NVCC__ - if (dimN >= 8192) { - int blocksX = (dimN + 128 -1) / 128; - int blocksY = 1; - dim3 threads(128, 1); - dim3 grid(blocksX, blocksY); - KeMatrixColumnOp - <<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, op, sv, dimM, dimN, dst, A, lda); - } else { - int blocksX = (dimN + 32 -1) / 32; - int blocksY = 1; - dim3 threads(32, 32); - dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S - <<< grid, threads, 0, STREAM_DEFAULT>>> - (agg, op, sv, dimM, dimN, dst, A, lda); - } - - CHECK_SYNC("hl_matrix_column_op failed"); -#endif -} - -template -void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv, - int dimM, int dimN, - real *dst, - real *A, int lda, - real *B, int ldb) { -#ifdef __NVCC__ - if (dimN >= 8192) { - int blocksX = (dimN + 128 -1) / 128; - int blocksY = 1; - dim3 threads(128, 1); - dim3 grid(blocksX, blocksY); - KeMatrixColumnOp - <<< grid, threads, 0, STREAM_DEFAULT >>> - (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); - } else { - int blocksX = (dimN + 32 -1) / 32; - int blocksY = 1; - dim3 threads(32, 32); - dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S - <<< grid, threads, 0, STREAM_DEFAULT>>> - (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); - } - - CHECK_SYNC("hl_matrix_column_op failed"); -#endif -} - -#endif /* HL_GPU_MATRIX_KERNEL_CUH_ */ diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu deleted file mode 100644 index d30c264127f47da9a48acb71c59cb9e134ced127..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_aggregate.cu +++ /dev/null @@ -1,293 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "hl_aggregate.h" -#include "hl_base.h" -#include "hl_cuda.h" -#include "hl_cuda.ph" -#include "hl_matrix_base.cuh" -#include "hl_thread.ph" -#include "paddle/utils/Logging.h" - -/** - * @brief matrix row operator. 
- */ -template -__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) { - __shared__ real sum_s[blockSize]; - int cnt = (dimN + blockSize - 1) / blockSize; - int rowId = blockIdx.x + blockIdx.y * gridDim.x; - int index = rowId * dimN; - int tid = threadIdx.x; - int lmt = tid; - - real tmp = agg.init(); - for (int ii = 0; ii < cnt && lmt < dimN; ii++) { - tmp = agg(tmp, E[index + lmt]); - lmt += blockSize; - } - sum_s[tid] = tmp; - __syncthreads(); - - for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { - if (tid < stride) { - sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - Sum[rowId] = sum_s[0]; - } -} - -template -void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { - int blocksX = dimM; - int blocksY = 1; - dim3 threads(128, 1); - dim3 grid(blocksX, blocksY); - - KeMatrixRowOp<<>>( - agg, A_d, C_d, dimN); -} - -void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN); - CHECK_SYNC("hl_matrix_row_sum failed"); -} - -void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN); - CHECK_SYNC("hl_matrix_row_max failed"); -} - -void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN); - CHECK_SYNC("hl_matrix_row_min failed"); -} - -/** - * @brief matrix column operator. - */ -template -__global__ void KeMatrixColumnOp( - Agg agg, real *E, real *Sum, int dimM, int dimN) { - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - real tmp = agg.init(); - if (rowIdx < dimN) { - for (int index = 0; index < dimM; index++) { - tmp = agg(tmp, E[dimN * index + rowIdx]); - } - Sum[rowIdx] = tmp; - } -} - -template -__global__ void KeMatrixColumnOp_S( - Agg agg, real *E, real *Sum, int dimM, int dimN) { - __shared__ real _sum[blockDimX * blockDimY]; - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int index = threadIdx.y; - - real tmp = agg.init(); - if (rowIdx < dimN) { - for (; index < dimM;) { - tmp = agg(tmp, E[dimN * index + rowIdx]); - index += blockDimY; - } - } - _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp; - __syncthreads(); - - if (rowIdx < dimN) { - if (threadIdx.y == 0) { - real tmp = agg.init(); - for (int i = 0; i < blockDimY; i++) { - tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]); - } - Sum[rowIdx] = tmp; - } - } -} - -template -void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { - if (dimN >= 8192) { - int blocksX = (dimN + 128 - 1) / 128; - int blocksY = 1; - dim3 threads(128, 1); - dim3 grid(blocksX, blocksY); - KeMatrixColumnOp<<>>( - agg, A_d, C_d, dimM, dimN); - } else { - int blocksX = (dimN + 32 - 1) / 32; - int blocksY = 1; - dim3 threads(32, 32); - dim3 grid(blocksX, blocksY); - KeMatrixColumnOp_S<<>>( - agg, A_d, C_d, dimM, dimN); - } - - return; -} - -void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN); - - CHECK_SYNC("hl_matrix_column_sum failed"); -} - -void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN); - - 
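The row and column reductions above are parameterized by an aggregate functor: init() supplies the identity element and operator() folds in one more value, which is how sum, max and min share the same kernels. A sketch of that protocol (aggregate::sum/max/min themselves presumably come from hl_matrix_base.cuh, included above):

// Hypothetical aggregator following the protocol expected by
// KeMatrixRowOp / KeMatrixColumnOp above.
struct SumAggregate {
  __device__ inline real init() { return 0.0f; }  // identity element
  __device__ inline real operator()(real a, real b) { return a + b; }
};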
CHECK_SYNC("hl_matrix_column_max failed"); -} - -void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN); - - CHECK_SYNC("hl_matrix_column_min failed"); -} - -template -__global__ void KeVectorSum(real *E, real *Sum, int dimM) { - __shared__ double sum_s[blockSize]; - int tid = threadIdx.x; - int index = blockIdx.y * blockDim.x + threadIdx.x; - - sum_s[tid] = 0.0f; - while (index < dimM) { - sum_s[tid] += E[index]; - index += blockDim.x * gridDim.y; - } - __syncthreads(); - - for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { - if (tid < stride) { - sum_s[tid] += sum_s[tid + stride]; - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - Sum[blockIdx.y] = sum_s[0]; - } -} - -void hl_vector_sum(real *A_d, real *C_h, int dimM) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_h); - - int blockSize = 128; - int gridSize = 128; - int blocksX = 1; - int blocksY = gridSize; - dim3 threads(blockSize, 1); - dim3 grid(blocksX, blocksY); - - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; - hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) { - } - - KeVectorSum<128><<>>( - A_d, t_resource.gpu_mem, dimM); - KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( - t_resource.gpu_mem, t_resource.cpu_mem, 128); - - hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); - hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); - - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) << "CUDA error: " - << hl_get_device_error_string((size_t)err); -} - -template -__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { - __shared__ double sum_s[blockSize]; - int tid = threadIdx.x; - int index = blockIdx.y * blockDim.x + threadIdx.x; - - sum_s[tid] = 0.0f; - while (index < dimM) { - sum_s[tid] += abs(E[index]); - index += blockDim.x * gridDim.y; - } - __syncthreads(); - - for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { - if (tid < stride) { - sum_s[tid] += sum_s[tid + stride]; - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - Sum[blockIdx.y] = sum_s[0]; - } -} - -void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_h); - - int blockSize = 128; - int gridSize = 128; - int blocksX = 1; - int blocksY = gridSize; - dim3 threads(blockSize, 1); - dim3 grid(blocksX, blocksY); - - struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; - hl_event_t hl_event = &hl_event_st; - while (!hl_cuda_event_is_ready(hl_event)) { - } - - KeVectorAbsSum<128><<>>( - A_d, t_resource.gpu_mem, dimM); - KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( - t_resource.gpu_mem, t_resource.cpu_mem, 128); - - hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); - hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); - - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - cudaError_t err = (cudaError_t)hl_get_device_last_error(); - CHECK_EQ(cudaSuccess, err) << "CUDA error: " - << hl_get_device_error_string((size_t)err); -} diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc deleted file mode 100644 index 975df4287894090799c44bc0a4e9e08e4144e68f..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ /dev/null @@ -1,400 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "hl_cuda_cublas.h" -#include -#include "hl_cuda.h" -#include "hl_thread.ph" -#include "paddle/utils/DynamicLoader.h" -#include "paddle/utils/Logging.h" - -namespace dynload { - -std::once_flag cublas_dso_flag; -void *cublas_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ - void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; // struct DynLoad__##__name -#else -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; // struct DynLoad__##__name -#endif - -#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) - -// include all needed cublas functions in HPPL -// clang-format off -#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSgemv) \ - __macro(cublasDgemv) \ - __macro(cublasSgemm) \ - __macro(cublasDgemm) \ - __macro(cublasSgeam) \ - __macro(cublasDgeam) \ - -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) -CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) - -#undef DYNAMIC_LOAD_CUBLAS_WRAP -#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP -#undef CUBLAS_BLAS_ROUTINE_EACH - -} /* namespace dynload */ - -// clang-format on -#ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM dynload::cublasSgeam -#define CUBLAS_GEMV dynload::cublasSgemv -#define CUBLAS_GEMM dynload::cublasSgemm -#define CUBLAS_GETRF dynload::cublasSgetrfBatched -#define CUBLAS_GETRI dynload::cublasSgetriBatched -#else -#define CUBLAS_GEAM dynload::cublasDgeam -#define CUBLAS_GEMV dynload::cublasDgemv -#define CUBLAS_GEMM dynload::cublasDgemm -#define CUBLAS_GETRF dynload::cublasDgetrfBatched -#define CUBLAS_GETRI dynload::cublasDgetriBatched -#endif - -const char *hl_cublas_get_error_string(cublasStatus_t status) { - switch (status) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "[cublas status]: not initialized"; - case CUBLAS_STATUS_ALLOC_FAILED: - return 
"[cublas status]: allocate failed"; - case CUBLAS_STATUS_INVALID_VALUE: - return "[cublas status]: invalid value"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "[cublas status]: arch mismatch"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "[cublas status]: mapping error"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "[cublas status]: execution failed"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "[cublas status]: internal error"; - case CUBLAS_STATUS_SUCCESS: - return "[cublas status]: success"; - default: - return "[cublas status]: unknown error"; - } -} - -/** - * Check build-in cublas function using glog and it also - * support << operator for more details error info. - */ -cublasStatus_t g_cublasStat; -#define CHECK_CUBLAS(cublas_func) \ - g_cublasStat = cublas_func; \ - CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \ - << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " " - -void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) { - CHECK_CUBLAS(dynload::cublasCreate(cublas_handle)) - << "[cublas init] Cublas create handle faild!"; - - CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream)) - << "[cublas init] Cublas set stream faild!"; -} - -void hl_matrix_transpose( - real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) { - real alpha = 1.0; - real beta = 0.0; - - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - dimM, - dimN, - &alpha, - A_d, - lda, - &beta, - nullptr, - dimM, - C_d, - ldc)); - CHECK_SYNC("hl_matrix_transpose failed"); -} - -void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) { - hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM); -} - -void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { - /* Solve Ax = I */ - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - /* Step 1: Compute the LU decomposition of matrix A */ - real **inout_h = &A_d; - real **inout_d = (real **)hl_malloc_device(sizeof(real *)); - hl_memcpy(inout_d, inout_h, sizeof(real *)); - - int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int)); - int *info_d = (int *)t_resource.gpu_mem; - - /* Note: cublasSgetrfBatched is used to calculate a number of - small-sized matrices. There may be a better way to reconstruct - the API for better performance. 
- */ - CHECK_CUBLAS( - CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1)); - - int info_h; - hl_memcpy(&info_h, info_d, sizeof(int)); - if (info_h != 0) { - LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; - } - - /* Step 2: Compute the inverse of the matrix given its LU decomposition */ - real **out_h = &C_d; - real **out_d = (real **)hl_malloc_device(sizeof(real *)); - hl_memcpy(out_d, out_h, sizeof(real *)); - - CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, - dimN, - (const real **)inout_d, - lda, - pivot_d, - out_d, - ldc, - info_d, - 1)); - - hl_memcpy(&info_h, info_d, sizeof(int)); - if (info_h != 0) { - LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n"; - } - - hl_free_mem_device(inout_d); - hl_free_mem_device(pivot_d); - hl_free_mem_device(out_d); - - CHECK_SYNC("hl_matrix_inverse failed"); -} - -void hl_matrix_mul(real *A_d, - hl_trans_op_t transa, - real *B_d, - hl_trans_op_t transb, - real *C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta, - int lda, - int ldb, - int ldc) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - - if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) { - int m = (transa == HPPL_OP_N) ? dimM : dimK; - int n = (transa == HPPL_OP_N) ? dimK : dimM; - hl_matrix_mul_vector( - A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc); - return; - } - - if (dimM == 1 && dimN != 1 && dimK != 1 && transa == HPPL_OP_N) { - int m = (transb == HPPL_OP_N) ? dimK : dimN; - int n = (transb == HPPL_OP_N) ? dimN : dimK; - hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N; - hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1); - return; - } - - cublasStatus_t stat; - if ((HPPL_OP_N == transa) && (HPPL_OP_N == transb)) { - stat = CUBLAS_GEMM(t_resource.handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - dimN, - dimM, - dimK, - &alpha, - B_d, - ldb, - A_d, - lda, - &beta, - C_d, - ldc); - } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) { - stat = CUBLAS_GEMM(t_resource.handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - dimN, - dimM, - dimK, - &alpha, - B_d, - ldb, - A_d, - lda, - &beta, - C_d, - ldc); - } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) { - stat = CUBLAS_GEMM(t_resource.handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - dimN, - dimM, - dimK, - &alpha, - B_d, - ldb, - A_d, - lda, - &beta, - C_d, - ldc); - } else { - LOG(FATAL) << "parameter transa error!"; - } - CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat); - CHECK_SYNC("hl_matrix_mul failed"); -} - -void hl_matrix_mul(real *A_d, - hl_trans_op_t transa, - real *B_d, - hl_trans_op_t transb, - real *C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta) { - int lda = (HPPL_OP_N == transa) ? dimK : dimM; - int ldb = (HPPL_OP_N == transb) ? 
dimN : dimK; - int ldc = dimN; - - hl_matrix_mul(A_d, - transa, - B_d, - transb, - C_d, - dimM, - dimN, - dimK, - alpha, - beta, - lda, - ldb, - ldc); -} - -void hl_matrix_mul_vector(real *A_d, - hl_trans_op_t trans, - real *B_d, - real *C_d, - int dimM, - int dimN, - real alpha, - real beta, - int lda, - int incb, - int incc) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - - cublasStatus_t stat; - if (HPPL_OP_N == trans) { - stat = CUBLAS_GEMV(t_resource.handle, - CUBLAS_OP_T, - dimN, - dimM, - &alpha, - A_d, - lda, - B_d, - incb, - &beta, - C_d, - incc); - } else if (HPPL_OP_T == trans) { - stat = CUBLAS_GEMV(t_resource.handle, - CUBLAS_OP_N, - dimN, - dimM, - &alpha, - A_d, - lda, - B_d, - incb, - &beta, - C_d, - incc); - } else { - LOG(FATAL) << "parameter transa error!"; - } - - CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat); - CHECK_SYNC("hl_matrix_mul_vector"); -} - -void hl_matrix_mul_vector(real *A_d, - hl_trans_op_t trans, - real *B_d, - real *C_d, - int dimM, - int dimN, - real alpha, - real beta) { - hl_matrix_mul_vector( - A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1); -} diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc deleted file mode 100644 index dfa935dcff9f7ae9f710d0f01a0217298d8cec04..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ /dev/null @@ -1,1117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "hl_cuda_cudnn.h" -#include -#include -#include "hl_cuda_cudnn.ph" -#include "hl_thread.ph" -#include "paddle/utils/DynamicLoader.h" -#include "paddle/utils/Logging.h" - -DEFINE_int32(cudnn_conv_workspace_limit_in_mb, - 4096, - "Specify cuDNN max workspace limit, in units MB, " - "4096MB=4GB by default."); - -namespace dynload { - -std::once_flag cudnn_dso_flag; -void* cudnn_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cudbnn routine - * via operator overloading: operator () - * - * note: default dynamic linked libs - **/ - -#ifdef PADDLE_USE_DSO - -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudnn_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ - -#else - -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ - -#endif - -/** - * include all needed cudnn functions in HPPL - * different cudnn version has different interfaces - **/ -// clang-format off -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor) \ - __macro(cudnnSetTensor4dDescriptorEx) \ - __macro(cudnnGetConvolutionNdForwardOutputDim) \ - __macro(cudnnGetConvolutionForwardAlgorithm) \ - __macro(cudnnCreateTensorDescriptor) \ - __macro(cudnnDestroyTensorDescriptor) \ - __macro(cudnnCreateFilterDescriptor) \ - __macro(cudnnSetFilter4dDescriptor) \ - __macro(cudnnSetPooling2dDescriptor) \ - __macro(cudnnDestroyFilterDescriptor) \ - __macro(cudnnCreateConvolutionDescriptor) \ - __macro(cudnnCreatePoolingDescriptor) \ - __macro(cudnnDestroyPoolingDescriptor) \ - __macro(cudnnSetConvolution2dDescriptor) \ - __macro(cudnnDestroyConvolutionDescriptor) \ - __macro(cudnnCreate) \ - __macro(cudnnDestroy) \ - __macro(cudnnSetStream) \ - __macro(cudnnActivationForward) \ - __macro(cudnnConvolutionForward) \ - __macro(cudnnConvolutionBackwardBias) \ - __macro(cudnnGetConvolutionForwardWorkspaceSize) \ - __macro(cudnnTransformTensor) \ - __macro(cudnnPoolingForward) \ - __macro(cudnnPoolingBackward) \ - __macro(cudnnSoftmaxBackward) \ - __macro(cudnnSoftmaxForward) \ - __macro(cudnnGetVersion) \ - __macro(cudnnGetErrorString) -CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) - -#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ - __macro(cudnnAddTensor) \ - __macro(cudnnConvolutionBackwardData) \ - __macro(cudnnConvolutionBackwardFilter) -CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP) - -// APIs available after R3: -#if CUDNN_VERSION >= 3000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ - __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm) \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm) \ - __macro(cudnnGetConvolutionBackwardDataWorkspaceSize) -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 -#endif - - -// APIs available after R4: -#if CUDNN_VERSION >= 4007 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ - __macro(cudnnBatchNormalizationForwardTraining) \ - __macro(cudnnBatchNormalizationForwardInference) \ - __macro(cudnnBatchNormalizationBackward) -CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 -#endif - -// APIs in R5 -#if CUDNN_VERSION >= 5000 -#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ - __macro(cudnnCreateActivationDescriptor) \ - __macro(cudnnSetActivationDescriptor) \ - __macro(cudnnGetActivationDescriptor) \ - __macro(cudnnDestroyActivationDescriptor) -CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_R5 -#endif - -#undef CUDNN_DNN_ROUTINE_EACH -// clang-format on -} /* namespace dynload */ - -/** - * Check build-in cudnn function using glog and it **does not** - * support << operator for more details error info. 
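With PADDLE_USE_DSO set, each DYNAMIC_LOAD_CUDNN_WRAP(name) above generates a small callable struct that resolves the symbol from the dlopen'ed libcudnn on first use. Roughly what the expansion for cudnnCreate looks like, simplified (the generated instance is named cudnnCreate inside namespace dynload):

// Simplified expansion of DYNAMIC_LOAD_CUDNN_WRAP(cudnnCreate).
struct DynLoad__cudnnCreate {
  template <typename... Args>
  auto operator()(Args... args) -> decltype(cudnnCreate(args...)) {
    using cudnn_func = decltype(cudnnCreate(args...)) (*)(Args...);
    std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle);
    void *p = dlsym(cudnn_dso_handle, "cudnnCreate");
    return reinterpret_cast<cudnn_func>(p)(args...);
  }
} cudnnCreate;  // callers invoke it as dynload::cudnnCreate(...)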
- */ -#define CHECK_CUDNN(cudnnFunc) \ - do { \ - cudnnStatus_t cudnnStat = cudnnFunc; \ - CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \ - << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \ - } while (0) - -bool g_is_libcudnn_init = false; -int g_cudnn_lib_version = 0; - -void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) { - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); -} - -void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) { - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - size_t cudnn_dso_major = cudnn_dso_ver / 1000; - size_t cudnn_cuh_major = CUDNN_VERSION / 1000; - - // Compare cudnn header version with that of cudnn.so. - CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) || - (cudnn_cuh_major == cudnn_dso_major)) - << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v" - << cudnn_cuh_major << " unmatched!\n" - << "PaddlePaddle Requirement: " - << "(header v[2-3] with libcudnn v[2-3]) Or " - << "(header v4 with libcudnn v4) Or " - << "(header v5 with libcudnn v5) Or" - << "(header v6 with libcudnn v6)."; - - CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050)) - << "cudnn v5 requires cuda version >= 7.5"; - - CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000)) - << "cudnn v6 requires cuda version >= 8.0"; - - CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle)); - CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream)); - - g_is_libcudnn_init = true; - g_cudnn_lib_version = cudnn_dso_ver; -} - -int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; } - -void hl_conv_workspace(hl_tensor_descriptor input, - hl_tensor_descriptor output, - hl_filter_descriptor filter, - hl_convolution_descriptor conv, - int* convFwdAlgo, - size_t* fwdLimitBytes, - int* convBwdDataAlgo, - size_t* bwdDataLimitBytes, - int* convBwdFilterAlgo, - size_t* bwdFilterLimitBytes, - bool useDilation) { -#if CUDNN_VERSION >= 4000 - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - - // Specify workspace limit directly - size_t memoryLimitBytes = - (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; - - // For dilation - int algo = 0; - - // cudnn convolution forward configuration - cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - // cudnn convolution backward data configuration - cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnConvolutionDescriptor_t bwd_data_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - // cudnn convolution backward filter configuration - cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnConvolutionDescriptor_t bwd_filter_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter); - - if (useDilation) { - convFwdAlgo = &algo; - convBwdDataAlgo = &algo; - convBwdFilterAlgo = &algo; - } else { - CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( - t_resource.cudnn_handle, - fwd_src_desc, - fwd_filter_desc, - 
fwd_conv_desc, - fwd_dest_desc, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convFwdAlgo))); - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdDataAlgo))); - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - t_resource.cudnn_handle, - bwd_filter_src_desc, - bwd_filter_diff_desc, - bwd_filter_conv_desc, - bwd_filter_grad_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdFilterAlgo))); - } - - CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( - t_resource.cudnn_handle, - fwd_src_desc, - fwd_filter_desc, - fwd_conv_desc, - fwd_dest_desc, - static_cast(*convFwdAlgo), - fwdLimitBytes)); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - static_cast(*convBwdDataAlgo), - bwdDataLimitBytes)); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - t_resource.cudnn_handle, - bwd_filter_src_desc, - bwd_filter_diff_desc, - bwd_filter_conv_desc, - bwd_filter_grad_desc, - static_cast(*convBwdFilterAlgo), - bwdFilterLimitBytes)); - -#endif -} - -void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, - int batch_size, - int feature_maps, - int height, - int width) { - CHECK_NOTNULL(image_desc); - - cudnn_tensor_descriptor hl_desc = - (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); - CHECK_NOTNULL(hl_desc); - -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); - - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc, - CUDNN_TENSOR_NCHW, - data_type, - batch_size, - feature_maps, - height, - width)); - - hl_desc->format = CUDNN_TENSOR_NCHW; - hl_desc->data_type = data_type; - hl_desc->batch_size = batch_size; - hl_desc->feature_maps = feature_maps; - hl_desc->height = height; - hl_desc->width = width; - - *image_desc = (hl_tensor_descriptor)hl_desc; -} - -void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { - CHECK_NOTNULL(image_desc); - - cudnn_tensor_descriptor hl_desc = - (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); - CHECK_NOTNULL(hl_desc); - -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); - - hl_desc->data_type = data_type; - - *image_desc = (hl_tensor_descriptor)hl_desc; -} - -void hl_tensor_reshape(hl_tensor_descriptor image_desc, - int batch_size, - int feature_maps, - int height, - int width) { - const int stride_w = 1; - const int stride_h = width * stride_w; - const int stride_c = height * stride_h; - const int stride_n = feature_maps * stride_c; - return hl_tensor_reshape(image_desc, - batch_size, - feature_maps, - height, - width, - stride_n, - stride_c, - stride_h, - stride_w); -} - -void hl_tensor_reshape(hl_tensor_descriptor image_desc, - int batch_size, - int feature_maps, - int height, - int width, - int nStride, - int cStride, - int hStride, - int wStride) { - CHECK_NOTNULL(image_desc); - 
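Note on the removed hl_conv_workspace() above: it selects each convolution algorithm under an explicit workspace cap using the pre-cuDNN-8 query API (cudnnGetConvolution*Algorithm followed by the matching *WorkspaceSize call). A minimal sketch of that forward-path pattern, calling cuDNN directly rather than through the dynload wrappers; the helper name is illustrative, and CHECK_CUDNN is the macro defined earlier in this file.

#include <cudnn.h>
// Sketch only (legacy cuDNN 4-7 API): pick the fastest forward algorithm that
// fits in `limit_mb` megabytes of workspace, then report its actual need.
static size_t PickConvFwdAlgo(cudnnHandle_t handle,
                              cudnnTensorDescriptor_t src_desc,
                              cudnnFilterDescriptor_t filter_desc,
                              cudnnConvolutionDescriptor_t conv_desc,
                              cudnnTensorDescriptor_t dest_desc,
                              size_t limit_mb,
                              cudnnConvolutionFwdAlgo_t* algo) {
  size_t limit_bytes = (1ULL << 20) * limit_mb;  // same MB -> bytes conversion as above
  CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(
      handle, src_desc, filter_desc, conv_desc, dest_desc,
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, limit_bytes, algo));
  size_t workspace_bytes = 0;
  CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
      handle, src_desc, filter_desc, conv_desc, dest_desc, *algo,
      &workspace_bytes));
  return workspace_bytes;  // caller allocates at most this much GPU scratch
}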
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; - CHECK_NOTNULL(hl_desc->desc); - - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc, - hl_desc->data_type, - batch_size, - feature_maps, - height, - width, - nStride, - cStride, - hStride, - wStride)); - - hl_desc->batch_size = batch_size; - hl_desc->feature_maps = feature_maps; - hl_desc->height = height; - hl_desc->width = width; -} - -void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) { - CHECK_NOTNULL(image_desc); - - cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; - CHECK_NOTNULL(hl_desc->desc); - - CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc)); - - hl_desc->desc = NULL; - - free(image_desc); -} - -void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, - hl_pooling_mode_t mode, - int height, - int width, - int height_padding, - int width_padding, - int stride_height, - int stride_width) { - cudnnPoolingMode_t cudnn_mode; - switch (mode) { - case HL_POOLING_MAX: - cudnn_mode = CUDNN_POOLING_MAX; - break; - case HL_POOLING_AVERAGE: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - break; - case HL_POOLING_AVERAGE_INCLUDE_PADDING: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - default: - LOG(FATAL) << "parameter mode error"; - } - - CHECK_NOTNULL(pooling_desc); - - cudnn_pooling_descriptor hl_pooling_desc = - (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor)); - CHECK_NOTNULL(hl_pooling_desc); - - CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc)); - - CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc, - cudnn_mode, -#if CUDNN_VERSION >= 5000 - CUDNN_PROPAGATE_NAN, -#endif - height, - width, - height_padding, - width_padding, - stride_height, - stride_width)); - - hl_pooling_desc->mode = cudnn_mode; - hl_pooling_desc->window_height = height; - hl_pooling_desc->window_width = width; - hl_pooling_desc->stride_height = stride_height; - hl_pooling_desc->stride_width = stride_width; - - *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; -} - -void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) { - CHECK_NOTNULL(pooling_desc); - - cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; - - CHECK_NOTNULL(hl_pooling->desc); - CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); - - hl_pooling->desc = NULL; - - free(pooling_desc); -} - -void hl_pooling_forward(hl_tensor_descriptor input, - real* input_image, - hl_tensor_descriptor output, - real* output_image, - hl_pooling_descriptor pooling) { - cudnnPoolingDescriptor_t pooling_desc; - cudnnTensorDescriptor_t input_desc; - cudnnTensorDescriptor_t output_desc; - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(pooling); - CHECK_NOTNULL(input_image); - CHECK_NOTNULL(output_image); - - real alpha = 1.0f; - real beta = 1.0f; - input_desc = ((cudnn_tensor_descriptor)input)->desc; - output_desc = ((cudnn_tensor_descriptor)output)->desc; - pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; - CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle, - pooling_desc, - &alpha, - input_desc, - input_image, - &beta, - output_desc, - output_image)); - CHECK_SYNC("hl_pooling_forward failed"); -} - -void hl_pooling_backward(hl_tensor_descriptor input, - real* input_image, - real* input_image_grad, - hl_tensor_descriptor output, - real* output_image, - real* output_image_grad, - hl_pooling_descriptor pooling) { 
- cudnnPoolingDescriptor_t pooling_desc; - cudnnTensorDescriptor_t input_desc; - cudnnTensorDescriptor_t output_desc; - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(pooling); - CHECK_NOTNULL(input_image); - CHECK_NOTNULL(input_image_grad); - CHECK_NOTNULL(output_image); - CHECK_NOTNULL(output_image_grad); - - real alpha = 1.0f; - real beta = 1.0f; - input_desc = ((cudnn_tensor_descriptor)input)->desc; - output_desc = ((cudnn_tensor_descriptor)output)->desc; - pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; - CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle, - pooling_desc, - &alpha, - output_desc, - output_image, - output_desc, - output_image_grad, - input_desc, - input_image, - &beta, - input_desc, - input_image_grad)); - CHECK_SYNC("hl_pooling_backward failed"); -} - -void hl_create_filter_descriptor(hl_filter_descriptor* filter, - int input_feature_maps, - int output_feature_maps, - int height, - int width) { - CHECK_NOTNULL(filter); - - cudnn_filter_descriptor hl_filter = - (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor)); - CHECK_NOTNULL(hl_filter); - - CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); - -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc, - data_type, -#if CUDNN_VERSION >= 5000 - CUDNN_TENSOR_NCHW, -#endif - output_feature_maps, - input_feature_maps, - height, - width)); - - hl_filter->data_type = data_type; - hl_filter->output_feature_maps = output_feature_maps; - hl_filter->input_feature_maps = input_feature_maps; - hl_filter->filter_height = height; - hl_filter->filter_width = width; - - *filter = (hl_filter_descriptor)hl_filter; -} - -void hl_destroy_filter_descriptor(hl_filter_descriptor filter) { - CHECK_NOTNULL(filter); - - cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; - CHECK_NOTNULL(hl_filter->desc); - - CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc)); - - hl_filter->desc = NULL; - - free(filter); -} - -void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, - hl_tensor_descriptor image, - hl_filter_descriptor filter, - int padding_height, - int padding_width, - int stride_height, - int stride_width, - int dilation_h, - int dilation_w) { - CHECK_NOTNULL(conv); - - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc( - sizeof(_cudnn_convolution_descriptor)); - - CHECK_NOTNULL(hl_conv); - CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); - - cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; - -#if CUDNN_VERSION >= 6000 -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, - padding_height, - padding_width, - stride_height, - stride_width, - dilation_h, - dilation_w, - mode, - data_type)); -#else - if (dilation_h > 1 || dilation_w > 1) { - LOG(FATAL) - << "Current cuDNN version does't support for dilation convolution. 
" - << "The dilation convolution requires cuDNN >= v6.0."; - } - - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, - padding_height, - padding_width, - stride_height, - stride_width, - dilation_h, - dilation_w, - mode)); -#endif - - hl_conv->input_image = image; - hl_conv->filter = filter; - hl_conv->padding_height = padding_height; - hl_conv->padding_width = padding_width; - hl_conv->stride_height = stride_height; - hl_conv->stride_width = stride_width; - hl_conv->upscalex = 1; - hl_conv->upscaley = 1; - hl_conv->mode = mode; - - *conv = (hl_convolution_descriptor)hl_conv; -} - -void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, - hl_tensor_descriptor image, - hl_filter_descriptor filter, - int padding_height, - int padding_width, - int stride_height, - int stride_width, - int dilation_h, - int dilation_w) { - CHECK_NOTNULL(conv); - CHECK_NOTNULL(image); - CHECK_NOTNULL(filter); - - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; - -#if CUDNN_VERSION >= 6000 -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc, - padding_height, - padding_width, - stride_height, - stride_width, - dilation_h, - dilation_w, - mode, - data_type)); -#else - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc, - padding_height, - padding_width, - stride_height, - stride_width, - dilation_h, - dilation_w, - mode)); -#endif - - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; - hl_conv->input_image = image; - hl_conv->filter = filter; - hl_conv->padding_height = padding_height; - hl_conv->padding_width = padding_width; - hl_conv->stride_height = stride_height; - hl_conv->stride_width = stride_width; - hl_conv->upscalex = 1; - hl_conv->upscaley = 1; - hl_conv->mode = mode; -} - -void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) { - CHECK_NOTNULL(conv); - - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; - CHECK_NOTNULL(hl_conv->desc); - - CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc)); - hl_conv->desc = NULL; - - free(conv); -} - -void hl_convolution_forward(hl_tensor_descriptor input, - real* input_data, - hl_tensor_descriptor output, - real* output_data, - hl_filter_descriptor filter, - real* filter_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convFwdAlgo) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - CHECK_NOTNULL(input_data); - CHECK_NOTNULL(output_data); - CHECK_NOTNULL(filter_data); - cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - real alpha = 1.0f; - real beta = 1.0f; - CHECK_CUDNN(dynload::cudnnConvolutionForward( - t_resource.cudnn_handle, - &alpha, - src_desc, - input_data, - filter_desc, - filter_data, - conv_desc, - static_cast(convFwdAlgo), - gpuWorkSpace, - sizeInBytes, - &beta, - dest_desc, - output_data)); - CHECK_SYNC("hl_convolution_forward failed"); -} - -void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, - real* bias_data, - hl_tensor_descriptor output, - real* output_data) 
{ - CHECK_NOTNULL(bias); - CHECK_NOTNULL(output); - CHECK_NOTNULL(bias_data); - CHECK_NOTNULL(output_data); - - cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); - real alpha = 1.0f; - real beta = 1.0f; - - CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle, -#if CUDNN_VERSION < 4000 - CUDNN_ADD_SAME_C, -#endif - &alpha, - bias_desc, - bias_data, - &beta, - output_desc, - output_data)); - CHECK_SYNC("hl_convolution_forward_add_bias failed"); -} - -void hl_convolution_backward_bias(hl_tensor_descriptor bias, - real* bias_grad_data, - hl_tensor_descriptor output, - real* output_grad_data) { - CHECK_NOTNULL(bias); - CHECK_NOTNULL(output); - CHECK_NOTNULL(bias_grad_data); - CHECK_NOTNULL(output_grad_data); - - real alpha = 1.0f; - real beta = 1.0f; - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); - CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle, - &alpha, - diff_desc, - output_grad_data, - &beta, - bias_desc, - bias_grad_data)); - CHECK_SYNC("hl_convolution_backward_bias failed"); -} - -void hl_convolution_backward_filter(hl_tensor_descriptor input, - real* input_data, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_grad_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdFilterAlgo) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - CHECK_NOTNULL(input_data); - CHECK_NOTNULL(output_grad_data); - CHECK_NOTNULL(filter_grad_data); - - real alpha = 1.0f; - real beta = 1.0f; - cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter); - - CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter( - t_resource.cudnn_handle, - &alpha, - src_desc, - input_data, - diff_desc, - output_grad_data, - conv_desc, -#if CUDNN_VERSION >= 4000 - static_cast(convBwdFilterAlgo), - gpuWorkSpace, - sizeInBytes, -#endif - &beta, - grad_desc, - filter_grad_data)); - CHECK_SYNC("hl_convolution_backward_filter failed"); -} - -void hl_convolution_backward_data(hl_tensor_descriptor input, - real* input_data_grad, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdDataAlgo) { - real alpha = 1.0f; - real beta = 1.0f; - cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnConvolutionBackwardData( - t_resource.cudnn_handle, - &alpha, - filter_desc, - filter_data, - diff_desc, - output_grad_data, - conv_desc, -#if CUDNN_VERSION >= 4000 - static_cast(convBwdDataAlgo), - gpuWorkSpace, - sizeInBytes, -#endif - &beta, - grad_desc, - input_data_grad)); - CHECK_SYNC("hl_convolution_backward_data failed"); -} - -void hl_softmax_forward(real* input, real* output, int height, int width) { -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else 
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, - CUDNN_TENSOR_NCHW, - data_type, - height, - width, - 1, - 1)); - - real alpha = 1.0f; - real beta = 0.0f; - CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - t_resource.cudnn_desc, - input, - &beta, - t_resource.cudnn_desc, - output)); - CHECK_SYNC("hl_softmax_forward failed"); -} - -void hl_softmax_backward(real* output_value, - real* output_grad, - int height, - int width) { -#ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; -#endif - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, - CUDNN_TENSOR_NCHW, - data_type, - height, - width, - 1, - 1)); - - real alpha = 1.0f; - real beta = 0.0f; - CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - t_resource.cudnn_desc, - output_value, - t_resource.cudnn_desc, - output_grad, - &beta, - t_resource.cudnn_desc, - output_grad)); - CHECK_SYNC("hl_softmax_backward failed"); -} - -void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real* input, - hl_tensor_descriptor outputDesc, - real* output, - hl_tensor_descriptor bnParamDesc, - real* scale, - real* bias, - double factor, - real* runningMean, - real* runningInvVar, - double epsilon, - real* savedMean, - real* savedVar) { -#if CUDNN_VERSION >= 4007 - if ((NULL != runningMean && NULL == runningInvVar) || - (NULL == runningMean && NULL != runningInvVar)) { - LOG(FATAL) << "runningMean and runningInvVar can be NULL " - << "but only at the same time."; - } - if ((NULL != savedMean && NULL == savedVar) || - (NULL == savedMean && NULL != savedVar)) { - LOG(FATAL) << "savedMean and savedVar can be NULL " - << "but only at the same time."; - } - - cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); - cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); - cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc); - real alpha = 1.0f; - real beta = 1.0f; - cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - CHECK_CUDNN( - dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle, - mode, - &alpha, - &beta, - xDesc, - input, - yDesc, - output, - bnDesc, - scale, - bias, - factor, - runningMean, - runningInvVar, - epsilon, - savedMean, - savedVar)); - - CHECK_SYNC("hl_batch_norm_forward_training failed"); -#else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. 
" - << "But cudnn lib version is " << g_cudnn_lib_version; -#endif -} - -void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real* input, - hl_tensor_descriptor outputDesc, - real* output, - hl_tensor_descriptor bnParamDesc, - real* scale, - real* bias, - real* estimatedMean, - real* estimatedInvVar, - double epsilon) { -#if CUDNN_VERSION >= 4007 - cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); - cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); - cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc); - real alpha = 1.0f; - real beta = 1.0f; - cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - - CHECK_CUDNN( - dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, - mode, - &alpha, - &beta, - xDesc, - input, - yDesc, - output, - bnDesc, - scale, - bias, - estimatedMean, - estimatedInvVar, - epsilon)); - - CHECK_SYNC("hl_batch_norm_forward_inference failed"); -#else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " - << "But cudnn lib version is " << g_cudnn_lib_version; -#endif -} - -void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real* input, - hl_tensor_descriptor outGradDesc, - real* outGrad, - hl_tensor_descriptor inGradDesc, - real* inGrad, - hl_tensor_descriptor dBnParamDesc, - real* scale, - real* scaleGrad, - real* biasGrad, - double epsilon, - real* savedMean, - real* savedInvVar) { -#if CUDNN_VERSION >= 4007 - if ((NULL != savedMean && NULL == savedInvVar) || - (NULL == savedMean && NULL != savedInvVar)) { - LOG(FATAL) << "savedMean and savedVar can be NULL " - << "but only at the same time."; - } - - cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); - cudnnTensorDescriptor_t dyDesc = GET_TENSOR_DESCRIPTOR(outGradDesc); - cudnnTensorDescriptor_t dxDesc = GET_TENSOR_DESCRIPTOR(inGradDesc); - cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(dBnParamDesc); - real alpha = 1.0f; - real beta = 1.0f; - cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle, - mode, - &alpha, - &beta, - &alpha, - &beta, - xDesc, - input, - dyDesc, - outGrad, - dxDesc, - inGrad, - bnDesc, - scale, - scaleGrad, - biasGrad, - epsilon, - savedMean, - savedInvVar)); - - CHECK_SYNC("hl_batch_norm_backward failed"); -#else - LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " - << "But cudnn lib version is " << g_cudnn_lib_version; -#endif -} diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc deleted file mode 100644 index 3025aa48523d67fe3d7ed03f44252d1211d2a46a..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_device.cc +++ /dev/null @@ -1,677 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// clang-format off -// Because clang-format 4.X and clang-format 3.8+ format -// following lines in different. So disable clang-format. 
-#include "hl_cuda.h" -#include -#include -#include -#include -#include -#include "hl_cuda.ph" -#include "hl_thread.ph" -#include "paddle/utils/Logging.h" -#include "paddle/utils/DynamicLoader.h" -// clang-format on - -namespace dynload { - -std::once_flag curand_dso_flag; -void *curand_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load curand routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ - std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \ - void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#else -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#endif - -/* include all needed curand functions in HPPL */ -// clang-format off -#define CURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(curandCreateGenerator) \ - __macro(curandSetStream) \ - __macro(curandSetPseudoRandomGeneratorSeed)\ - __macro(curandGenerateUniform) \ - __macro(curandGenerateUniformDouble) -// clang-format on - -CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) - -#undef CURAND_RAND_ROUTINE_EACH -#undef DYNAMIC_LOAD_CURAND_WRAP - -} /* namespace dynload */ - -/** - * @brief global resource. - */ -int g_system_device_num = 0; /* system device number */ -int device_num = 0; /* use device number */ -hl_device_prop *g_device; /* device info table */ -__thread thread_device_resources *t_device; /* device resources table */ -int g_cuda_lib_version = 0; - -/* number of global stream */ -#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1) -/* number of thread stream */ -#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1) -/* sizeof of device memory */ -#define HPPL_GPU_MEMORY_SIZE (256 * 4) - -/** - * Check build-in cuda function using glog and it **does not** - * support << operator for more details error info. - */ -#define CHECK_CUDA(cudaFunc) \ - do { \ - cudaError_t cudaStat = cudaFunc; \ - CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ - << cudaGetErrorString(cudaStat); \ - } while (0) - -/** - * @brief thread resource. - */ -__thread _hl_thread_resource t_resource = {{0}, /* stream */ - 0, /* handle */ - 0, /* gen */ - 0, /* cudnn_handle */ - 0, /* cudnn_desc */ - NULL, /* gen_mutex */ - NULL, /* gpu_mem */ - NULL, /* cpu_mem */ - 0, /* event */ - -1, /* device */ - 0, /* major */ - false}; /* is_init */ - -__thread cudaStream_t default_stream = 0; -__thread bool g_sync_flag = true; -bool hl_start_flag = false; - -inline pid_t gettid() { -#if defined(__APPLE__) || defined(__OSX__) - // syscall is deprecated: first deprecated in macOS 10.12. 
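The dynload namespace above wraps every curand entry point in a small functor that opens the shared library once (std::call_once) and resolves the symbol with dlsym on each call. A stripped-down sketch of that wrapper pattern for a single hypothetical function; the library and symbol names are placeholders, not real PaddlePaddle identifiers.

#include <dlfcn.h>
#include <mutex>

// Sketch of the PADDLE_USE_DSO branch above: lazy dlopen + per-call dlsym,
// exposed through operator() so call sites look like ordinary functions.
static std::once_flag mylib_dso_flag;
static void* mylib_dso_handle = nullptr;

static void GetMyLibDsoHandle() {
  mylib_dso_handle = dlopen("libmylib.so", RTLD_LAZY);  // hypothetical library
}

struct DynLoad__mylib_do_work {
  template <typename... Args>
  int operator()(Args... args) {
    typedef int (*FuncType)(Args...);
    std::call_once(mylib_dso_flag, GetMyLibDsoHandle);
    void* p = dlsym(mylib_dso_handle, "mylib_do_work");
    return reinterpret_cast<FuncType>(p)(args...);
  }
} mylib_do_work;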
- // syscall is unsupported; - // syscall pid_t tid = syscall(SYS_thread_selfid); - uint64_t tid; - pthread_threadid_np(NULL, &tid); -#else -#ifndef __NR_gettid -#define __NR_gettid 224 -#endif - pid_t tid = syscall(__NR_gettid); -#endif - CHECK_NE((int)tid, -1); - return tid; -} - -void hl_init(int device) { - CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed."; - - /* thread has been initialized */ - if (true == t_resource.is_init) { - hl_set_device(device); - return; - } - - /* create thread devcie resources */ - char *tmp; - thread_device_resources device_res; - tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) + - device_num * sizeof(_thread_device_resources)); - CHECK_NOTNULL(tmp); - t_device = (thread_device_resources *)tmp; - device_res = (thread_device_resources)( - (char *)tmp + g_system_device_num * sizeof(thread_device_resources *)); - memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *)); - - char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM * - sizeof(cudaStream_t)); - CHECK_NOTNULL(tmp_stream); - - int num = 0; - for (int dev = 0; dev < g_system_device_num; dev++) { - if (!g_device[dev]) { - continue; - } - - t_device[dev] = &device_res[num]; - t_device[dev]->stream = - (cudaStream_t *)(tmp_stream + - num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t)); - - hl_create_thread_resources(dev, t_device[dev]); - num++; - } - - hl_cudnn_desc_init(&t_resource.cudnn_desc); - - /* thread initialization is complete */ - t_resource.is_init = true; - /* set device */ - t_resource.device = -1; - hl_set_device(device); -} - -void hl_fini() { - if (false == t_resource.is_init) { - return; - } - - /* hppl stream fini */ - t_resource.device = -1; - for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) { - t_resource.stream[i] = 0; - } - - char *tmp = (char *)t_device; - char *tmp_stream = NULL; - for (int dev = 0; dev < g_system_device_num; dev++) { - if (!t_device[dev]) { - continue; - } - if (!tmp_stream) { - tmp_stream = (char *)t_device[dev]->stream; - } - for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { - CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j])); - } - - /* free device memory */ - hl_free_mem_device(t_device[dev]->gpu_mem); - hl_free_mem_host(t_device[dev]->cpu_mem); - CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event)); - } - - free(tmp); - free(tmp_stream); - t_resource.is_init = false; -} - -int hl_get_device_count() { return device_num; } - -void hl_set_device(int device) { - if (device == t_resource.device) { - return; - } - - CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device: " << device << " is not specified in startup."; - - CHECK_CUDA(cudaSetDevice(device)); - - /* switch thread stream */ - for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) { - t_resource.stream[i] = g_device[device]->device_resources->stream[i]; - } - - if (true == t_resource.is_init) { - for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) { - t_resource.stream[i] = - t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM]; - } - t_resource.gpu_mem = t_device[device]->gpu_mem; - t_resource.cpu_mem = t_device[device]->cpu_mem; - t_resource.event = t_device[device]->mem_event; - } - - t_resource.handle = g_device[device]->device_resources->handle; - t_resource.gen = g_device[device]->device_resources->gen; - t_resource.cudnn_handle = g_device[device]->device_resources->cudnn_handle; - t_resource.gen_mutex = g_device[device]->device_resources->gen_mutex; 
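hl_init() above performs lazy per-thread setup: a process-wide hl_start() must have run first, and a thread-local is_init flag turns repeated calls on the same thread into a cheap device switch. A compact sketch of that idiom using C++11 thread_local in place of __thread; the type and function names are illustrative.

#include <glog/logging.h>

// Sketch of the per-thread lazy-initialization idiom used by hl_init()/t_resource.
struct ThreadResource {
  bool is_init = false;
  int device = -1;
  // ... per-thread streams / scratch buffers would live here ...
};

static thread_local ThreadResource t_res;  // one instance per thread, like __thread t_resource
static bool g_started = false;             // set once by the process-wide start call

void EnsureThreadInit(int device) {
  CHECK(g_started) << "process-wide start must succeed before per-thread init";
  if (t_res.is_init) {  // already initialized on this thread: just switch device
    t_res.device = device;
    return;
  }
  // ... create per-thread streams and pinned/device scratch memory here ...
  t_res.is_init = true;
  t_res.device = device;
}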
- t_resource.device = device; - t_resource.major = g_device[device]->major; - default_stream = t_resource.stream[0]; -} - -int hl_get_device() { - int device; - CHECK_CUDA(cudaGetDevice(&device)); - return device; -} - -void *hl_malloc_device(size_t size) { - void *dest_d; - - CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(cudaMalloc((void **)&dest_d, size)); - - return dest_d; -} - -void hl_free_mem_device(void *dest_d) { - CHECK_NOTNULL(dest_d); - - cudaError_t err = cudaFree(dest_d); - CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) - << hl_get_device_error_string(); -} - -void *hl_malloc_host(size_t size) { - void *dest_h; - - CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); - - return dest_h; -} - -void hl_free_mem_host(void *dest_h) { - CHECK_NOTNULL(dest_h); - - cudaError_t err = cudaFreeHost(dest_h); - CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) - << hl_get_device_error_string(); -} - -void hl_memcpy(void *dst, void *src, size_t size) { - if (0 == size) { - return; - } - CHECK_NOTNULL(dst); - CHECK_NOTNULL(src); - CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); -} - -void hl_memset_device(void *dest_d, int value, size_t size) { - CHECK_CUDA(cudaMemset(dest_d, value, size)); -} - -void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { - if (0 == size) { - return; - } - CHECK_NOTNULL(src_h); - CHECK_NOTNULL(dest_d); - CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); -} - -void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { - if (0 == size) { - return; - } - CHECK_NOTNULL(dest_h); - CHECK_NOTNULL(src_d); - CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); -} - -void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { - if (0 == size) { - return; - } - CHECK_NOTNULL(dest_d); - CHECK_NOTNULL(src_d); - CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); -} - -void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { - cudaStream_t cu_stream; - - if (0 == size) { - return; - } - CHECK_NOTNULL(dst); - CHECK_NOTNULL(src); - CHECK_LT(stream, HPPL_STREAM_END); - cu_stream = t_resource.stream[stream]; - - CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); -} - -void hl_start() { - hl_specify_devices_start(NULL, 0); - /* set default device */ - hl_set_device(0); -} - -bool hl_device_can_access_peer(int device, int peerDevice) { - int canAccessPeer; - CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); - - if (canAccessPeer == 1) { - return true; - } else { - return false; - } -} - -void hl_device_enable_peer_access(int peerDevice) { - cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0); - if (cudaErrorPeerAccessAlreadyEnabled == err) { - cudaGetLastError(); - } else { - CHECK_CUDA(err); - } -} - -void hl_create_global_resources(hl_device_prop device_prop) { - struct cudaDeviceProp cu_prop; - int device = device_prop->device; - global_device_resources device_res = device_prop->device_resources; - - CHECK_CUDA(cudaSetDevice(device)); - /* device properties */ - CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device)); - - device_prop->major = cu_prop.major; - device_prop->minor = cu_prop.minor; - strncpy(device_prop->device_name, cu_prop.name, 256); - device_prop->device_mem = cu_prop.totalGlobalMem; - - /* create device stream */ - for 
(int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) { - CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); - } - - /* cublas init */ - hl_cublas_init(&device_res->handle, device_res->stream[0]); - - /* create curand gen */ - CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen, - CURAND_RNG_PSEUDO_DEFAULT), - CURAND_STATUS_SUCCESS) - << "[Start failed] Curand init failed."; - - CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]), - CURAND_STATUS_SUCCESS) - << "[Start failed] Curand set stream failed!"; - - /* create cudnn handle */ - hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]); - - int seed = gettid(); - CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen, - seed + device), - CURAND_STATUS_SUCCESS); - - device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); - pthread_mutex_init(device_res->gen_mutex, NULL); - - CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version)); -} - -int hl_get_cuda_version() { return g_cuda_lib_version; } - -void hl_create_thread_resources(int device, - thread_device_resources device_res) { - CHECK_CUDA(cudaSetDevice(device)); - - /* create thread stream */ - for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { - CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); - } - - /* allocation device memory */ - device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE); - - /* allocation host memory */ - device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); - - CHECK_CUDA(cudaEventCreate(&device_res->mem_event)); -} - -void hl_specify_devices_start(int *device, int number) { - if (hl_start_flag) return; - - /* 1. get the number of devices */ - CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num)); - CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device"; - if (device == NULL) { - number = g_system_device_num; - } - - /* 2. check device & create device property table */ - CHECK_LE(number, g_system_device_num) - << "[Start failed] System does not have enough device. " - << "Device number: " << g_system_device_num << "Input number: " << number; - - char *tmp; - hl_device_prop device_prop; - tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) + - number * sizeof(_hl_device_prop)); - CHECK(tmp) << "[Start failed] System memory is not enough."; - - g_device = (hl_device_prop *)tmp; - device_prop = (hl_device_prop)( - (char *)tmp + g_system_device_num * sizeof(hl_device_prop *)); - memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *)); - int num = 0; - for (int i = 0; i < number; i++) { - int dev; - if (device == NULL) { - dev = i; - } else { - dev = device[i]; - } - - CHECK_LT(dev, g_system_device_num) - << "[Start failed] The specified device number is " - << "out of range. Max device number: " << g_system_device_num - 1 - << " Specified devcie number: " << dev; - - if (g_device[dev]) { - /* Warning */ - LOG(WARNING) << "[Warning] Repeat specify device: " << dev; - continue; - } - - g_device[dev] = &device_prop[num]; - g_device[dev]->device = dev; - num++; - } - device_num = num; - - /* 3. 
create global device resources */ - char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources)); - CHECK_NOTNULL(tmp_res); - - char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM * - sizeof(cudaStream_t)); - CHECK_NOTNULL(tmp_stream); - - num = 0; - for (int i = 0; i < g_system_device_num; i++) { - if (!g_device[i]) { - continue; - } - - g_device[i]->device_resources = (global_device_resources)( - tmp_res + num * sizeof(_global_device_resources)); - g_device[i]->device_resources->stream = - (cudaStream_t *)(tmp_stream + - num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t)); - - hl_create_global_resources(g_device[i]); - num++; - } - - /* hl_start() is ok */ - hl_start_flag = true; - /* set default device */ - if (device == NULL) { - hl_set_device(0); - } else { - hl_set_device(device[0]); - } -} - -void hl_rand(real *dest_d, size_t num) { - pthread_mutex_lock(t_resource.gen_mutex); - CHECK_EQ( -#ifndef PADDLE_TYPE_DOUBLE - dynload::curandGenerateUniform(t_resource.gen, dest_d, num), -#else - dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), -#endif - CURAND_STATUS_SUCCESS); - pthread_mutex_unlock(t_resource.gen_mutex); - CHECK_SYNC("hl_rand failed"); -} - -void hl_srand(unsigned int seed) { - pthread_mutex_lock(t_resource.gen_mutex); - CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed), - CURAND_STATUS_SUCCESS); - pthread_mutex_unlock(t_resource.gen_mutex); -} - -void hl_set_sync_flag(bool flag) { g_sync_flag = flag; } - -bool hl_get_sync_flag() { return g_sync_flag; } - -void hl_stream_synchronize(hl_stream_t stream) { - cudaStream_t cu_stream; - - CHECK_LT(stream, HPPL_STREAM_END) << __func__ - << ": the parameter stream is error."; - - cu_stream = t_resource.stream[stream]; - CHECK_CUDA(cudaStreamSynchronize(cu_stream)); -} - -void hl_create_event(hl_event_t *event) { - CHECK_NOTNULL(event); - - struct _hl_event_st *st_event = - (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); - - CHECK_CUDA(cudaEventCreate(&st_event->cu_event)); - - *event = st_event; -} - -float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { - float time; - CHECK_NOTNULL(start); - CHECK_NOTNULL(end); - - CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); - return time; -} - -void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { - cudaStream_t cu_stream; - - CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) << __func__ - << ": the parameter stream is error."; - - cu_stream = t_resource.stream[stream]; - CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream)); -} - -void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { - cudaStream_t cu_stream; - - CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) << __func__ - << ": the parameter stream is error."; - - cu_stream = t_resource.stream[stream]; - CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); -} - -void hl_destroy_event(hl_event_t event) { - CHECK_NOTNULL(event); - CHECK_CUDA(cudaEventDestroy(event->cu_event)); - - free(event); - event = NULL; -} - -void hl_event_synchronize(hl_event_t event) { - CHECK_NOTNULL(event); - CHECK_CUDA(cudaEventSynchronize(event->cu_event)); -} - -void hl_get_device_name(char *name, int len, int device) { - CHECK_NOTNULL(name); - CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device(" << device << ") is not specified in startup."; - - strncpy(name, g_device[device]->device_name, len); -} - -void hl_get_device_memory(size_t *mem_size, 
int device) { - CHECK_NOTNULL(mem_size); - CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device(" << device << ") is not specified in startup."; - - *mem_size = g_device[device]->device_mem; -} - -void hl_get_device_compute_capability(int *major, int *minor, int device) { - CHECK_NOTNULL(major); - CHECK_NOTNULL(minor); - CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device(" << device << ") is not specified in startup."; - - *major = g_device[device]->major; - *minor = g_device[device]->minor; -} - -int hl_get_device_last_error() { return (int)cudaGetLastError(); } - -const char *hl_get_device_error_string() { - cudaError_t err = cudaGetLastError(); - return cudaGetErrorString(err); -} - -const char *hl_get_device_error_string(size_t err) { - return cudaGetErrorString((cudaError_t)err); -} - -void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); } -void hl_set_device_flags_block() { - CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); -} - -bool hl_cuda_event_is_ready(hl_event_t event) { - cudaError_t err = cudaEventQuery(event->cu_event); - CHECK(cudaSuccess == err || cudaErrorNotReady == err); - - if (cudaErrorNotReady == err) { - return false; - } - return true; -} - -void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); } - -void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); } diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu deleted file mode 100644 index b8c4e433a118fb1c5af753751f91c34543b1114c..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ /dev/null @@ -1,876 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "hl_activation_functions.h" -#include "hl_base.h" -#include "hl_cuda_cublas.h" -#include "hl_device_functions.cuh" -#include "paddle/utils/Logging.h" - -typedef hppl::Active::forward t_forward; -typedef hppl::Active::backward t_backward; - -bool hl_lstm_sequence_parallel(int frameSize) { - if (frameSize == 32 || frameSize == 64) { - return true; - } else { - return false; - } -} - -class frameValue { - public: - real *value_; - __device__ frameValue(real *value) : value_(value) {} - template - __device__ inline void init(int start, int length, int idx) { - if (reversed == 0) { - value_ += start * frameSize + idx; - } else { - value_ += (start + length - 1) * frameSize + idx; - } - } - __device__ inline real *getPtr() const { return value_; } - __device__ inline real getValue() { return *value_; } - __device__ inline void setValue(real value) { *value_ = value; } - template - __device__ inline void nextFrame() { - if (reversed == 0) { - value_ += frameSize; - } else { - value_ -= frameSize; - } - } -}; - -__device__ __forceinline__ void ptx_sync(const int id, const int barriers) { - asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); -} - -__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) { - asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); -} - -template -__device__ __forceinline__ real forward_sequence(real value, - real *shValue, - real *state, - real *preOutput, - real *output, - real check, - int index, - t_forward activeNode, - t_forward activeGate, - t_forward activeState) { - real out; - real prevOut; - real state_r; - const int idx = index % frameSize; - const int idy = index / frameSize; - // assert(index < valueSize); - - if (idy == 0) { - value = activeNode(value); - shValue[index] = value; - } - if (idy == 1 || idy == 2) { - state_r = state[idx]; - value += state_r * check; - value = activeGate(value); - shValue[index] = value; - } - ptx_sync(1, valueSize); - if (idy == 3) { - state_r = state[idx]; - state_r = state_r * shValue[idx + frameSize * 2]; - state_r += shValue[idx] * shValue[idx + frameSize]; - state[idx] = state_r; - ptx_arrive(2, frameSize * 2); - value += state_r * check; - value = activeGate(value); - shValue[index] = value; - ptx_sync(3, frameSize * 2); - prevOut = preOutput[idx]; - out = prevOut * value; - output[idx] = out; - } - if (idy == 0) { - ptx_sync(2, frameSize * 2); - prevOut = state[idx]; - prevOut = activeState(prevOut); - preOutput[idx] = prevOut; - ptx_arrive(3, frameSize * 2); - } - return value; -} - -#define OUTPUT_BARRIER_ID 10 -#define OUTPUT_BARRIER_ID2 11 -template -__global__ void KeLstmForward(real *gateValue, - real *state, - real *output, - real *preOutput, - real *checkIg, - real *checkFg, - real *checkOg, - real *weight, - const int *starts, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - __shared__ real shValue[valueSize]; - __shared__ real shState[frameSize]; - __shared__ real shPrevOutput[frameSize]; - __shared__ real shOutput[frameSize]; - - const int index = threadIdx.x; - int start = starts[blockIdx.x]; - int length = starts[blockIdx.x + 1] - start; - - /* init */ - real check; - real value; - frameValue frameGate(gateValue); - frameValue frameState(state); - frameValue frameOutput(output); - frameValue framePreOutput(preOutput); - if (index < valueSize) { - const int idx = index % frameSize; - const int idy = index / frameSize; - frameGate.init(start, length, index); - value = 
frameGate.getValue(); - if (idy == 0) { - shState[idx] = 0.0; - } else if (idy == 1) { - check = checkIg[idx]; - } else if (idy == 2) { - check = checkFg[idx]; - } else if (idy == 3) { - check = checkOg[idx]; - } - - if (idy == 3) { - frameState.init(start, length, idx); - frameOutput.init(start, length, idx); - framePreOutput.init(start, length, idx); - } - - ptx_sync(1, valueSize); - } - - for (int i = 0; i < length; ++i) { - if (index < valueSize) { - if (valueSize == 128) { - if (i != 0) { - ptx_sync(OUTPUT_BARRIER_ID2, blockSize); - value += shValue[index]; - } - } - value = forward_sequence( - value, - shValue, - shState, - shPrevOutput, - shOutput, - check, - index, - hppl::gpu::forward[active_node], - hppl::gpu::forward[active_gate], - hppl::gpu::forward[active_state]); - const int idx = index % frameSize; - const int idy = index / frameSize; - if (valueSize == 128) { - if (idy == 3) { - ptx_arrive(OUTPUT_BARRIER_ID, frameSize + 128); - } - } - if (valueSize == 256) { - ptx_sync(OUTPUT_BARRIER_ID, valueSize); - } - frameGate.setValue(value); - if (idy == 3) { - frameState.setValue(shState[idx]); - frameOutput.setValue(shOutput[idx]); - framePreOutput.setValue(shPrevOutput[idx]); - frameState.nextFrame(); - frameOutput.nextFrame(); - framePreOutput.nextFrame(); - } - if (i != length - 1) { - frameGate.nextFrame(); - value = frameGate.getValue(); - } - } - if (i != length - 1) { - if (valueSize == 128) { - if (valueSize <= index) { - real B_r[frameSize]; - const int computeIdx = index - valueSize; - if (i == 0) { -#pragma unroll - for (int n = 0; n < frameSize; n++) { - B_r[n] = weight[n * valueSize + computeIdx]; - } - } - ptx_sync(OUTPUT_BARRIER_ID, frameSize + 128); - real A_r[frameSize]; - for (int n = 0; n < frameSize; n++) { - A_r[n] = shOutput[n]; - } - real sum = 0.0f; - for (int n = 0; n < frameSize; n++) { - sum += A_r[n] * B_r[n]; - } - shValue[computeIdx] = sum; - ptx_arrive(OUTPUT_BARRIER_ID2, blockSize); - } - } - if (valueSize == 256) { - real B_r[frameSize]; - if (i == 0) { -#pragma unroll - for (int n = 0; n < frameSize; n++) { - B_r[n] = weight[n * valueSize + index]; - } - } - real sum = 0.0f; - for (int n = 0; n < frameSize; n++) { - sum += shOutput[n] * B_r[n]; - } - value += sum; - } - } - } -} - -void hl_lstm_parallel_forward(real *gateValue, - real *stateValue, - real *preOutputValue, - real *outputValue, - real *checkIg, - real *checkFg, - real *checkOg, - real *weight, - const int *sequence, - int frameSize, - int numSequences, - bool reversed, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - CHECK(frameSize == 32 || frameSize == 64); - dim3 grid(numSequences, 1); - if (!reversed) { - if (frameSize == 32) { - KeLstmForward<128, 32, 0, 128, 256><<>>( - gateValue, - stateValue, - outputValue, - preOutputValue, - checkIg, - checkFg, - checkOg, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 64) { - KeLstmForward<256, 64, 0, 256, 256><<>>( - gateValue, - stateValue, - outputValue, - preOutputValue, - checkIg, - checkFg, - checkOg, - weight, - sequence, - active_node, - active_gate, - active_state); - } - } else { - if (frameSize == 32) { - KeLstmForward<128, 32, 1, 128, 256><<>>( - gateValue, - stateValue, - outputValue, - preOutputValue, - checkIg, - checkFg, - checkOg, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 64) { - KeLstmForward<256, 64, 1, 256, 256><<>>( - gateValue, - stateValue, - outputValue, 
- preOutputValue, - checkIg, - checkFg, - checkOg, - weight, - sequence, - active_node, - active_gate, - active_state); - } - } - CHECK_SYNC("hl_lstm_parallel_forward failed"); -} - -__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { - const int warp_size = 32; - int addr = idx % warp_size; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, addr < warp_size); -#pragma unroll - for (int k = 1; k < 32; k++) { - // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32); - addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32); - a[k] = __shfl_sync(mask, a[k], addr, 32); - } - -#pragma unroll - for (int tid = 0; tid < 31; tid++) { - real tmp = (idx > tid) ? a[0] : a[1]; -#pragma unroll - for (int k = 31; k > 0; k--) { - a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32]; - } - a[1] = tmp; - } - - addr = (32 - idx) % 32; - CREATE_SHFL_MASK(mask, idx % 32 < warp_size); -#pragma unroll - for (int k = 0; k < 32; k++) { - a[k] = __shfl_sync(mask, a[k], addr, 32); - addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32); - } -} - -template -__device__ void backward_sequence(real rGateValue, - real rOutputGrad, - real rPreOutputValue, - real &rGateGrad, - real &rStateGrad, - real *shStateGrad, - real *shStateValue, - real *shGateValue, - real rCheck, - real &rGateValuePrev, - int index, - t_backward activeNode, - t_backward activeGate, - t_backward activeState) { - const int frameIdx = index % frameSize; - const int frameIdy = index / frameSize; - if (frameIdy == 3) { - real rPrevOutputGrad; - rPrevOutputGrad = rOutputGrad * rGateValue; - rStateGrad = activeState(rPrevOutputGrad, rPreOutputValue); - rGateGrad = rOutputGrad * rPreOutputValue; - rGateGrad = activeGate(rGateGrad, rGateValue); - rStateGrad += rGateGrad * rCheck; - shStateGrad[index] = rStateGrad; - ptx_arrive(3, valueSize); - } else if (frameIdy == 1) { - shGateValue[frameIdx + frameSize] = rGateValue; - rStateGrad = rGateGrad * rCheck; - shStateGrad[index] = rStateGrad; - ptx_sync(3, valueSize); - rStateGrad += shStateGrad[frameIdx + frameSize * 2]; - rStateGrad += shStateGrad[frameIdx + frameSize * 3]; - rGateGrad = rStateGrad * shGateValue[frameIdx]; - rGateGrad = activeGate(rGateGrad, rGateValue); - } else if (frameIdy == 2) { - rStateGrad = rStateGrad * rGateValuePrev; - rStateGrad += rGateGrad * rCheck; - shStateGrad[index] = rStateGrad; - ptx_sync(3, valueSize); - rStateGrad += shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize * 3]; - rGateValuePrev = rGateValue; - rGateGrad = rStateGrad * shStateValue[frameIdx]; - rGateGrad = activeGate(rGateGrad, rGateValue); - } else if (frameIdy == 0) { - shGateValue[frameIdx] = rGateValue; - ptx_sync(3, valueSize); - rStateGrad = shStateGrad[frameIdx + frameSize]; - rStateGrad += shStateGrad[frameIdx + frameSize * 2]; - rStateGrad += shStateGrad[frameIdx + frameSize * 3]; - rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize]; - rGateGrad = activeNode(rGateGrad, rGateValue); - } -} - -template -__device__ void load_weight(real rWeight[], real *weight, const int index) { - if (valueSize == 128) { - weight += index; -#pragma unroll - for (int n = 0; n < frameSize; n++) { - rWeight[n] = weight[n * valueSize]; - } - transpose_32x32(rWeight, index % 32); - } - if (valueSize == 256) { - int id = (index / 32) % 2; - weight += index - id * 32 + id * 32 * valueSize; -#pragma unroll - for (int n = 0; n < 32; n++) { - rWeight[n] = weight[n * valueSize]; - rWeight[n + 32] = weight[n * valueSize + 32]; - } - transpose_32x32(rWeight, index % 32); 
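The LSTM kernels above coordinate their gate and output thread groups with named PTX barriers rather than __syncthreads(), so producers can signal arrival without stalling. A sketch of those two primitives (the inline assembly is the same as in the deleted ptx_sync/ptx_arrive) together with how the kernels pair them:

// bar.sync %id, %count : wait at named barrier `id` until `count` threads arrive.
// bar.arrive %id, %count: register arrival at barrier `id` without waiting.
__device__ __forceinline__ void named_bar_sync(int id, int count) {
  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(count) : "memory");
}
__device__ __forceinline__ void named_bar_arrive(int id, int count) {
  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(count) : "memory");
}
// Pairing used above: the threads that produce a shared-memory value call
// named_bar_arrive(id, n) after the store, and the consumer threads call
// named_bar_sync(id, n) before the load, with n covering both groups.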
- transpose_32x32(&rWeight[32], index % 32); - } -} - -template -__global__ void KeLstmBackward(real *gateValue, - real *gateGrad, - real *stateValue, - real *stateGrad, /* do not need save */ - real *preOutputValue, - real *preOutputGrad, /* do not need save */ - real *checkIg, - real *checkIgGrad, - real *checkFg, - real *checkFgGrad, - real *checkOg, - real *checkOgGrad, - real *outputGrad, - real *weightValue, - const int *starts, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - __shared__ real shGateValue[valueSize]; - __shared__ real shStateGrad[valueSize]; - __shared__ real shStateValue[frameSize]; - __shared__ real shGateGrad[4][frameSize]; - __shared__ real shOutputGrad[4][frameSize]; - const int index = threadIdx.x; - int start = starts[blockIdx.x]; - int length = starts[blockIdx.x + 1] - start; - - const int frameIdx = index % frameSize; - const int frameIdy = index / frameSize; - real rCheck; - real rCheckGrad; - real rGateGrad; - real rStateGrad; - real rGateValuePrev; - real rPreOutputValue; - real rOutputGrad; - real rGateValue; - real rStateValue; - - frameValue frameGateValue(gateValue); - frameValue frameGateGrad(gateGrad); - frameValue framePreOutputValue(preOutputValue); - frameValue frameStateValue(stateValue); - frameValue frameOutputGrad(outputGrad); - if (frameIdy == 0) { - } else if (frameIdy == 1) { - rCheck = checkIg[frameIdx]; - } else if (frameIdy == 2) { - rCheck = checkFg[frameIdx]; - rGateValuePrev = 0.0; - rStateGrad = 0.0; - } else if (frameIdy == 3) { - rCheck = checkOg[frameIdx]; - framePreOutputValue.init(start, length, frameIdx); - frameOutputGrad.init(start, length, frameIdx); - rOutputGrad = frameOutputGrad.getValue(); - rPreOutputValue = framePreOutputValue.getValue(); - frameStateValue.init(start, length, frameIdx); - rStateValue = frameStateValue.getValue(); - } - - frameGateValue.init(start, length, index); - frameGateGrad.init(start, length, index); - rGateValue = frameGateValue.getValue(); - rGateGrad = 0.0; - rCheckGrad = 0.0; - - real B_r[frameSize]; - load_weight(B_r, weightValue, index); - - for (int i = 0; i < length; ++i) { - if (frameIdy == 3) { - if (i != length - 1) { - frameStateValue.nextFrame(); - shStateValue[frameIdx] = frameStateValue.getValue(); - } else { - shStateValue[frameIdx] = 0.0; - } - } - backward_sequence(rGateValue, - rOutputGrad, - rPreOutputValue, - rGateGrad, - rStateGrad, - shStateGrad, - shStateValue, - shGateValue, - rCheck, - rGateValuePrev, - index, - hppl::gpu::backward[active_node], - hppl::gpu::backward[active_gate], - hppl::gpu::backward[active_state]); - if (frameIdy == 3) { - rCheckGrad += rGateGrad * rStateValue; - rStateValue = shStateValue[frameIdx]; - } - - frameGateGrad.setValue(rGateGrad); - frameGateGrad.nextFrame(); - - if (i != length - 1) { - if (frameIdy == 3) { - framePreOutputValue.nextFrame(); - rPreOutputValue = framePreOutputValue.getValue(); - frameOutputGrad.nextFrame(); - rOutputGrad = frameOutputGrad.getValue(); - } else if (frameIdy == 2) { - rCheckGrad += rGateGrad * shStateValue[frameIdx]; - } else if (frameIdy == 1) { - rCheckGrad += rGateGrad * shStateValue[frameIdx]; - } - - frameGateValue.nextFrame(); - rGateValue = frameGateValue.getValue(); - shGateGrad[frameIdy][frameIdx] = rGateGrad; - if (valueSize == 128) { - real sum = 0.0f; -#pragma unroll - for (int n = 0; n < frameSize; n++) { - sum += shGateGrad[frameIdy][n] * B_r[n]; - } - if (frameIdy == 3) { - rOutputGrad += sum; - } else { - 
shOutputGrad[frameIdy][frameIdx] = sum; - } - } - if (valueSize == 256) { - ptx_sync(5, valueSize); - real A_r[frameSize]; - for (int n = 0; n < frameSize; n++) { - A_r[n] = shGateGrad[frameIdy][n]; - } - real sum = 0.0f; - for (int n = 0; n < frameSize; n++) { - sum += A_r[n] * B_r[n]; - } - if (frameIdy == 3) { - rOutputGrad += sum; - } else { - shOutputGrad[frameIdy][frameIdx] = sum; - } - } - - if (frameIdy == 3) { - ptx_sync(6, valueSize); -#pragma unroll - for (int i = 0; i < 3; i++) { - rOutputGrad += shOutputGrad[i][frameIdx]; - } - } else { - ptx_arrive(6, valueSize); - } - } - } - - /* TODO: Temporary save & merger in another kernel */ - if (frameIdy == 1) { - if (checkIgGrad) - paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad); - } else if (frameIdy == 2) { - if (checkFgGrad) - paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad); - } else if (frameIdy == 3) { - if (checkOgGrad) - paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad); - } -} - -void hl_lstm_parallel_backward_data(real *gateValue, - real *gateGrad, - real *stateValue, - real *stateGrad, - real *preOutputValue, - real *preOutputGrad, - real *outputGrad, - real *checkIg, - real *checkIgGrad, - real *checkFg, - real *checkFgGrad, - real *checkOg, - real *checkOgGrad, - real *weight, - const int *sequence, - int frameSize, - int numSequences, - bool reversed, - hl_activation_mode_t active_node, - hl_activation_mode_t active_gate, - hl_activation_mode_t active_state) { - CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 || - frameSize == 256); - dim3 grid(numSequences, 1); - if (!reversed) { - if (frameSize == 32) { - KeLstmBackward<128, 32, 0><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 64) { - KeLstmBackward<256, 64, 0><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 128) { - KeLstmBackward<512, 128, 0><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 0><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } - } else { - if (frameSize == 32) { - KeLstmBackward<128, 32, 1><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 64) { - KeLstmBackward<256, 64, 1><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 128) { - 
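/* Dispatch pattern: one block per sequence (grid = numSequences), the block
   covers all valueSize = 4 * frameSize gate elements of a step, and the third
   template argument carries the `reversed` flag (1 on this branch). */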
KeLstmBackward<512, 128, 1><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } else if (frameSize == 256) { - KeLstmBackward<1024, 256, 1><<>>( - gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - outputGrad, - weight, - sequence, - active_node, - active_gate, - active_state); - } - } - CHECK_SYNC("hl_lstm_parallel_backward_data"); -} - -template -__global__ void KeSetGradZero(real *gateGrad, - const int *starts, - int valueSize, - int numSequences, - bool reversed) { - // const int tid = threadIdx.x; - - const int frameIdx = blockIdx.x * B_X + threadIdx.x; - const int numSeqId = blockIdx.y * B_Y + threadIdx.y; - - if (numSeqId >= numSequences || frameIdx >= valueSize) return; - - if (!reversed) { - int seqId = starts[numSeqId]; - gateGrad[seqId * valueSize + frameIdx] = 0.0; - } else { - int seqId = starts[numSeqId + 1] - 1; - gateGrad[seqId * valueSize + frameIdx] = 0.0; - } -} - -void hl_lstm_parallel_backward_weight(real *weightGrad, - real *outputValue, - real *gateGrad, - const int *sequence, - int frameSize, - int batchSize, - int numSequences, - bool reversed) { - int valueSize = 4 * frameSize; - dim3 threads(32, 32); - dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); - KeSetGradZero<32, 32><<>>( - gateGrad, sequence, valueSize, numSequences, reversed); - - if (!reversed) { - hl_matrix_mul(outputValue, - HPPL_OP_T, - gateGrad + valueSize, - HPPL_OP_N, - weightGrad, - frameSize, - valueSize, - batchSize - 1, - 1.0, - 1.0); - } else { - hl_matrix_mul(outputValue + frameSize, - HPPL_OP_T, - gateGrad, - HPPL_OP_N, - weightGrad, - frameSize, - valueSize, - batchSize - 1, - 1.0, - 1.0); - } - CHECK_SYNC("hl_lstm_parallel_backward_weight"); -} diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu deleted file mode 100644 index 3e17c8090c5036037e936af1d6feaa2239251679..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ /dev/null @@ -1,806 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "hl_base.h" -#include "hl_device_functions.cuh" -#include "hl_gpu_matrix_kernel.cuh" -#include "hl_matrix.h" -#include "hl_matrix_apply.cuh" -#include "hl_matrix_ops.cuh" -#include "hl_sequence.h" -#include "hl_sparse.ph" -#include "paddle/utils/Logging.h" - -DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b); -void hl_matrix_add(real* A_d, - real* B_d, - real* C_d, - int dimM, - int dimN, - real alpha, - real beta) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - - hl_gpu_apply_ternary_op, 0, 0>( - ternary::_add(alpha, beta), - A_d, - B_d, - C_d, - dimM, - dimN, - dimN, - dimN, - dimN); - CHECK_SYNC("hl_matrix_add failed"); -} - -#ifdef PADDLE_TYPE_DOUBLE -#define THRESHOLD 128 -#else -#define THRESHOLD 64 -#endif -__device__ __forceinline__ void findMax(real* I, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN, - real* max) { - dfMax_s[base] = -1.0e20; - while (curIdx < dimN) { - if (dfMax_s[base] < I[nextIdx]) { - dfMax_s[base] = I[nextIdx]; - } - nextIdx += blockSize; - curIdx += blockSize; - } - __syncthreads(); - - for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { - __syncthreads(); - if (base < stride) { - nextIdx = base + stride; - if (dfMax_s[base] < dfMax_s[nextIdx]) { - dfMax_s[base] = dfMax_s[nextIdx]; - } - } - } - - if (0 == base) { - max[0] = dfMax_s[0]; - } - __syncthreads(); -} - -__device__ __forceinline__ void subMaxAndExp(real* I, - real* O, - int curIdx, - int nextIdx, - int blockSize, - int dimN, - real max) { - real val; - while (curIdx < dimN) { - val = I[nextIdx] - max; - if (val < -THRESHOLD) { - val = -THRESHOLD; - } - I[nextIdx] = val; -#ifndef PADDLE_TYPE_DOUBLE - O[nextIdx] = __expf(val); -#else - O[nextIdx] = exp(val); -#endif - nextIdx += blockSize; - curIdx += blockSize; - } - __syncthreads(); -} - -__device__ __forceinline__ void valueSum(real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { - dfMax_s[base] = 0; - while (curIdx < dimN) { - dfMax_s[base] += O[nextIdx]; - nextIdx += blockSize; - curIdx += blockSize; - } - __syncthreads(); - - for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { - __syncthreads(); - if (base < stride) { - nextIdx = base + stride; - dfMax_s[base] += dfMax_s[nextIdx]; - } - } - __syncthreads(); -} - -__device__ __forceinline__ void divSum( - real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) { - while (curIdx < dimN) { - O[nextIdx] /= sum; - nextIdx += blockSize; - curIdx += blockSize; - } -} - -__device__ __forceinline__ void softmax(real* I, - real* O, - real* dfMax_s, - int blockSize, - int base, - int curIdx, - int nextIdx, - int dimN) { - __shared__ real max; - - // find the max number - findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max); - - // sub max Value and do Exp operation - subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); - - // add dimN values into blockDim.x buffer - // sum is in dfMax_s[0] - valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); - - // divided by sum - divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); -} - -template -__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) { - int base = threadIdx.x; - __shared__ real dfMax_s[blockSize]; - int nextIdx = blockIdx.x * dimN + base; - int curIdx = base; - - softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); -} - -void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) { - 
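/* The device helpers above implement the standard numerically stable softmax:
   find the row max, subtract it and exponentiate (clamping at -THRESHOLD),
   block-reduce the sum, then normalize. The host wrapper launches one
   512-thread block per row (see the grid/block setup below). A minimal
   single-threaded reference of the same math (illustrative only; needs
   <cmath> and <algorithm>):

     void softmax_row_reference(const float* in, float* out, int n) {
       float mx = *std::max_element(in, in + n);
       float sum = 0.f;
       for (int i = 0; i < n; ++i) {
         float v = std::max(in[i] - mx, -64.f);  // -THRESHOLD for float builds
         out[i] = std::exp(v);
         sum += out[i];
       }
       for (int i = 0; i < n; ++i) out[i] /= sum;
     }
*/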
CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - dim3 block(512, 1); - dim3 grid(dimM, 1); - KeMatrixSoftMax<512><<>>(C_d, A_d, dimN); - CHECK_SYNC("hl_matrix_softmax failed"); -} - -template -__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) { - int base = threadIdx.x; - int bid = blockIdx.x; - __shared__ real dfMax_s[blockSize]; - - int start = index[bid]; - int dimN = index[bid + 1] - start; - - int nextIdx = start + base; - int curIdx = base; - - softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); -} - -void hl_sequence_softmax_forward(real* A_d, - real* C_d, - const int* index, - int numSequence) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - dim3 block(512, 1); - dim3 grid(numSequence, 1); - KeSequenceSoftMax<512><<>>(C_d, A_d, index); - CHECK_SYNC("hl_sequence_softmax_forward failed"); -} - -__global__ void KeMatrixDerivative( - real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int colIdx = blockIdx.y * blockDim.y + threadIdx.y; - int index; - - if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx * dimN + colIdx; - grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); - } -} - -void hl_matrix_softmax_derivative( - real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { - CHECK_NOTNULL(grad_d); - CHECK_NOTNULL(output_d); - CHECK_NOTNULL(sftmaxSum_d); - - int blocksX = (dimM + 0) / 1; - int blocksY = (dimN + 1024 - 1) / 1024; - dim3 threads(1, 1024); - dim3 grid(blocksX, blocksY); - - KeMatrixDerivative<<>>( - grad_d, output_d, sftmaxSum_d, dimM, dimN); - CHECK_SYNC("hl_matrix_softmax_derivative failed"); -} - -__global__ void KeMatrixMultiBinaryCrossEntropy( - real* output, real* entropy, int* row, int* col, int dimM, int dimN) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < dimM) { - for (int i = 0; i < dimN; i++) { - entropy[index] -= log(1 - output[index * dimN + i]); - } - int* row_col = col + row[index]; - int col_num = row[index + 1] - row[index]; - for (int i = 0; i < col_num; i++) { - real o = output[index * dimN + row_col[i]]; - entropy[index] -= log(o / (1 - o)); - } - } -} - -void hl_matrix_multi_binary_cross_entropy(real* output, - real* entropy, - hl_sparse_matrix_s csr_mat, - int dimM, - int dimN) { - CHECK_NOTNULL(output); - CHECK_NOTNULL(entropy); - CHECK_NOTNULL(csr_mat); - CHECK_EQ(csr_mat->format, HL_SPARSE_CSR); - int n_threads = 1024; - int blocks = (dimM + n_threads - 1) / n_threads; - dim3 threads(n_threads); - dim3 grid(blocks); - hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropy<<>>( - output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); - CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); -} - -__global__ void KeMatrixMultiBinaryCrossEntropyBp( - real* output, real* grad, int* row, int* col, int dimM, int dimN) { - int row_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (row_idx < dimM) { - for (int i = 0; i < dimN; i++) { - int index = row_idx * dimN + i; - grad[index] += 1.0 / (1 - output[index]); - } - int col_num = row[row_idx + 1] - row[row_idx]; - int* row_col = col + row[row_idx]; - for (int i = 0; i < col_num; i++) { - int index = row_idx * dimN + row_col[i]; - grad[index] -= 1.0 / (output[index] * (1 - output[index])); - } - } -} - -void hl_matrix_multi_binary_cross_entropy_bp( - real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) { - CHECK_NOTNULL(output); - CHECK_NOTNULL(grad); - CHECK_NOTNULL(csr_mat); - 
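/* Multi-binary cross entropy with sparse positive labels: the forward kernel
   first charges -log(1 - o) for every column as if all labels were negative,
   then corrects the columns listed in the CSR row by -log(o / (1 - o)), which
   together yields -log(o) for positives and -log(1 - o) for negatives; this
   backward pass applies the matching derivatives. Per-row reference of the
   forward loss (illustrative only; needs <cmath>):

     // cols/ncols: positive-label column indices of this row (a CSR row slice)
     float multi_binary_ce_row(const float* o, int dimN,
                               const int* cols, int ncols) {
       float e = 0.f;
       for (int j = 0; j < dimN; ++j) e -= std::log(1.f - o[j]);
       for (int k = 0; k < ncols; ++k)
         e -= std::log(o[cols[k]] / (1.f - o[cols[k]]));
       return e;
     }
*/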
CHECK_EQ(csr_mat->format, HL_SPARSE_CSR); - int n_threads = 1024; - int blocks = (dimM + n_threads - 1) / n_threads; - dim3 threads(n_threads); - dim3 grid(blocks); - hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); - KeMatrixMultiBinaryCrossEntropyBp<<>>( - output, grad, mat->csr_row, mat->csr_col, dimM, dimN); - CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); -} - -__global__ void KeMatrixCrossEntropy( - real* O, real* E, int* label, int dimM, int dimN) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int newBase; - if (index < dimM) { - newBase = label[index]; - newBase = newBase % dimN; - E[index] = -log(O[index * dimN + newBase]); - } -} - -void hl_matrix_cross_entropy( - real* A_d, real* C_d, int* label_d, int dimM, int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - - int blocks = (dimM + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - KeMatrixCrossEntropy<<>>( - A_d, C_d, label_d, dimM, dimN); - CHECK_SYNC("hl_matrix_cross_entropy failed"); -} - -__global__ void KeMatrixCrossEntropyBp( - real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { - int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; - int colIdx = blockIdx.y * blockDim.y + threadIdx.y; - int index; - if (rowIdx < dimM && colIdx < dimN) { - index = rowIdx * dimN + colIdx; - if (label_d[rowIdx] == colIdx) { - grad_d[index] -= 1.0f / output_d[index]; - } - } -} - -void hl_matrix_cross_entropy_bp( - real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { - CHECK_NOTNULL(grad_d); - CHECK_NOTNULL(output_d); - CHECK_NOTNULL(label_d); - - int blocksX = (dimM + 0) / 1; - int blocksY = (dimN + 1024 - 1) / 1024; - dim3 threads(1, 1024); - dim3 grid(blocksX, blocksY); - KeMatrixCrossEntropyBp<<>>( - grad_d, output_d, label_d, dimM, dimN); - CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); -} - -void hl_matrix_zero_mem(real* data, int num) { - hl_gpu_apply_unary_op(unary::Zero(), data, 1, num, num); -} - -__global__ void KeParamReluForward(real* output, - real* input, - real* w, - int width, - int height, - int partial_sum) { - int tx = blockIdx.x * blockDim.x + threadIdx.x; - int ty = blockIdx.y * blockDim.y + threadIdx.y; - if (tx < width && ty < height) { - int index = ty * width + tx; - output[index] = - input[index] > 0 ? 
input[index] : input[index] * w[tx / partial_sum]; - } -} - -void hl_param_relu_forward(real* output, - real* input, - real* w, - int width, - int height, - int partial_sum) { - CHECK_NOTNULL(output); - CHECK_NOTNULL(input); - CHECK_NOTNULL(w); - dim3 threads(16, 16); - int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 - 1) / 16; - dim3 grid(blockX, blockY); - KeParamReluForward<<>>( - output, input, w, width, height, partial_sum); - CHECK_SYNC("hl_param_relu_forward failed"); -} - -template -__global__ void KeParamReluBackWardW(real* grad_w, - real* grad_o, - real* input, - int width, - int height, - int partial_sum) { - const int tid = threadIdx.x; - __shared__ real temp[blockSize]; - grad_o += partial_sum * blockIdx.x; - input += partial_sum * blockIdx.x; - real tmp = 0.0; - for (int index = tid; index < partial_sum * height; index += blockSize) { - int row = index / partial_sum; - int offset = row * width + (index - row * partial_sum); - if (input[offset] < 0) { - tmp += grad_o[offset] * input[offset]; - } - } - temp[tid] = tmp; - __syncthreads(); - for (int s = blockSize / 2; s > 0; s >>= 1) { - if (tid < s) { - temp[tid] += temp[tid + s]; - } - __syncthreads(); - } - if (tid == 0) { - grad_w[blockIdx.x] += temp[0]; - } -} - -void hl_param_relu_backward_w(real* grad_w, - real* grad_o, - real* input, - int width, - int height, - int partial_sum) { - CHECK_NOTNULL(grad_w); - CHECK_NOTNULL(grad_o); - CHECK_NOTNULL(input); - const int blockSize = 1024; - int grid_num = width / partial_sum; - dim3 threads(blockSize, 1); - dim3 grid(grid_num, 1); - KeParamReluBackWardW<<>>( - grad_w, grad_o, input, width, height, partial_sum); - CHECK_SYNC("hl_param_relu_backward_w failed"); -} - -__global__ void KeParamReluBackwardDiff(real* grad_o, - real* input, - real* w, - real* diff, - int width, - int height, - int partial_sum) { - int tx = blockIdx.x * blockDim.x + threadIdx.x; - int ty = blockIdx.y * blockDim.y + threadIdx.y; - if (tx < width && ty < height) { - int index = ty * width + tx; - diff[index] += grad_o[index] * (input[index] > 0 ? 
1 : w[tx / partial_sum]); - } -} - -void hl_param_relu_backward_diff(real* grad_o, - real* data, - real* w, - real* diff, - int width, - int height, - int partial_sum) { - CHECK_NOTNULL(grad_o); - CHECK_NOTNULL(data); - CHECK_NOTNULL(w); - CHECK_NOTNULL(diff); - dim3 threads(16, 16); - int blockX = (width + 16 - 1) / 16; - int blockY = (height + 16 - 1) / 16; - dim3 grid(blockX, blockY); - KeParamReluBackwardDiff<<>>( - grad_o, data, w, diff, width, height, partial_sum); - CHECK_SYNC("hl_param_relu_backward_diff failed"); -} - -__global__ void KeMatrixAddSharedBias( - real* A, real* B, const int channel, const int M, const int N, real scale) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int dim = N / channel; - if (index < M * N) { - int i = index % N; - i = i / dim; - A[index] += scale * B[i]; - } -} - -void hl_matrix_add_shared_bias(real* A_d, - real* B_d, - const int channel, - const int dimM, - const int dimN, - real scale) { - const int blocks = 512; - const int grids = DIVUP(dimM * dimN, blocks); - KeMatrixAddSharedBias<<>>( - A_d, B_d, channel, dimM, dimN, scale); - CHECK_SYNC("hl_matrix_add_shared_bias failed"); -} - -template -__global__ void KeMatrixCollectSharedBias(real* B, - real* A, - const int channel, - const int M, - const int N, - const int dim, - const int limit, - real scale) { - if (dim < limit) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < channel) { - real sum = 0.0; - for (int i = 0; i < M; ++i) { - for (int j = 0; j < dim; ++j) { - sum += A[i * N + index * dim + j]; - } - } - B[index] += scale * sum; - } - } else { - const int tid = threadIdx.x; - const int bid = blockIdx.x; - __shared__ real smem[blockSize]; - real sum = 0.0; - for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) { - int n = j * blockSize + tid; - int m = n / dim; - int w = n % dim; - smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; - __syncthreads(); - simpleReduce(smem, tid, blockSize); - sum += smem[0]; - } - if (tid == 0) { - B[bid] += scale * sum; - } - } -} - -void hl_matrix_collect_shared_bias(real* B_d, - real* A_d, - const int channel, - const int dimM, - const int dimN, - real scale) { - const int dim = dimN / channel; - const int blocks = 256; - const int limit = 64; - int grids = (dimM * dim) < limit ? 
DIVUP(channel, blocks) : channel; - - KeMatrixCollectSharedBias<<>>( - B_d, A_d, channel, dimM, dimN, dim, limit, scale); - CHECK_SYNC("hl_matrix_collect_shared_bias failed"); -} - -__global__ void keMatrixRotate( - real* mat, real* matRot, int dimM, int dimN, bool clockWise) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < dimM * dimN) { - int i = idx / dimN; - int j = idx % dimN; - if (clockWise) { - matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; - } else { - matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; - } - } -} - -void hl_matrix_rotate( - real* mat, real* matRot, int dimM, int dimN, bool clockWise) { - CHECK_NOTNULL(mat); - CHECK_NOTNULL(matRot); - const int threads = 512; - const int blocks = DIVUP(dimM * dimN, threads); - keMatrixRotate<<>>( - mat, matRot, dimM, dimN, clockWise); - CHECK_SYNC("hl_matrix_rotate failed"); -} - -__global__ void keMatrixVol2Col(int num_kernels, - const real* dataSrc, - real* dataDst, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - int depth_col, - int height_col, - int width_col) { - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; - index += blockDim.x * gridDim.x) { - int w_out = index % width_col; - int h_out = (index / width_col) % height_col; - int d_out = (index / width_col / height_col) % depth_col; - int channel_in = index / width_col / height_col / depth_col; - int channel_out = channel_in * filterD * filterH * filterW; - int w_in = w_out * strideW - paddingW; - int h_in = h_out * strideH - paddingH; - int d_in = d_out * strideD - paddingD; - - dataDst += - ((channel_out * depth_col + d_out) * height_col + h_out) * width_col + - w_out; - dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in; - for (int k = 0; k < filterD; ++k) { - for (int i = 0; i < filterH; ++i) { - for (int j = 0; j < filterW; ++j) { - int d = d_in + k; - int h = h_in + i; - int w = w_in + j; - *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && - w < width) - ? 
dataSrc[(k * height + i) * width + j] - : 0; - dataDst += depth_col * height_col * width_col; - } - } - } - } -} - -void hl_matrix_vol2Col(const real* dataSrc, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real* dataDst) { - int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1; - int height_col = (height + 2 * paddingH - filterH) / strideH + 1; - int width_col = (width + 2 * paddingW - filterW) / strideW + 1; - int num_kernels = channels * depth_col * height_col * width_col; - - const int threads = 512; - const int blocks = DIVUP(num_kernels, threads); - - keMatrixVol2Col<<>>(num_kernels, - dataSrc, - dataDst, - depth, - height, - width, - filterD, - filterH, - filterW, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - depth_col, - height_col, - width_col); - CHECK_SYNC("hl_matrix_vol2Col failed"); -} - -__global__ void keMatrixCol2Vol(int num_kernels, - real* dataDst, - const real* dataSrc, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - int depth_col, - int height_col, - int width_col, - real alpha, - real beta) { - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; - index += blockDim.x * gridDim.x) { - real srcVal = 0; - real dstVal = dataDst[index]; - int w = index % width + paddingW; - int h = (index / width) % height + paddingH; - int d = (index / width / height) % depth + paddingD; - int c = index / width / height / depth; - // compute the start and end of the output - int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1; - int w_col_end = min(w / strideW + 1, width_col); - int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1; - int h_col_end = min(h / strideH + 1, height_col); - int d_col_start = (d < filterD) ? 
0 : (d - filterD) / strideD + 1; - int d_col_end = min(d / strideD + 1, depth_col); - - int offset = (c * filterD * filterW * filterH + d * filterW * filterH + - h * filterW + w) * - depth_col * height_col * width_col; - - int coeff_d_col = - (1 - strideD * filterW * filterH * depth_col) * height_col * width_col; - int coeff_h_col = - (1 - strideH * filterW * depth_col * height_col) * width_col; - int coeff_w_col = (1 - strideW * depth_col * height_col * width_col); - - for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col + - w_col * coeff_w_col]; - } - } - } - dataDst[index] = alpha * srcVal + beta * dstVal; - } -} - -void hl_matrix_col2Vol(real* dataDst, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - const real* dataSrc, - real alpha, - real beta) { - int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1; - int height_col = (height + 2 * paddingH - filterH) / strideH + 1; - int width_col = (width + 2 * paddingW - filterW) / strideW + 1; - int num_kernels = channels * depth * height * width; - - const int threads = 512; - const int blocks = DIVUP(num_kernels, threads); - - keMatrixCol2Vol<<>>(num_kernels, - dataDst, - dataSrc, - depth, - height, - width, - filterD, - filterH, - filterW, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - depth_col, - height_col, - width_col, - alpha, - beta); - - CHECK_SYNC("hl_matrix_col2Vol failed"); -} - -__global__ void keVectorCast2Int(int* out, real* vec, int size) { - for (int i = threadIdx.x; i < (size); i += blockDim.x) { - out[i] = int(vec[i]); - } -} - -void hl_vector_cast2int(int* out, real* vec, int size) { - keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size); - CHECK_SYNC("hl_vector_cast2int failed"); -} diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu deleted file mode 100644 index a3a5f038de7c0a68ee2e387d83b2272907164e90..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ /dev/null @@ -1,408 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "hl_base.h" -#include "hl_device_functions.cuh" -#include "paddle/utils/Logging.h" - -__global__ void KeMaxSequenceForward(real* input, - const int* sequence, - real* output, - int* index, - int numSequences, - int dim) { - int dimIdx = threadIdx.x; - int sequenceId = blockIdx.x; - if (sequenceId >= numSequences) return; - int start = sequence[sequenceId]; - int end = sequence[sequenceId + 1]; - - for (int i = dimIdx; i < dim; i += blockDim.x) { - real tmp = -HL_FLOAT_MAX; - int tmpId = -1; - for (int insId = start; insId < end; insId++) { - if (tmp < input[insId * dim + i]) { - tmp = input[insId * dim + i]; - tmpId = insId; - } - } - output[sequenceId * dim + i] = tmp; - index[sequenceId * dim + i] = tmpId; - } -} - -void hl_max_sequence_forward(real* input, - const int* sequence, - real* output, - int* index, - int numSequences, - int dim) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(sequence); - CHECK_NOTNULL(output); - CHECK_NOTNULL(index); - - dim3 threads(256, 1); - dim3 grid(numSequences, 1); - KeMaxSequenceForward<<>>( - input, sequence, output, index, numSequences, dim); - CHECK_SYNC("hl_max_sequence_forward failed"); -} - -__global__ void KeMaxSequenceBackward( - real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int colIdx = idx % dim; - if (idx < numSequences * dim) { - int insId = index[idx]; - inputGrad[insId * dim + colIdx] += outputGrad[idx]; - } -} - -void hl_max_sequence_backward( - real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { - CHECK_NOTNULL(outputGrad); - CHECK_NOTNULL(index); - CHECK_NOTNULL(inputGrad); - - unsigned int blocks = (numSequences * dim + 128 - 1) / 128; - dim3 threads(128, 1); - dim3 grid(blocks, 1); - KeMaxSequenceBackward<<>>( - outputGrad, index, inputGrad, numSequences, dim); - CHECK_SYNC("hl_max_sequence_backward failed"); -} - -template -__global__ void KeMatrixAddRows(real* output, - real* table, - int* ids, - int numSamples, - int tableSize, - int dim) { - int idx = threadIdx.x; - int idy = threadIdx.y; - int sampleId = blockIdx.x + idy * gridDimX; - - while (sampleId < numSamples) { - int tableId = ids[sampleId]; - if ((0 <= tableId) && (tableId < tableSize)) { - real* outputData = output + sampleId * dim; - real* tableData = table + tableId * dim; - for (int i = idx; i < dim; i += blockDimX) { - if (AddRow == 0) { - outputData[i] += tableData[i]; - } else { - paddle::paddleAtomicAdd(&tableData[i], outputData[i]); - } - } - } - sampleId += blockDimY * gridDimX; - } -} - -template -__global__ void KeSequence2Batch(real* batch, - real* sequence, - const int* batchIndex, - int seqWidth, - int batchCount) { - int idx = threadIdx.x; - int idy = threadIdx.y; - int id = blockIdx.x + idy * gridDimX; - while (id < batchCount) { - int seqId = batchIndex[id]; - real* batchData = batch + id * seqWidth; - real* seqData = sequence + seqId * seqWidth; - for (int i = idx; i < seqWidth; i += blockDimX) { - if (seq2batch) { - if (isAdd) { - batchData[i] += seqData[i]; - } else { - batchData[i] = seqData[i]; - } - } else { - if (isAdd) { - seqData[i] += batchData[i]; - } else { - seqData[i] = batchData[i]; - } - } - } - id += blockDimY * gridDimX; - } -} - -void hl_sequence2batch_copy(real* batch, - real* sequence, - const int* batchIndex, - int seqWidth, - int batchCount, - bool seq2batch) { - CHECK_NOTNULL(sequence); - CHECK_NOTNULL(batch); - CHECK_NOTNULL(batchIndex); - - dim3 threads(128, 8); - dim3 grid(8, 1); - if (seq2batch) { - 
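/* KeSequence2Batch gathers or scatters whole rows of width seqWidth between
   the concatenated-sequence layout and the batch layout; batchIndex[i] is the
   sequence row backing batch row i, and the trailing template flags select
   direction (seq2batch) and assign-vs-accumulate (isAdd). Single-threaded
   reference of the copy path used here (illustrative only; the name is a
   placeholder):

     void sequence2batch_copy_reference(float* batch, const float* seq,
                                        const int* batchIndex, int width,
                                        int count) {
       for (int i = 0; i < count; ++i)
         for (int j = 0; j < width; ++j)
           batch[i * width + j] = seq[batchIndex[i] * width + j];
     }
*/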
KeSequence2Batch<128, 8, 8, 1, 0><<>>( - batch, sequence, batchIndex, seqWidth, batchCount); - } else { - KeSequence2Batch<128, 8, 8, 0, 0><<>>( - batch, sequence, batchIndex, seqWidth, batchCount); - } - CHECK_SYNC("hl_sequence2batch_copy failed"); -} - -void hl_sequence2batch_add(real* batch, - real* sequence, - int* batchIndex, - int seqWidth, - int batchCount, - bool seq2batch) { - CHECK_NOTNULL(sequence); - CHECK_NOTNULL(batch); - CHECK_NOTNULL(batchIndex); - - dim3 threads(128, 8); - dim3 grid(8, 1); - if (seq2batch) { - KeSequence2Batch<128, 8, 8, 1, 1><<>>( - batch, sequence, batchIndex, seqWidth, batchCount); - } else { - KeSequence2Batch<128, 8, 8, 0, 1><<>>( - batch, sequence, batchIndex, seqWidth, batchCount); - } - CHECK_SYNC("hl_sequence2batch_add failed"); -} - -template -__global__ void KeSequence2BatchPadding(real* batch, - real* sequence, - const int* sequenceStartPositions, - const size_t sequenceWidth, - const size_t maxSequenceLength, - const size_t numSequences) { - int batchIdx = blockIdx.y; - int sequenceStart = sequenceStartPositions[batchIdx]; - int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; - - int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y; - int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; - int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; - - real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; - - if (sequenceIdx < sequenceLength) { - if (seq2batch) { - /* sequence -> batch */ - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; - } - } else { - /* batch -> sequence */ - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; - } - } - } else if (sequenceIdx < maxSequenceLength) { - if (seq2batch) { - /* sequence -> batch */ - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = 0; - } - } - } -} - -void hl_sequence2batch_copy_padding(real* batch, - real* sequence, - const int* sequenceStartPositions, - const size_t sequenceWidth, - const size_t maxSequenceLength, - const size_t numSequences, - bool normByTimes, - bool seq2batch) { - CHECK_NOTNULL(batch); - CHECK_NOTNULL(sequence); - CHECK_NOTNULL(sequenceStartPositions); - - if (!normByTimes && numSequences == 1) { - size_t elementCount = maxSequenceLength * sequenceWidth; - if (seq2batch) { - /* sequence -> batch */ - hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount); - } else { - /* batch -> sequence */ - hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount); - } - return; - } - - const int CUDA_BLOCK_SIZE = 512; - - /* At least use 32 threads to copy sequenceWidth elements, - and at least 8 elements for each thread. */ - int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5; - blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? 
blockDimX : CUDA_BLOCK_SIZE; - - int blockDimY = CUDA_BLOCK_SIZE / blockDimX; - dim3 threads(blockDimX, blockDimY); - - int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY; - int gridDimY = numSequences; - dim3 grid(gridDimX, gridDimY); - - if (seq2batch) { - /* sequence -> batch */ - if (normByTimes) { - KeSequence2BatchPadding<1, 1><<>>( - batch, - sequence, - sequenceStartPositions, - sequenceWidth, - maxSequenceLength, - numSequences); - } else { - KeSequence2BatchPadding<0, 1><<>>( - batch, - sequence, - sequenceStartPositions, - sequenceWidth, - maxSequenceLength, - numSequences); - } - } else { - /* batch -> sequence */ - if (normByTimes) { - KeSequence2BatchPadding<1, 0><<>>( - batch, - sequence, - sequenceStartPositions, - sequenceWidth, - maxSequenceLength, - numSequences); - } else { - KeSequence2BatchPadding<0, 0><<>>( - batch, - sequence, - sequenceStartPositions, - sequenceWidth, - maxSequenceLength, - numSequences); - } - } - - CHECK_SYNC("hl_sequence2batch_copy_padding failed"); -} - -__device__ inline float my_rsqrt(float x) { return rsqrtf(x); } - -__device__ inline double my_rsqrt(double x) { return rsqrt(x); } - -__global__ void KeSequenceAvgForward(real* dst, - real* src, - const int* starts, - int height, - int width, - const int mode) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int row = gid / width; - int col = gid % width; - - if (gid < height * width) { - int start = starts[row]; - int end = starts[row + 1]; - int seqLength = end - start; - if (seqLength == 0) return; - real sum = 0.0; - for (int i = start; i < end; i++) { - sum += src[i * width + col]; - } - sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength - : sum * my_rsqrt((real)seqLength)); - dst[gid] += sum; - } -} - -void hl_sequence_avg_forward(real* dst, - real* src, - const int* starts, - int height, - int width, - const int mode) { - CHECK_NOTNULL(dst); - CHECK_NOTNULL(src); - CHECK_NOTNULL(starts); - - int block = 512; - int grid = DIVUP(width * height, 512); - - CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_forward!"; - - KeSequenceAvgForward<<>>( - dst, src, starts, height, width, mode); - CHECK_SYNC("hl_sequence_avg_forward failed"); -} - -__global__ void KeSequenceAvgBackward(real* dst, - real* src, - const int* starts, - int height, - int width, - const int mode) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int row = gid / width; - int col = gid % width; - - if (gid < height * width) { - int start = starts[row]; - int end = starts[row + 1]; - int seqLength = end - start; - if (seqLength == 0) return; - real grad = src[gid]; - grad = mode == 1 ? grad : (mode == 0 ? 
grad / seqLength - : grad * my_rsqrt((real)seqLength)); - for (int i = start; i < end; i++) { - dst[i * width + col] += grad; - } - } -} - -void hl_sequence_avg_backward(real* dst, - real* src, - const int* starts, - int height, - int width, - const int mode) { - CHECK_NOTNULL(dst); - CHECK_NOTNULL(src); - CHECK_NOTNULL(starts); - - int block = 512; - int grid = DIVUP(width * height, 512); - - CHECK(mode == 0 || mode == 1 || mode == 2) - << "mode error in hl_sequence_avg_backward!"; - - KeSequenceAvgBackward<<>>( - dst, src, starts, height, width, mode); - CHECK_SYNC("hl_sequence_avg_backward failed"); -} diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu deleted file mode 100644 index 432041fed5ab1ffc02dabcd4644fa70a6473fba1..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_cuda_sparse.cu +++ /dev/null @@ -1,1262 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "hl_cuda.h" -#include "hl_cuda_sparse.cuh" -#include "hl_matrix_apply.cuh" -#include "hl_matrix_ops.cuh" -#include "hl_sparse.h" -#include "hl_sparse.ph" -#include "paddle/utils/Logging.h" - -DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); -DEFINE_MATRIX_UNARY_OP(Zero, a = 0); - -void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, - real *C_d, - int dimM, - int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN); - CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; - - if (A_d->nnz == 0) { - hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); - return; - } - - /* nnz != 0 */ - hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row && - A_d2->csr_col) - << "parameter transa error!"; - - int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; - int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; - dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X); - dim3 grid(blocksX, blocksY); - - if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsr2Dense<0><<>>( - A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); - } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsr2Dense<1><<>>( - A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); - } else { - } - CHECK_SYNC("hl_matrix_csr2dense failed"); -} - -void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, - real *C_d, - int dimM, - int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(C_d); - CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN); - CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; - - if (A_d->nnz == 0) { - hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); - return; - } - - /* nnz != 0 */ - hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); - CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row && - A_d2->csc_col) - << "parameter transa error!"; - - int blocksX = (dimN + 
CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; - int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; - dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X); - dim3 grid(blocksX, blocksY); - - if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsc2Dense<0><<>>( - A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); - } else if (A_d->type == HL_FLOAT_VALUE) { - KeSMatrixCsc2Dense<1><<>>( - A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); - } else { - } - CHECK_SYNC("hl_matrix_csc2dense failed"); -} - -void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, - hl_matrix_format_t format, - hl_matrix_value_t value_type, - int dimM, - int dimN, - int nnz) { - CHECK_NOTNULL(A_d); - CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; - CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE) - << "sparse matrix value type error!"; - /* avoid malloc 0 bytes */ - int nnz_s = (nnz == 0 ? 1 : nnz); - - if (format == HL_SPARSE_CSR) { - CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - - char *tmp = - (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); - CHECK_NOTNULL(tmp); - - hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); - csr->sparsity = -1.0; - - if (value_type == HL_NO_VALUE) { - csr->csr_val = NULL; - csr->nnz_s = nnz_s; - csr->row_s = dimM + 1; - csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); - csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); - - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csr; - } else if (value_type == HL_FLOAT_VALUE) { - csr->nnz_s = nnz_s; - csr->row_s = dimM + 1; - csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); - csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); - csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); - - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csr; - } - } else if (format == HL_SPARSE_CSC) { - CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - - char *tmp = - (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); - CHECK_NOTNULL(tmp); - - hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); - csc->sparsity = -1.0f; - - if (value_type == HL_NO_VALUE) { - csc->csc_val = NULL; - csc->nnz_s = nnz_s; - csc->col_s = dimN + 1; - csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); - csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); - - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csc; - } else if (value_type == HL_FLOAT_VALUE) { - csc->nnz_s = nnz_s; - csc->col_s = dimN + 1; - csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); - csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); - csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); - - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csc; - } - } - - (*A_d)->format = format; - (*A_d)->type = value_type; - (*A_d)->rows = dimM; - (*A_d)->cols = dimN; - (*A_d)->nnz = nnz; -} - -void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { - CHECK_NOTNULL(A_d); - CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) - << "sparse matrix format error!"; - - if (A_d->matrix == NULL) { - free(A_d); - return; - } - - if (A_d->format == HL_SPARSE_CSR) { - hl_csr_matrix csr = (hl_csr_matrix)A_d->matrix; - if (csr->csr_val != NULL) { - hl_free_mem_device(csr->csr_val); - csr->csr_val = NULL; - } - - if 
(csr->csr_row != NULL) { - hl_free_mem_device(csr->csr_row); - csr->csr_row = NULL; - } - - if (csr->csr_col != NULL) { - hl_free_mem_device(csr->csr_col); - csr->csr_col = NULL; - } - - A_d->matrix = NULL; - free(A_d); - } else if (A_d->format == HL_SPARSE_CSC) { - hl_csc_matrix csc = (hl_csc_matrix)A_d->matrix; - if (csc->csc_val != NULL) { - hl_free_mem_device(csc->csc_val); - csc->csc_val = NULL; - } - - if (csc->csc_row != NULL) { - hl_free_mem_device(csc->csc_row); - csc->csc_row = NULL; - } - - if (csc->csc_col != NULL) { - hl_free_mem_device(csc->csc_col); - csc->csc_col = NULL; - } - - A_d->matrix = NULL; - free(A_d); - } -} - -void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void *dest_d, - size_t size, - hl_matrix_format_t format, - hl_matrix_value_t value_type, - int dimM, - int dimN, - int nnz) { - CHECK_NOTNULL(A_d); - CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; - - if (format == HL_SPARSE_CSR) { - CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - - size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int); - if (value_type != HL_NO_VALUE) { - size_ += nnz * sizeof(real); - } - CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ - << ")!"; - - char *tmp = - (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); - CHECK_NOTNULL(tmp); - - hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); - - if (value_type == HL_NO_VALUE) { - csr->csr_val = NULL; - csr->csr_row = (int *)dest_d; - csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int)); - } else { - csr->csr_val = (real *)dest_d; - csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real)); - csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) + - (dimM + 1) * sizeof(int)); - } - csr->nnz_s = nnz; - csr->row_s = dimM + 1; - csr->sparsity = -1.0; - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csr; - } else if (format == HL_SPARSE_CSC) { - CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - - size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int); - if (value_type != HL_NO_VALUE) { - size_ += nnz * sizeof(real); - } - CHECK_LE(size_, size) << "dest_d size(" << size - << ") too small, should bigger than(" << size_ - << ")!"; - - char *tmp = - (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); - CHECK_NOTNULL(tmp); - - hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); - if (value_type == HL_NO_VALUE) { - csc->csc_val = NULL; - csc->csc_col = (int *)dest_d; - csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int)); - } else { - csc->csc_val = (real *)dest_d; - csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real)); - csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) + - (dimN + 1) * sizeof(int)); - } - csc->nnz_s = nnz; - csc->col_s = dimN + 1; - csc->sparsity = -1.0f; - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csc; - } - - (*A_d)->format = format; - (*A_d)->type = value_type; - (*A_d)->rows = dimM; - (*A_d)->cols = dimN; - (*A_d)->nnz = nnz; -} - -void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real *value_d, - int *rows_d, - int *cols_d, - hl_matrix_format_t format, - hl_matrix_value_t value_type, - int dimM, - int dimN, - int nnz) { - CHECK_NOTNULL(A_d); - CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; - - CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) - << "sparse matrix format error!"; - - if (format == 
HL_SPARSE_CSR) { - char *tmp = - (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); - CHECK_NOTNULL(tmp); - - hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); - csr->csr_row = rows_d; - csr->csr_col = cols_d; - csr->csr_val = value_d; - csr->nnz_s = nnz; - csr->row_s = dimM + 1; - csr->sparsity = -1.0; - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csr; - } else if (format == HL_SPARSE_CSC) { - char *tmp = - (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); - CHECK_NOTNULL(tmp); - - hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); - csc->csc_row = rows_d; - csc->csc_col = cols_d; - csc->csc_val = value_d; - csc->nnz_s = nnz; - csc->col_s = dimN + 1; - csc->sparsity = -1.0f; - *A_d = (hl_sparse_matrix_s)tmp; - (*A_d)->matrix = (hl_matrix_s)csc; - } - - (*A_d)->format = format; - (*A_d)->type = value_type; - (*A_d)->rows = dimM; - (*A_d)->cols = dimN; - (*A_d)->nnz = nnz; -} - -void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) { - CHECK_NOTNULL(A_d); - free(A_d); -} - -void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, - real *csr_val, - int *csr_row, - int *csr_col, - hl_stream_t stream) { - CHECK_NOTNULL(csr_matrix); - CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format!"; - CHECK_NOTNULL(csr_matrix->matrix); - - hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz - << " is big than alloc size " - << csr->nnz_s; - - CHECK_LE((csr_matrix->rows + 1), csr->row_s) - << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size " - << csr->row_s; - - CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; - - if (csr_matrix->type == HL_NO_VALUE) { - if (csr_row == NULL && csr_col == NULL) { - return; - } else if (csr_row != NULL && csr_col != NULL) { - hl_memcpy_async( - csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); - - hl_memcpy_async( - csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); - } else { - LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; - } - } else if (csr_matrix->type == HL_FLOAT_VALUE) { - if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { - return; - } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { - hl_memcpy_async( - csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); - } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { - hl_memcpy_async( - csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); - hl_memcpy_async( - csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); - hl_memcpy_async( - csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); - } else { - LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; - } - } - - csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) / - ((float)csr_matrix->cols); -} - -void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, - real *csc_val, - int *csc_row, - int *csc_col, - hl_stream_t stream) { - CHECK_NOTNULL(csc_matrix); - CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; - - hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz - << " is big than alloc size " - << csc->nnz_s; - - CHECK_LE((csc_matrix->cols + 1), csc->col_s) - << "copy size " << (csc_matrix->cols 
+ 1) << " is big than alloc size " - << csc->col_s; - - CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE) - << "sparse matrix value type error!"; - - if (csc_matrix->type == HL_NO_VALUE) { - if (csc_row == NULL && csc_col == NULL) { - return; - } else if (csc_row != NULL && csc_col != NULL) { - hl_memcpy_async( - csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); - hl_memcpy_async( - csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); - } else { - LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; - } - } else if (csc_matrix->type == HL_FLOAT_VALUE) { - if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { - return; - } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { - hl_memcpy_async( - csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); - } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { - hl_memcpy_async( - csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); - hl_memcpy_async( - csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); - hl_memcpy_async( - csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); - } else { - LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; - } - } - - csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) / - ((float)csc_matrix->cols); -} - -void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, - hl_sparse_matrix_s src, - hl_stream_t stream) { - CHECK(dst && src && dst->matrix && src->matrix) - << "parameter dst or src is null pointer!"; - CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!"; - CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) - << "src sparse matrix is no value, dst sparse matrix has value!"; - - if (dst->format == HL_SPARSE_CSR) { - dst->rows = src->rows; - dst->cols = src->cols; - dst->nnz = src->nnz; - hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream); - } else if (dst->format == HL_SPARSE_CSC) { - dst->rows = src->rows; - dst->cols = src->cols; - dst->nnz = src->nnz; - hl_csc_matrix csc = (hl_csc_matrix)src->matrix; - hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream); - } else { - LOG(FATAL) << "sparse matrix format error!"; - } -} - -/** - * Calculate beta * C, if beta is zero, C does not have to be a valid input. 
- */ -static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { - if (beta == 0.0) { - hl_gpu_apply_unary_op(unary::Zero(), c, dimM, dimN, dimN); - } else { - if (beta != 1.0) { - hl_gpu_apply_unary_op(unary::mul_scalar(beta), c, dimM, dimN, dimN); - } - } - - return; -} - -void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, - hl_trans_op_t transa, - real *B_d, - hl_trans_op_t transb, - real *C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta) { - CHECK_EQ(transb, HPPL_OP_N); - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - CHECK(dimM > 0 && dimN > 0 && dimK > 0); - CHECK_EQ(A_d->format, HL_SPARSE_CSR) << "matrix format error!"; - - if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || - (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { - LOG(FATAL) << "parameter error!"; - } - - if (A_d->nnz == 0) { - _beta_mul_c(C_d, dimM, dimN, beta); - return; - } - - /* nnz != 0 */ - hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || A_d2->csr_col == NULL) { - LOG(FATAL) << "parameter error!"; - } - - if (HPPL_OP_N == transa) { - int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; - int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y; - dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y); - dim3 grid(blocksX, blocksY); - - /* sparsity pattern */ - // A_d->sparsity; - if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0><<>>( - C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixCsrMulDense<1><<>>( - C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else if (HPPL_OP_T == transa) { - _beta_mul_c(C_d, dimM, dimN, beta); - - int blocksX = - (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = - (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; - dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); - dim3 grid(blocksX, blocksY); - if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0><<>>( - C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixCscMulDense<1><<>>( - C_d, - A_d2->csr_val, - A_d2->csr_col, - A_d2->csr_row, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else { - LOG(FATAL) << "parameter transa error!"; - } - - CHECK_SYNC("hl_matrix_csr_mul_dense failed"); -} - -void hl_matrix_dense_mul_csc(real *A_d, - hl_trans_op_t transa, - hl_sparse_matrix_s B_d, - hl_trans_op_t transb, - real *C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta) { - CHECK_EQ(transa, HPPL_OP_N); - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - - if (dimM <= 0 || dimN <= 0 || dimK <= 0 || - ((transb == HPPL_OP_N) && (B_d->rows != dimK || B_d->cols != dimN)) || - ((transb == HPPL_OP_T) && (B_d->rows != dimN || B_d->cols != dimK))) { - LOG(FATAL) << "parameter dims error!"; - } - - CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!"; - - if (B_d->nnz == 0) { - _beta_mul_c(C_d, dimM, dimN, beta); - return; - } - - /* nnz != 0 */ - hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix); - if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csc_row == NULL || B_d2->csc_col == NULL) { - LOG(FATAL) << "parameter B is null!"; - } - - if (transb == HPPL_OP_N) { - int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST 
- 1) / CU_CSCMM_BLOCK_M_BEST; - int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST; - dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); - dim3 grid(blocksX, blocksY); - - if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0><<>>( - C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixDenseMulCsc<1><<>>( - C_d, - A_d, - B_d2->csc_val, - B_d2->csc_row, - B_d2->csc_col, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else if (transb == HPPL_OP_T) { - _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; - dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); - dim3 grid(blocksX, blocksY); - if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0><<>>( - C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixDenseMulCsr<1><<>>( - C_d, - A_d, - B_d2->csc_val, - B_d2->csc_col, - B_d2->csc_row, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else { - LOG(FATAL) << "parameter transb error!"; - } - - CHECK_SYNC("hl_matrix_dense_mul_csc failed"); -} - -void hl_matrix_dense_mul_csr(real *A_d, - hl_trans_op_t transa, - hl_sparse_matrix_s B_d, - hl_trans_op_t transb, - real *C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta) { - CHECK_EQ(transa, HPPL_OP_N); - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - - if (dimM <= 0 || dimN <= 0 || dimK <= 0 || - (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) || - (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { - LOG(FATAL) << "parameter dims error!"; - } - - CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!"; - - if (B_d->nnz == 0) { - _beta_mul_c(C_d, dimM, dimN, beta); - return; - } - - /* nnz != 0 */ - hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); - if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csr_row == NULL || B_d2->csr_col == NULL) { - LOG(FATAL) << "parameter transa error!"; - } - - if (transb == HPPL_OP_N) { - _beta_mul_c(C_d, dimM, dimN, beta); - int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; - int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; - dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); - dim3 grid(blocksX, blocksY); - if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsr<0><<>>( - C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixDenseMulCsr<1><<>>( - C_d, - A_d, - B_d2->csr_val, - B_d2->csr_row, - B_d2->csr_col, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else if (transb == HPPL_OP_T) { - int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; - int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST; - dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); - dim3 grid(blocksX, blocksY); - if (B_d->type == HL_NO_VALUE) { - KeSMatrixDenseMulCsc<0><<>>( - C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixDenseMulCsc<1><<>>( - C_d, - A_d, - B_d2->csr_val, - B_d2->csr_col, - B_d2->csr_row, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else { - LOG(FATAL) << "parameter transb error!"; - } - - CHECK_SYNC("hl_matrix_dense_mul_csr failed"); -} - -void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, - hl_trans_op_t transa, - real *B_d, - hl_trans_op_t transb, 
- real *C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta) { - CHECK_EQ(transb, HPPL_OP_N); - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!"; - CHECK_EQ(A_d->format, HL_SPARSE_CSC) << "matrix format error!"; - - if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || - (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { - LOG(FATAL) << "parameter error!"; - } - - if (A_d->nnz == 0) { - _beta_mul_c(C_d, dimM, dimN, beta); - return; - } - - /* nnz != 0 */ - hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); - if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csc_row == NULL || A_d2->csc_col == NULL) { - LOG(FATAL) << "parameter error!"; - } - - if (HPPL_OP_N == transa) { - _beta_mul_c(C_d, dimM, dimN, beta); - - int blocksX = - (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; - int blocksY = - (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; - dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); - dim3 grid(blocksX, blocksY); - if (A_d->type == HL_NO_VALUE) { - KeSMatrixCscMulDense<0><<>>( - C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixCscMulDense<1><<>>( - C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else if (HPPL_OP_T == transa) { - int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; - int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y; - dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y); - dim3 grid(blocksX, blocksY); - - /* sparsity pattern */ - // A_d->sparsity; - if (A_d->type == HL_NO_VALUE) { - KeSMatrixCsrMulDense<0><<>>( - C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } else { - KeSMatrixCsrMulDense<1><<>>( - C_d, - A_d2->csc_val, - A_d2->csc_row, - A_d2->csc_col, - B_d, - dimM, - dimN, - dimK, - alpha, - beta); - } - } else { - LOG(FATAL) << "parameter transa error!"; - } - - CHECK_SYNC("hl_matrix_csc_mul_dense failed"); -} - -void hl_sparse_matrix_mul(real *A_d, - hl_trans_op_t transa, - real *B_d, - hl_trans_op_t transb, - hl_sparse_matrix_s C_d, - int dimM, - int dimN, - int dimK, - real alpha, - real beta) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!"; - CHECK_NE(C_d->type, HL_NO_VALUE) << "C value type error!"; - - if (C_d->nnz == 0) return; - - if (C_d->format == HL_SPARSE_CSC) { - hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); - if (C_d2->csc_val == NULL || C_d2->csc_row == NULL || - C_d2->csc_col == NULL) { - LOG(FATAL) << "parameter error!"; - } - - if (beta != 1.0) { - hl_gpu_apply_unary_op( - unary::mul_scalar(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz); - } - - int blocksX = dimN; - int blocksY = 1; - dim3 threads(CU_CSCMM_DMD2CSC_THREAD_X, 1); - dim3 grid(blocksX, blocksY); - bool transA = transa == HPPL_OP_T ? 1 : 0; - bool transB = transb == HPPL_OP_T ? 
1 : 0; - KeSMatrixDenseMulDense2CSC<<>>( - C_d2->csc_val, - C_d2->csc_row, - C_d2->csc_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } else { - hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); - if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || - C_d2->csr_row == NULL || C_d2->csr_col == NULL) { - LOG(FATAL) << "parameter error!"; - } - - if (beta != 1.0) { - hl_gpu_apply_unary_op( - unary::mul_scalar(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz); - } - - bool transA = transa == HPPL_OP_T ? 1 : 0; - bool transB = transb == HPPL_OP_T ? 1 : 0; - if (!transB) { - int blocksX = dimM; - int blocksY = 1; - dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); - dim3 grid(blocksX, blocksY); - - KeSMatrixDenseMulDense2CSR<<>>( - C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } else { - CHECK(!transA) << "Not supported A is trans and B is not trans!"; - - dim3 block(CU_BLOCK_SIZE, 1); - int avgNnzPerRow = C_d->nnz / dimM; - avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1; - int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); - dim3 grid(gridx, dimM); - KeSMatrixDenseMulDenseTrans2CSR<<>>( - C_d2->csr_val, - C_d2->csr_row, - C_d2->csr_col, - A_d, - B_d, - transA, - transB, - dimM, - dimN, - dimK, - alpha, - beta); - CHECK_SYNC("hl_sparse_matrix_mul failed"); - } - } -} - -void hl_memcpy_from_csc_matrix(real *csc_val, - size_t val_size, - int *csc_row, - size_t row_size, - int *csc_col, - size_t col_size, - hl_sparse_matrix_s csc_matrix, - hl_stream_t stream) { - CHECK_NOTNULL(csc_matrix); - CHECK_NOTNULL(csc_row); - CHECK_NOTNULL(csc_col); - - CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) - << "csc_matrix is not csc format error!"; - - if (csc_matrix->nnz > row_size || - csc_matrix->cols + 1 > static_cast(col_size)) { - LOG(FATAL) << "size not match!"; - } - - hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); - hl_memcpy_async((void *)csc_row, - (void *)csc->csc_row, - (csc_matrix->nnz) * sizeof(int), - stream); - hl_memcpy_async((void *)csc_col, - (void *)csc->csc_col, - (csc_matrix->cols + 1) * sizeof(int), - stream); - if (csc_matrix->type == HL_FLOAT_VALUE) { - if (csc_val != NULL) { - CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void *)csc_val, - (void *)csc->csc_val, - (csc_matrix->nnz) * sizeof(real), - stream); - } else { - LOG(FATAL) << "parameter csr_val is null pointer!"; - } - } -} - -void hl_memcpy_from_csr_matrix(real *csr_val, - size_t val_size, - int *csr_row, - size_t row_size, - int *csr_col, - size_t col_size, - hl_sparse_matrix_s csr_matrix, - hl_stream_t stream) { - CHECK_NOTNULL(csr_matrix); - CHECK_NOTNULL(csr_row); - CHECK_NOTNULL(csr_col); - CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) - << "csr_matrix is not csr format error!"; - - if (csr_matrix->nnz > col_size || - csr_matrix->rows + 1 > static_cast(row_size)) { - LOG(FATAL) << "size not match!"; - } - - hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); - hl_memcpy_async((void *)csr_row, - (void *)csr->csr_row, - (csr_matrix->rows + 1) * sizeof(int), - stream); - hl_memcpy_async((void *)csr_col, - (void *)csr->csr_col, - (csr_matrix->nnz) * sizeof(int), - stream); - if (csr_matrix->type == HL_FLOAT_VALUE) { - if (csr_val != NULL) { - CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; - hl_memcpy_async((void *)csr_val, - (void *)csr->csr_val, - (csr_matrix->nnz) * 
sizeof(real), - stream); - } else { - LOG(FATAL) << "parameter csr_val is null pointer!"; - } - } -} - -void hl_sparse_matrix_column_sum( - real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { - if (B_d->format == HL_SPARSE_CSR) { - hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); - } else { - LOG(FATAL) << "Not support CSC format error!"; - } -} - -void hl_matrix_csr_column_sum( - real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - - if (dimM <= 0 || dimN <= 0 || (B_d->rows != dimM || B_d->cols != dimN)) { - LOG(FATAL) << "parameter dims error!"; - } - - hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); - if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || - B_d2->csr_row == NULL || B_d2->csr_col == NULL) { - LOG(FATAL) << "parameter B is null!"; - } - - if (B_d->nnz == 0) return; - - int nnz = B_d->nnz; - int block = 512; - int grid = DIVUP(nnz, 512); - KeSMatrixCsrColumnSum<<>>( - A_d, B_d2->csr_val, B_d2->csr_col, nnz); - - CHECK_SYNC("hl_matrix_csr_column_sum failed"); -} - -void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { - if (A_d->format == HL_SPARSE_CSR) { - hl_matrix_csr_add_bias(A_d, B_d, scale); - } else { - LOG(FATAL) << "Not support CSC format error!"; - } -} - -void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - - hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || A_d2->csr_col == NULL) { - LOG(FATAL) << "parameter A_d is null!"; - } - - if (A_d->nnz == 0) return; - - int nnz = A_d->nnz; - int block = 512; - int grid = DIVUP(nnz, 512); - KeSMatrixCsrAddBias<<>>( - A_d2->csr_val, A_d2->csr_col, B_d, scale, nnz); - - CHECK_SYNC("hl_sparse_matrix_add_bias failed"); -} - -void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, - real *B_d, - int dimM, - int dimN, - real alpha, - real beta) { - if (A_d->format == HL_SPARSE_CSR) { - hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); - } else { - LOG(FATAL) << "Not support CSC format error!"; - } -} - -void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, - real *B_d, - int dimM, - int dimN, - real alpha, - real beta) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - - if (dimM <= 0 || dimN <= 0 || A_d->rows != dimM || A_d->cols != dimN) { - LOG(FATAL) << "parameter dim error!"; - } - - hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); - if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || - A_d2->csr_row == NULL || A_d2->csr_col == NULL) { - LOG(FATAL) << "parameter A_d is null!"; - } - - if (A_d->nnz == 0) return; - - int gridX = DIVUP((A_d->nnz / dimM), 512); - gridX = gridX > 0 ? 
gridX : 1; - dim3 block(512, 1); - dim3 grid(gridX, dimM); - KeSMatrixCsrAddDense<<>>(A_d2->csr_val, - A_d2->csr_row, - A_d2->csr_col, - B_d, - alpha, - beta, - dimM, - dimN); - - CHECK_SYNC("hl_sparse_matrix_add_dense failed"); -} - -int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { - __sparse_get_return__(sMat, row); -} - -int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { - __sparse_get_return__(sMat, col); -} - -real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { - __sparse_get_return__(sMat, val); -} diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu deleted file mode 100644 index efa4bef02ba5f5fe9ae449b44bbdc844e5745307..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_table_apply.cu +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "hl_base.h" -#include "hl_cuda.h" -#include "hl_device_functions.cuh" -#include "paddle/utils/Logging.h" - -template -__global__ void KeMatrixAddRows(real* output, - int ldo, - real* table, - int ldt, - int* ids, - int numSamples, - int tableSize, - int dim) { - int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * gridDimX; - - while (idy < numSamples) { - int tableId = ids[idy]; - if ((0 <= tableId) && (tableId < tableSize)) { - real* out = output + idy * ldo; - real* tab = table + tableId * ldt; - for (int i = idx; i < dim; i += blockDimX) { - if (AddRow) { - paddle::paddleAtomicAdd(&tab[i], out[i]); - } else { - out[i] += tab[i]; - } - } - } - idy += blockDimY * gridDimX; - } -} - -void hl_matrix_select_rows(real* output, - int ldo, - real* table, - int ldt, - int* ids, - int numSamples, - int tableSize, - int dim) { - CHECK_NOTNULL(output); - CHECK_NOTNULL(table); - CHECK_NOTNULL(ids); - - dim3 threads(128, 8); - dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 0><<>>( - output, ldo, table, ldt, ids, numSamples, tableSize, dim); - - CHECK_SYNC("hl_matrix_select_rows failed"); -} - -void hl_matrix_add_to_rows(real* table, - int ldt, - real* input, - int ldi, - int* ids, - int numSamples, - int tableSize, - int dim) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(table); - CHECK_NOTNULL(ids); - - dim3 threads(128, 8); - dim3 grid(8, 1); - KeMatrixAddRows<128, 8, 8, 1><<>>( - input, ldi, table, ldt, ids, numSamples, tableSize, dim); - - CHECK_SYNC("hl_matrix_add_to_rows failed"); -} - -template -__global__ void KeVectorSelect( - T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { - int idx = threadIdx.x + blockDimX * blockIdx.x; - while (idx < sizei) { - int index = ids[idx]; - // check(index < sizes); - dst[idx] = src[index]; - idx += blockDimX * gridDimX; - } -} - -template -void hl_vector_select_from( - T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { - CHECK_NOTNULL(dst); - CHECK_NOTNULL(src); - CHECK_NOTNULL(ids); - CHECK_EQ(sized, sizei); - - dim3 threads(512, 1); - dim3 grid(8, 1); - KeVectorSelect<<>>( - dst, sized, src, sizes, ids, sizei); - - 
CHECK_SYNC("hl_vector_select_from failed"); -} - -template void hl_vector_select_from(real* dst, - int sized, - const real* src, - int sizes, - const int* ids, - int sizei); -template void hl_vector_select_from( - int* dst, int sized, const int* src, int sizes, const int* ids, int sizei); diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu deleted file mode 100644 index b17290557c4f635a963d88525409b9373b057a4b..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_top_k.cu +++ /dev/null @@ -1,481 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/cuda/include/hl_base.h" -#include "paddle/cuda/include/hl_sparse.ph" -#include "paddle/cuda/include/hl_top_k.h" -#include "paddle/utils/Logging.h" - -// using namespace hppl; - -struct Pair { - __device__ __forceinline__ Pair() {} - - __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} - - __device__ __forceinline__ void set(real value, int id) { - v_ = value; - id_ = id; - } - - __device__ __forceinline__ void operator=(const Pair& in) { - v_ = in.v_; - id_ = in.id_; - } - - __device__ __forceinline__ bool operator<(const real value) const { - return (v_ < value); - } - - __device__ __forceinline__ bool operator<(const Pair& in) const { - return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); - } - - __device__ __forceinline__ bool operator>(const Pair& in) const { - return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); - } - - real v_; - int id_; -}; - -__device__ __forceinline__ void addTo(Pair topK[], - const Pair& p, - int beamSize) { - for (int k = beamSize - 2; k >= 0; k--) { - if (topK[k] < p) { - topK[k + 1] = topK[k]; - } else { - topK[k + 1] = p; - return; - } - } - topK[0] = p; -} - -template -__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { - for (int k = beamSize - 2; k >= 0; k--) { - if (topK[k] < p) { - topK[k + 1] = topK[k]; - } else { - topK[k + 1] = p; - return; - } - } - topK[0] = p; -} - -template -__device__ __forceinline__ void getTopK( - Pair topK[], real* src, int idx, int dim, int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < src[idx]) { - Pair tmp(src[idx], idx); - addTo(topK, tmp, beamSize); - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void getTopK( - Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < src[idx]) { - Pair tmp(src[idx], idx); - if (tmp < max) { - addTo(topK, tmp, beamSize); - } - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void getTopK( - Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { - while (idx < dim) { - if (topK[beamSize - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - addTo(topK, tmp, beamSize); - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void getTopK(Pair topK[], - real* val, - int* col, - int idx, - int dim, - const Pair& max, - int beamSize) { - while (idx < 
dim) { - if (topK[beamSize - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - if (tmp < max) { - addTo(topK, tmp, beamSize); - } - } - idx += blockSize; - } -} - -template -__device__ __forceinline__ void threadGetTopK(Pair topK[], - int& beam, - int beamSize, - real* src, - bool& firstStep, - bool& isEmpty, - Pair& max, - int dim, - const int tid) { - if (beam > 0) { - int length = beam < beamSize ? beam : beamSize; - if (firstStep) { - firstStep = false; - getTopK(topK, src, tid, dim, length); - } else { - for (int k = 0; k < maxLength; k++) { - if (k < maxLength - beam) { - topK[k] = topK[k + beam]; - } else { - topK[k].set(-HL_FLOAT_MAX, -1); - } - } - if (!isEmpty) { - getTopK(topK + maxLength - beam, src, tid, dim, max, length); - } - } - - max = topK[maxLength - 1]; - if (max.id_ == -1) isEmpty = true; - beam = 0; - } -} - -template -__device__ __forceinline__ void threadGetTopK(Pair topK[], - int& beam, - int beamSize, - real* val, - int* col, - bool& firstStep, - bool& isEmpty, - Pair& max, - int dim, - const int tid) { - if (beam > 0) { - int length = beam < beamSize ? beam : beamSize; - if (firstStep) { - firstStep = false; - getTopK(topK, val, col, tid, dim, length); - } else { - for (int k = 0; k < maxLength; k++) { - if (k < maxLength - beam) { - topK[k] = topK[k + beam]; - } else { - topK[k].set(-HL_FLOAT_MAX, -1); - } - } - if (!isEmpty) { - getTopK( - topK + maxLength - beam, val, col, tid, dim, max, length); - } - } - - max = topK[maxLength - 1]; - if (max.id_ == -1) isEmpty = true; - beam = 0; - } -} - -template -__device__ __forceinline__ void blockReduce(Pair* shTopK, - int* maxId, - Pair topK[], - real** topVal, - int** topIds, - int& beam, - int& beamSize, - const int tid, - const int warp) { - while (true) { - __syncthreads(); - if (tid < blockSize / 2) { - if (shTopK[tid] < shTopK[tid + blockSize / 2]) { - maxId[tid] = tid + blockSize / 2; - } else { - maxId[tid] = tid; - } - } - __syncthreads(); - for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { - if (tid < stride) { - if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { - maxId[tid] = maxId[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - **topVal = shTopK[maxId[0]].v_; - **topIds = shTopK[maxId[0]].id_; - (*topVal)++; - (*topIds)++; - } - if (tid == maxId[0]) beam++; - if (--beamSize == 0) break; - __syncthreads(); - - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - if (tid == maxId[0]) { - if (beam < maxLength) { - shTopK[tid] = topK[beam]; - } - } - if (maxId[0] / 32 == warp) { - if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break; - } - } -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top maxLength value; - * 2. merge to shTopK, block reduce and get max value; - * 3. go to the second setp, until one thread's topK value is null; - * 4. go to the first setp, until get the topK value. 
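The helpers deleted above (addTo, getTopK, threadGetTopK, blockReduce) implement the procedure the comment describes: every thread keeps a small descending buffer of its best (value, index) pairs, and the block then repeatedly reduces those buffers to emit the next largest element. Below is a minimal host-side sketch of the per-thread buffer step, assuming real is float; the device code additionally strides the scan by blockSize and merges the partial buffers with a shared-memory reduction.

#include <cstdio>
#include <vector>

struct Pair {
  float v;
  int id;
};

// Insert p into topK (length beam, sorted descending), dropping the current
// smallest entry -- the same shifting scheme as the deleted addTo().
void AddTo(Pair* topK, const Pair& p, int beam) {
  for (int k = beam - 2; k >= 0; --k) {
    if (topK[k].v < p.v) {
      topK[k + 1] = topK[k];
    } else {
      topK[k + 1] = p;
      return;
    }
  }
  topK[0] = p;
}

int main() {
  const int kBeam = 3;
  std::vector<float> row = {0.1f, 0.9f, 0.4f, 0.7f, 0.2f};
  Pair topK[kBeam];
  for (int k = 0; k < kBeam; ++k) topK[k] = {-1e30f, -1};
  // One "thread" scanning the whole row; on the GPU each thread only visits
  // every blockSize-th element and blockReduce() merges the partial buffers.
  for (int i = 0; i < static_cast<int>(row.size()); ++i) {
    if (topK[kBeam - 1].v < row[i]) AddTo(topK, {row[i], i}, kBeam);
  }
  for (int k = 0; k < kBeam; ++k) printf("%g (id %d)\n", topK[k].v, topK[k].id);
  return 0;
}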
- */ -template -__global__ void KeMatrixTopK(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int beamSize) { - __shared__ Pair shTopK[blockSize]; - __shared__ int maxId[blockSize / 2]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - src += blockIdx.x * lds; - topVal += blockIdx.x * ldv; - topIds += blockIdx.x * beamSize; - - Pair topK[maxLength]; // NOLINT - int beam = maxLength; - Pair max; - bool isEmpty = false; - bool firstStep = true; - - for (int k = 0; k < maxLength; k++) { - topK[k].set(-HL_FLOAT_MAX, -1); - } - while (beamSize) { - threadGetTopK( - topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); - - shTopK[tid] = topK[0]; - blockReduce( - shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); - } -} - -template -__global__ void KeSMatrixTopK(real* topVal, - int ldv, - int* topIds, - real* val, - int* row, - int* col, - int beamSize) { - __shared__ Pair shTopK[blockSize]; - __shared__ int maxId[blockSize / 2]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - topVal += blockIdx.x * ldv; - topIds += blockIdx.x * beamSize; - - Pair topK[maxLength]; // NOLINT - int beam = maxLength; - Pair max; - bool isEmpty = false; - bool firstStep = true; - - int start = row[blockIdx.x]; - int end = row[blockIdx.x + 1]; - int dim = end - start; - val += start; - col += start; - - if (beamSize > dim) { - // if the number of values to sort are less than the output size, - // use -1 to indicate the end of valid sorted values. - if (tid == 0) { - topIds[dim] = -1; - } - - beamSize = dim; - } - - for (int k = 0; k < maxLength; k++) { - topK[k].set(-HL_FLOAT_MAX, -1); - } - while (beamSize) { - threadGetTopK( - topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); - - shTopK[tid] = topK[0]; - blockReduce( - shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); - } -} - -void hl_matrix_top_k(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int beamSize, - int numSamples) { - CHECK_NOTNULL(topVal); - CHECK_NOTNULL(topIds); - CHECK_NOTNULL(src); - - if (beamSize > dim) beamSize = dim; - - dim3 threads(256, 1); - dim3 grid(numSamples, 1); - KeMatrixTopK<5, 256><<>>( - topVal, ldv, topIds, src, lds, dim, beamSize); - - CHECK_SYNC("hl_matrix_top_k failed"); -} - -void hl_sparse_matrix_top_k(real* topVal, - int ldv, - int* topIds, - hl_sparse_matrix_s src, - int beamSize, - int numSamples) { - CHECK_NOTNULL(topVal); - CHECK_NOTNULL(topIds); - CHECK_NOTNULL(src); - CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; - - hl_csr_matrix csr = (hl_csr_matrix)src->matrix; - if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { - LOG(FATAL) << "parameter src is null!"; - } - - dim3 threads(256, 1); - dim3 grid(numSamples, 1); - KeSMatrixTopK<5, 256><<>>( - topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); - - CHECK_SYNC("hl_sparse_matrix_top_k failed"); -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top maxLength value; - * 2. merge to shTopK, block reduce and get max value; - * 3. go to the second setp, until one thread's topK value is null; - * 4. go to the first setp, until get the topK value. 
- */ -template -__global__ void KeMatrixTopKClassificationError(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int beamSize, - int* label, - real* recResult) { - __shared__ Pair shTopK[blockSize]; - __shared__ int maxId[blockSize / 2]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - src += blockIdx.x * lds; - topVal += blockIdx.x * ldv; - topIds += blockIdx.x * beamSize; - - Pair topK[maxLength]; // NOLINT - int beam = maxLength; - Pair max; - bool isEmpty = false; - bool firstStep = true; - int topkSize = beamSize; - - for (int k = 0; k < maxLength; k++) { - topK[k].set(-HL_FLOAT_MAX, -1); - } - - while (beamSize) { - threadGetTopK( - topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); - - shTopK[tid] = topK[0]; - blockReduce( - shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); - } - - __syncthreads(); - if (tid == 0) { - for (int i = 0; i < topkSize; i++) { - if (*--topIds == label[blockIdx.x]) { - recResult[blockIdx.x] = 0; - break; - } - recResult[blockIdx.x] = 1.0f; - } - } -} - -void hl_matrix_classification_error(real* topVal, - int ldv, - int* topIds, - real* src, - int lds, - int dim, - int topkSize, - int numSamples, - int* label, - real* recResult) { - CHECK_NOTNULL(topVal); - CHECK_NOTNULL(topIds); - CHECK_NOTNULL(src); - - if (topkSize > dim) topkSize = dim; - - dim3 threads(256, 1); - dim3 grid(numSamples, 1); - KeMatrixTopKClassificationError<5, 256><<>>( - topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); - - CHECK_SYNC("hl_matrix_top_k classification error failed"); -} diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc deleted file mode 100644 index 5111bceaff224f2467fe1b6c92daed03414dd12e..0000000000000000000000000000000000000000 --- a/paddle/cuda/src/hl_warpctc_wrap.cc +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "hl_warpctc_wrap.h" -#include -#include "paddle/utils/DynamicLoader.h" -#include "paddle/utils/Logging.h" - -namespace dynload { - -std::once_flag warpctc_dso_flag; -void* warpctc_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warpctc routine - * via operator overloading. When PADDLE_USE_DSO is - * false, you need to add the path of libwarp-ctc.so to - * the linked-libs of paddle or to LD_PRELOAD. - */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> decltype(__name(args...)) { \ - using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ - std::call_once( \ - warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ - void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##_name)(args...); \ - } \ - } __name; // struct DynLoad__##__name - -// include all needed warp-ctc functions -DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) -DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) -DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) -DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) - -#undef DYNAMIC_LOAD_WARPCTC_WRAP - -} /* namespace dynload */ - -#define WARPCTC_GET_VERSION dynload::get_warpctc_version -#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString - -static int g_warpctcVersion = -1; -#ifndef PADDLE_TYPE_DOUBLE -#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss -#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size -#else -hl_warpctc_status_t fatal(...) { - LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion - << "] Error: not support double precision."; - // both of get_warpctc_version() and get_workspace_size() return an ctcStatus - // type value - return CTC_STATUS_EXECUTION_FAILED; -} -#define WARPCTC_COMPUTE_LOSS fatal -#define WARPCTC_GET_WORKSPACE_SIZE fatal -#endif - -/** - * Check build-in warp-ctc function using glog and it also - * support << operator for more details error info. - */ -#define CHECK_WARPCTC(warpctcStat) \ - CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ - << "warp-ctc [version " << g_warpctcVersion \ - << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " " - -void hl_warpctc_init(const size_t blank, - bool useGpu, - hl_warpctc_options_t* options) { - CHECK_NOTNULL(options); - - g_warpctcVersion = WARPCTC_GET_VERSION(); - - if (useGpu) { -#ifdef __NVCC__ - options->loc = CTC_GPU; - options->stream = STREAM_DEFAULT; -#else - LOG(FATAL) << "[warpctc init] GPU is not enabled."; -#endif - } else { - options->loc = CTC_CPU; - options->num_threads = 1; - } - - options->blank_label = blank; -} - -void hl_warpctc_compute_loss(const real* batchInput, - real* batchGrad, - const int* cpuLabels, - const int* cpuLabelLengths, - const int* cpuInputLengths, - const size_t numClasses, - const size_t numSequences, - real* cpuCosts, - void* workspace, - hl_warpctc_options_t* options) { - CHECK_NOTNULL(batchInput); - CHECK_NOTNULL(cpuLabels); - CHECK_NOTNULL(cpuLabelLengths); - CHECK_NOTNULL(cpuInputLengths); - CHECK_NOTNULL(cpuCosts); - CHECK_NOTNULL(workspace); - CHECK_NOTNULL(options); - - CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput, - batchGrad, - cpuLabels, - cpuLabelLengths, - cpuInputLengths, - numClasses, - numSequences, - cpuCosts, - workspace, - *options)); -} - -void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, - const int* cpuInputLengths, - const size_t numClasses, - const size_t numSequences, - hl_warpctc_options_t* options, - size_t* bytes) { - CHECK_NOTNULL(cpuLabelLengths); - CHECK_NOTNULL(cpuInputLengths); - CHECK_NOTNULL(options); - CHECK_NOTNULL(bytes); - - CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths, - cpuInputLengths, - numClasses, - numSequences, - *options, - bytes)); -} diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6286dda4a54991b7a1042aed9886fdcb694198ba..ec252929d5584c211cea7fa52004ecdfdf586a85 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -21,12 +21,13 @@ endif() cc_test(eigen_test SRCS eigen_test.cc 
DEPS tensor) -nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init) +nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) -nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) +nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) +cc_test(reader_test SRCS reader_test.cc DEPS reader) cc_test(variable_test SRCS variable_test.cc) @@ -38,7 +39,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry init math_function) + DEPS operator op_registry device_context math_function) if(WITH_GPU) nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) @@ -63,7 +64,7 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) -cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -101,14 +102,14 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator) -cc_test(init_test SRCS init_test.cc DEPS init) - cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) # cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) -cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op - channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op - conditional_block_op while_op assign_op print_op executor proto_desc) + +# disable test temporarily. +# TODO https://github.com/PaddlePaddle/Paddle/issues/11971 +# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op +# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op +# conditional_block_op while_op assign_op print_op executor proto_desc) diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a91fe5c99d397ef1bf04f6d22e988b6d3f33e500..f2c55e533a2747325b1b16fdada37945a8ed3c42 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -14,13 +14,13 @@ limitations under the License. 
*/ #include "gtest/gtest.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 5b8dfc57ba020cea259041f55a66472ea26b4eec..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: ", in.type().name()); memory::data_type out_type = in_type; - memory::format in_format = - in_tz.size() == 2 ? memory::format::nc : in.format(); - memory::format out_format = - out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout); + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); void* in_data = GetDataFromTensor(in, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 2ba84ce57fd8aa3d9aa651bdaa2930e459c74e88..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -61,6 +61,7 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { if (iter != dict.end()) return iter->second; return MKLDNNDataType::data_undef; } + #endif void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index b8fcc92697ca1bf1d971f8fef020f31d405605a9..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -18,17 +18,21 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_type_transform.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace framework { -static void PassTensorData(Tensor* from, Tensor* to) { +static void PassTensorData(Tensor *from, Tensor *to) { to->ShareDataWith(*from); *from = Tensor(); } -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* output_tensor) { +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *output_tensor) { bool transformed = false; Tensor in; in.ShareDataWith(input_tensor); @@ -47,9 +51,13 @@ void DataTransform(const OpKernelType& expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. 
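Both layout-transform call sites now obtain the MKLDNN memory format from platform::MKLDNNFormatForSize instead of repeating the 2-D special case inline. The sketch below matches the ternaries being replaced and uses the MKL-DNN 0.x enum names already used in this file; the actual helper added to mkldnn_helper.h may handle more ranks.

#include "mkldnn.hpp"  // assumed reachable, as in mkldnn_helper.h

// Sketch only: 2-D tensors are treated as plain (n, c) data; everything else
// keeps whatever format the caller supplied.
inline mkldnn::memory::format MKLDNNFormatForSize(
    size_t dims_size, mkldnn::memory::format default_format) {
  return dims_size == 2 ? mkldnn::memory::format::nc : default_format;
}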
No real transform occur + + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); + out.ShareDataWith(input_tensor); out.set_layout(DataLayout::kMKLDNN); - out.set_format(ToMKLDNNFormat(lin)); + out.set_format(out_format); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel @@ -85,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type, output_tensor->ShareDataWith(in); } -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var) { +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var) { if (in_var.IsType()) { - auto& in_lod_tensor = in_var.Get(); - auto* tran_lod_tensor = out_var->GetMutable(); + auto &in_lod_tensor = in_var.Get(); + auto *tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { - auto& in_selected_rows = in_var.Get(); - auto* trans_selected_rows = out_var->GetMutable(); + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); trans_selected_rows->set_height(in_selected_rows.height()); trans_selected_rows->set_rows(in_selected_rows.rows()); trans_selected_rows->mutable_value()->ShareDataWith(tensor); diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index dee5d8c7c1126013742460df1d94bb364220ad09..ae3ab051bda2e698801cc6fe6e3ddddf039f5385 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -30,12 +30,15 @@ limitations under the License. */ namespace paddle { namespace framework { -void DataTransform(const OpKernelType& expected_kernel_type, - const OpKernelType& kernel_type_for_var, - const Tensor& input_tensor, Tensor* out); - -void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor, - Variable* out_var); +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *out); + +/** + * Set OutVar from InVar, except the tensor is shared with `tensor` + */ +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d..4fb4ec38ee965a2790d11378a1ce6befa0ef5a00 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -25,11 +25,12 @@ else() cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() +cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc 
DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index d5ca061944f33939cea59a5275e691b1966194fa..1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -73,6 +73,9 @@ void BroadcastOpHandle::RunImpl() { int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; + int type = platform::ToNCCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + for (auto out_var_handle : out_var_handles) { Variable *out_var = var_scopes.at(out_var_handle->scope_idx_) ->FindVar(out_var_handle->name_); @@ -87,13 +90,11 @@ void BroadcastOpHandle::RunImpl() { send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { - send_recv_buffer = - VariableVisitor::GetMutableTensor(out_var).mutable_data( - out_var_handle->place_); + send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) + .Resize(in_tensor.dims()) + .mutable_data(out_var_handle->place_); } - int type = platform::ToNCCLDataType(in_tensor.type()); - size_t numel = static_cast(in_tensor.numel()); broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE(platform::dynload::ncclBcast( diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 64e83acb4dc1995800c4ca3caf81668b24a7c9fe..b2e5399e2376a86c1cd310b29c768832665af87f 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -33,6 +33,8 @@ struct BuildStrategy { GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; std::string debug_graphviz_path_{""}; + + bool enable_data_balance_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..68896c8ac1bae7d4bfcfa79cc8ec5c26bf2d93ee --- /dev/null +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
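The new BuildStrategy::enable_data_balance_ flag defaults to false; a caller that wants the balancing pass inserted for read ops opts in when constructing the strategy, as in the sketch below (how the strategy object reaches MultiDevSSAGraphBuilder follows the existing plumbing and is not shown).

#include "paddle/fluid/framework/details/build_strategy.h"

// Sketch only: request cross-device data balancing for multi-place execution.
paddle::framework::details::BuildStrategy MakeDataBalancedStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  strategy.enable_data_balance_ = true;  // off by default
  return strategy;
}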
+ +#include "paddle/fluid/framework/details/data_balance_op_handle.h" +#include +#include "paddle/fluid/framework/details/container_cast.h" + +namespace paddle { +namespace framework { +namespace details { + +#ifdef PADDLE_WITH_CUDA +DataBalanceOpHandle::DataBalanceOpHandle( + const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs) + : local_scopes_(local_scopes), places_(places) { + if (ctxs) { + for (auto &p : places_) { + this->dev_ctxes_[p] = ctxs->DevCtx(p); + } + } +} +#else +DataBalanceOpHandle::DataBalanceOpHandle( + const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif + +std::string DataBalanceOpHandle::Name() const { return "data balance"; } + +std::vector> DataBalanceOpHandle::GetBalancePlan( + const std::vector &device_sizes) { + int device_num = device_sizes.size(); + int total_size = 0; + int empty_num = 0; + std::vector> size_device_vec; + size_device_vec.reserve(device_num); + for (int i = 0; i < device_num; ++i) { + if (device_sizes[i] == 0) { + ++empty_num; + } + total_size += device_sizes[i]; + size_device_vec.push_back({{device_sizes[i], i}}); + } + std::vector> res; + if (empty_num == 0) { + // No need to do data balance. + return res; + } + if (total_size < device_num) { + // No enough data. + PADDLE_THROW_EOF(); + } + std::sort(size_device_vec.begin(), size_device_vec.end(), + [](const std::array &a, const std::array &b) { + return a[0] > b[0]; + }); + int expected_device_size = total_size / device_num; + int src_idx = 0; + for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) { + if (size_device_vec[src_idx][0] <= expected_device_size) { + ++src_idx; + PADDLE_ENFORCE_LT( + src_idx, device_num - empty_num, + "In current srategy an empty tensor should not be copy source."); + } + size_device_vec[src_idx][0] -= expected_device_size; + size_device_vec[dst_idx][0] += expected_device_size; + res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1], + expected_device_size}}); + } + return res; +} + +void DataBalanceOpHandle::RunImpl() { + PADDLE_ENFORCE_GT(places_.size(), 1, + "Data balance can only be enabled when the number of " + "places to run larger than 1."); + auto in_var_handles = DynamicCast(inputs_); + auto out_var_handles = DynamicCast(outputs_); + PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + int data_num = in_var_handles.size() / places_.size(); + WaitInputVarGenerated(); + std::vector> lod_tensors(data_num); + std::vector device_sizes; + for (int i = 0; i < static_cast(in_var_handles.size()); ++i) { + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + int place_idx = i / data_num; + int data_idx = i % data_num; + auto *local_scope = + local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get(); + auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_); + PADDLE_ENFORCE(tensor_var->IsType()); + auto *tensor = tensor_var->GetMutable(); + lod_tensors[data_idx].push_back(tensor); + int ins_size = + tensor->lod().empty() ? 
tensor->dims()[0] : tensor->NumElements(); + if (data_idx == 0) { + device_sizes.emplace_back(ins_size); + } else { + PADDLE_ENFORCE_EQ( + ins_size, device_sizes.at(place_idx), + "All data on the same device shall have the same batch size."); + } + } + const auto &balance_plan = GetBalancePlan(device_sizes); + + for (const auto &trans : balance_plan) { + for (int data_idx = 0; data_idx < data_num; ++data_idx) { + LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]]; + LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]]; + int trans_ins_size = trans[2]; + LoD src_lod = src_tensor->lod(); + int src_ins_size = + src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements(); + int cut_point = src_ins_size - trans_ins_size; + if (!src_lod.empty()) { + for (auto &level : src_lod) { + cut_point = level[cut_point]; + } + } + TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]), + dst_tensor->place(), dst_tensor); + src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point)); + if (!src_lod.empty()) { + dst_tensor->set_lod(SliceInLevel( + src_lod, 0, src_ins_size - trans_ins_size, src_ins_size)); + src_tensor->set_lod( + SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size)); + } + } + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..76a407e3610e8bb48facf1f814779f4c23f92d98 --- /dev/null +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
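RunImpl above gathers the per-device batch sizes, asks GetBalancePlan for a list of (src_device, dst_device, size) transfers, and applies each transfer by slicing the tail off the source LoDTensor, copying it to the empty destination with TensorCopySync, and fixing up the LoD on both sides. The planning step is self-contained; below is a standalone host sketch (the real handle raises an EOF exception when there are fewer instances than devices).

#include <algorithm>
#include <array>
#include <iostream>
#include <vector>

// Move instances from the largest batches so that every device that received
// an empty batch ends up with roughly total_size / device_num instances.
std::vector<std::array<int, 3>> GetBalancePlan(const std::vector<int>& sizes) {
  int device_num = static_cast<int>(sizes.size());
  int total_size = 0, empty_num = 0;
  std::vector<std::array<int, 2>> size_device;
  for (int i = 0; i < device_num; ++i) {
    if (sizes[i] == 0) ++empty_num;
    total_size += sizes[i];
    size_device.push_back({{sizes[i], i}});
  }
  std::vector<std::array<int, 3>> plan;      // {src_dev, dst_dev, trans_size}
  if (empty_num == 0) return plan;           // nothing to balance
  if (total_size < device_num) return plan;  // real code throws EOF here
  std::sort(size_device.begin(), size_device.end(),
            [](const std::array<int, 2>& a, const std::array<int, 2>& b) {
              return a[0] > b[0];
            });
  int expected = total_size / device_num;
  int src = 0;
  for (int dst = device_num - empty_num; dst < device_num; ++dst) {
    if (size_device[src][0] <= expected) ++src;
    size_device[src][0] -= expected;
    size_device[dst][0] += expected;
    plan.push_back({{size_device[src][1], size_device[dst][1], expected}});
  }
  return plan;
}

int main() {
  // Three devices, the last one got an empty batch: expect one transfer of
  // two instances from device 0 to device 2.
  for (const auto& t : GetBalancePlan({4, 3, 0})) {
    std::cout << "move " << t[2] << " instances: dev " << t[0] << " -> dev "
              << t[1] << "\n";
  }
  return 0;
}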
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct DataBalanceOpHandle : public OpHandleBase { + public: +#ifdef PADDLE_WITH_CUDA + DataBalanceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs); +#else + DataBalanceOpHandle(const std::vector &local_scopes, + const std::vector &places); +#endif + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + private: + // std::vector<(src_dev_id, dst_dev_id, trans_size)> + std::vector> GetBalancePlan( + const std::vector &batch_size_per_device); + + const std::vector local_scopes_; + const std::vector places_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 224e8e1f6efd7a894591ac51c929517cae7539ce..d646c944601e81477787740189d7ac60ae97fa80 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -67,8 +67,8 @@ void FetchOpHandle::RunImpl() { #endif } else { tensors_[i].ShareDataWith(t); - tensors_[i].set_lod(t.lod()); } + tensors_[i].set_lod(t.lod()); } this->WaitAndMergeCPUTensors(); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 78356cb1be3bd089c26dde663275e2c8109df951..b82c2ef4082110f1621eb38d50361396511a4825 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/data_balance_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" @@ -57,6 +58,12 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( for (auto &p : params) { grad_names_.insert(GradVarName(p)); } + balance_vars_.resize(places_.size(), 0); + if (strategy_.enable_data_balance_ && places_.size() == 1) { + LOG(WARNING) << "It is no need to enable data balance when there is only " + "one place. 
enable_data_balance is set to False."; + strategy_.enable_data_balance_ = false; + } } void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, @@ -140,11 +147,30 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( checker(op.InputArgumentNames(), recv_vars); } +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + std::unique_ptr MultiDevSSAGraphBuilder::Build( const ProgramDesc &program) const { - std::unordered_map all_vars; for (auto *var : program.Block(0).AllVars()) { - all_vars[var->Name()] = var; + all_vars_.emplace(var->Name(), var); } auto graph = new SSAGraph(); @@ -161,35 +187,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( auto send_vars = FindDistTrainSendVars(program); auto recv_vars = FindDistTrainRecvVars(program); - std::vector> var_name_on_devices; std::vector> bcast_var_name_set; - var_name_on_devices.resize(places_.size()); bcast_var_name_set.resize(places_.size()); size_t cur_device_id = 0; - std::vector balance_grads(places_.size(), 0); - - auto get_appropriate_dev = [&](std::string &g_name) -> size_t { - auto var_desc = all_vars.at(g_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GE(numel, 0); - auto smallest = - std::min_element(std::begin(balance_grads), std::end(balance_grads)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_grads), smallest)); - balance_grads[dev_id] += numel; - return dev_id; - }; - bool is_forwarding = true; + for (auto *op : program.Block(0).AllOps()) { if (boost::get( op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - // append rpc op if program is distributed trainer main program. - // always use the first device CreateRPCOp(&result, *op); } else if (IsDistTrainOp(*op, send_vars, recv_vars)) { CreateDistTrainOp(&result, *op); @@ -199,53 +206,70 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( BuildStrategy::GradientScaleStrategy::kCustomized) { CreateScaleLossGradOp(&result); } + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); - if (op_dev_id == -1) { // var on all device - CreateComputationalOps(&result, *op, places_.size()); - } else { + int op_dev_id = GetOpDeviceID(*op); + if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, *op, op_dev_id); for (auto &var_name : op->OutputArgumentNames()) { - var_name_on_devices[op_dev_id].emplace(var_name); + var_name_on_devices_.emplace(var_name, op_dev_id); } - } - if (!is_forwarding && places_.size() > 1) { - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. 
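GetAppropriateDeviceID above is a greedy load balancer: it sums the element counts of the variables to be placed, picks the device whose accumulated total (balance_vars_) is currently smallest, and charges the new variables to that device. A host-only sketch of the same heuristic:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Pick the least-loaded device and account for the newly placed elements.
size_t PickDevice(std::vector<int64_t>* load_per_device, int64_t numel_sum) {
  auto smallest =
      std::min_element(load_per_device->begin(), load_per_device->end());
  size_t dev_id =
      static_cast<size_t>(std::distance(load_per_device->begin(), smallest));
  (*load_per_device)[dev_id] += numel_sum;
  return dev_id;
}

int main() {
  std::vector<int64_t> load(4, 0);  // four devices, initially idle
  // Successive gradients land on different devices, keeping the load even.
  for (int64_t numel : {1 << 20, 1 << 18, 1 << 19}) {
    std::cout << "numel " << numel << " -> device " << PickDevice(&load, numel)
              << "\n";
  }
  return 0;
}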
- if (static_cast(boost::get(op->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = - boost::get>(op->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = get_appropriate_dev(g_name); - CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices[cur_device_id].emplace(g_name); - bcast_var_name_set[cur_device_id].emplace(p_name); - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(all_vars, g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; + } else { + // This op runs on all devices, and its output may have parameter's + // gradients. + if (op->Type() == "read" && strategy_.enable_data_balance_) { + op->SetAttr("throw_eof_exp", false); + CreateComputationalOps(&result, *op, places_.size()); + const auto &data_var_names = op->Output("Out"); + InsertDataBalanceOp(&result, data_var_names); + } else { + CreateComputationalOps(&result, *op, places_.size()); + } + + if (!is_forwarding && places_.size() > 1) { + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + if (static_cast(boost::get(op->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward))) { + try { + auto backward_vars = + boost::get>(op->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + var_name_on_devices_.emplace(g_name, cur_device_id); + bcast_var_name_set[cur_device_id].emplace(p_name); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; + } } + } catch (boost::bad_get e) { } - } catch (boost::bad_get e) { } } } @@ -261,7 +285,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } /* Dependency graph has been constructed. However, there are still data - harzaeds need to be handled. + hazards need to be handled. 
*/ PolishGraphToSupportDataHazards(&result); @@ -273,11 +297,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( return std::unique_ptr(graph); } -bool MultiDevSSAGraphBuilder::IsSparseGradient( - const std::unordered_map &all_vars, - const std::string &og) const { - PADDLE_ENFORCE(all_vars.count(og) != 0); - if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { return true; } return false; @@ -345,12 +367,35 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); - auto var = new VarHandle(vars.size() - 1, i, og, p); + auto var = new VarHandle(vars.size(), i, og, p); vars.emplace_back(var); op_handle->AddOutput(var); } } +void MultiDevSSAGraphBuilder::InsertDataBalanceOp( + SSAGraph *result, const std::vector &datas) const { +#ifdef PADDLE_WITH_CUDA + result->ops_.emplace_back( + new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_)); +#else + result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_)); +#endif + auto *op_handle = result->ops_.back().get(); + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + for (const std::string &d_name : datas) { + auto &vars = result->vars_[i][d_name]; + PADDLE_ENFORCE(!vars.empty()); + op_handle->AddInput(vars.back().get()); + auto var = new VarHandle(vars.size(), i, d_name, p); + vars.emplace_back(var); + op_handle->AddOutput(var); + } + } +} + bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( const std::string &og, std::unordered_set *og_has_been_broadcast) const { @@ -363,24 +408,23 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID( - const std::vector> &var_name_on_devices, - const OpDesc &op) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int var_dev_id = -1; - for (auto &var_name : op.InputArgumentNames()) { - if (var_dev_id != -1) break; - for (size_t i = 0; i < var_name_on_devices.size(); ++i) { - if (var_name_on_devices[i].count(var_name)) { - var_dev_id = static_cast(i); - break; - } + for (auto &varname : op.InputArgumentNames()) { + int dev_id = GetVarDeviceID(varname); + if (dev_id != -1) { + return dev_id; } } - return var_dev_id; + return -1; +} + +int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { + auto got = var_name_on_devices_.find(varname); + return got == var_name_on_devices_.end() ? -1 : got->second; } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { @@ -442,13 +486,14 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, op_handle->AddInput(prev_grad.get()); } auto &vars = result->vars_[dst_dev_id][og]; - auto var = - new VarHandle(vars.size() - 1, dst_dev_id, og, places_[dst_dev_id]); + auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]); vars.emplace_back(var); op_handle->AddOutput(var); return var; } +// Find the first occurence of `prev_op_name` and make current `op` depend +// on it. 
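GetOpDeviceID and GetVarDeviceID above collapse the old vector of per-device name sets into a single name-to-device map: under the kReduce strategy an op is pinned to the device of its first already-placed input, and -1 still means "replicate the op on every device". A minimal sketch of that lookup, with an illustrative PlacementMap class and made-up variable names:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Records which device each variable was produced on and places an op on the
// device of its first already-placed input, or -1 for "run on all devices".
class PlacementMap {
 public:
  void SetVarDevice(const std::string &name, int dev_id) {
    var_dev_.emplace(name, dev_id);
  }

  int GetVarDevice(const std::string &name) const {
    auto it = var_dev_.find(name);
    return it == var_dev_.end() ? -1 : it->second;
  }

  int GetOpDevice(const std::vector<std::string> &inputs) const {
    for (const auto &in : inputs) {
      int dev = GetVarDevice(in);
      if (dev != -1) return dev;  // the first placed input decides
    }
    return -1;  // nothing placed yet: replicate the op
  }

 private:
  std::unordered_map<std::string, int> var_dev_;
};

int main() {
  PlacementMap placement;
  placement.SetVarDevice("fc_0.w_0@GRAD", 1);
  std::cout << placement.GetOpDevice({"x", "fc_0.w_0@GRAD"}) << "\n";  // 1
  std::cout << placement.GetOpDevice({"x", "y"}) << "\n";              // -1
  return 0;
}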
void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, const std::string &prev_op_name) const { for (auto &prev_op : result->ops_) { @@ -463,16 +508,70 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const { - CreateComputationalOp(result, op, 0); + int op_dev_id = -1; + if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); + for (auto &varname : op.InputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else if (op.Type() == "concat") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else { + PADDLE_ENFORCE( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", op.Type()); + + CreateComputationalOp(result, op, op_dev_id); if (op.Type() == "concat") { ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); } } +// Create RPC related op handles that connects its in ops and out ops. void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, const OpDesc &op) const { - result->ops_.emplace_back( - new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); + int op_dev_id = -1; + if (op.Type() == "send") { + op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + // the variable name which contains .block means it was splited by + // split_byref op + // so that we can balance the variable blocks to all the pserver + // instances. + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && + op.InputArgumentNames()[0].find(".block") == std::string::npos) { + op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); + for (auto &varname : op.InputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } + } else if (op.Type() == "recv") { + op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames()); + for (auto &varname : op.OutputArgumentNames()) { + var_name_on_devices_.emplace(varname, op_dev_id); + } + } else { + // send_barrier and fetch_barrier op can be scheduled on device 0 + op_dev_id = 0; + } + + PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", + op.Type()); + + result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id], + op.Type(), places_[op_dev_id])); if (op.Type() == "send_barrier") { ConnectOp(result, result->ops_.back().get(), "send"); @@ -488,9 +587,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, "send, send_barrier. 
recv, fetch_barrier]"); } - // TODO(Yancey1989): schedule rpc op on different place may - // increate throughput - CreateOpHandleIOs(result, op, 0); + CreateOpHandleIOs(result, op, op_dev_id); } bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 78581755fe4890800636944d6cd89875a852cc19..a964e024885e56693224a6199e00ff30beaa1df4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -47,10 +47,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #endif std::unique_ptr Build(const ProgramDesc &program) const override; + int GetVarDeviceID(const std::string &varname) const override; private: void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - size_t place_id) const; + size_t device_id) const; private: std::string loss_var_name_; @@ -96,21 +97,26 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID( - const std::vector> &var_name_on_devices, - const OpDesc &op) const; + int GetOpDeviceID(const OpDesc &op) const; void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertDataBalanceOp(SSAGraph *result, + const std::vector &datas) const; + void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; - bool IsSparseGradient( - const std::unordered_map &all_vars, - const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; private: BuildStrategy strategy_; + mutable std::unordered_map all_vars_; + mutable std::unordered_map var_name_on_devices_; + mutable std::vector balance_vars_; void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index f79565fe71c4aef140475c922cbbf5a1e0b7fe03..d80bdcf15d798925c137460125964d3d7e65f67e 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
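CreateRPCOp and CreateDistTrainOp above stop forcing every distributed op onto device 0: a send runs where its input gradient already lives (or is load-balanced when the variable was not split into .block pieces under all-reduce), a recv spreads its outputs with the balancer, and the barrier ops stay on device 0. A simplified decision table for the RPC part; PlaceRpcOp, device_of_input and pick_balanced are illustrative stand-ins for GetVarDeviceID and GetAppropriateDeviceID:

#include <iostream>
#include <string>

// Simplified placement policy for RPC ops. device_of_input is -1 when the
// input variable has not been pinned to a device yet.
int PlaceRpcOp(const std::string &op_type, int device_of_input,
               int pick_balanced) {
  if (op_type == "send") {
    // A split gradient block already lives on some device; send from there,
    // otherwise fall back to the load balancer.
    return device_of_input != -1 ? device_of_input : pick_balanced;
  }
  if (op_type == "recv") {
    // Received parameters have no producer yet; spread them across devices.
    return pick_balanced;
  }
  // send_barrier / fetch_barrier carry no tensors; device 0 is fine.
  return 0;
}

int main() {
  std::cout << PlaceRpcOp("send", 2, 1) << "\n";            // 2
  std::cout << PlaceRpcOp("recv", -1, 1) << "\n";           // 1
  std::cout << PlaceRpcOp("fetch_barrier", -1, 1) << "\n";  // 0
  return 0;
}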
- #include "paddle/fluid/framework/details/op_handle_base.h" +#include namespace paddle { namespace framework { @@ -58,8 +58,10 @@ void OpHandleBase::Run(bool use_cuda) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { #ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_NOT_NULL(waited_ctx); if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { for (auto &dev_ctx : dev_ctxes_) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); dev_ctx.second->Wait(); } } else { @@ -122,7 +124,6 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_CUDA if (!events_.empty()) { // Use event std::function method = callback; - for (auto &p : dev_ctxes_) { method = [method, p, this]() { static_cast(p.second)->RecordEvent( diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index fbd90a3296bca92b097cab925b218b91e7f4752f..6aec178831161f8ac1306fc3ed72e3267ca3c7e5 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once +#include #include #include - #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" @@ -92,9 +92,7 @@ class OpHandleBase { std::vector inputs_; std::vector outputs_; - std::unordered_map - dev_ctxes_; + std::map dev_ctxes_; #ifdef PADDLE_WITH_CUDA std::unordered_map events_; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index a6ffb37313a88120bc9e8d5ce326f60aeebdff69..c0cd873a1d83fa8c2c7b7cd5acfaad9949bcff7d 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -54,8 +54,7 @@ struct ReduceLoDTensor { inline void GatherSelectedRows( const std::vector &src_selecte_rows_, const std::vector &in_places, - const std::unordered_map &dev_ctxes, + const std::map &dev_ctxes, const platform::Place &out_place, SelectedRows *dst_selecte_rows) { PADDLE_ENFORCE(!src_selecte_rows_.empty()); diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 5fc12a44b51fae26e5a8f5fdba952d3879e82d0f..18612c3c1b62cf4c2ebdc221c301c59ec81c2da7 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -30,6 +30,7 @@ class SSAGraphBuilder { SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; + virtual int GetVarDeviceID(const std::string &var_name) const = 0; DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 304b221e7e4c414a0ab562a1b99836d3b7c02efb..331aa9d2b5864c470dbd5e29ef6faccffdcf781c 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -16,6 +16,8 @@ #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include + namespace paddle { namespace framework { namespace details { @@ -33,6 +35,10 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { return graph; } + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + bool IsValidGraph(const SSAGraph* graph) const; private: diff --git 
a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index b4c90013789759d17646d95efdc81fc6a0a4f3e7..09b0333ef2cb43a306133aa5af98d37c11454d4d 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include "paddle/fluid/framework/details/ssa_graph_builder.h" namespace paddle { @@ -55,6 +56,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { return graph; } + int GetVarDeviceID(const std::string& var_name) const override { + return builder_->GetVarDeviceID(var_name); + } + private: std::unique_ptr printer_; std::unique_ptr builder_; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 6c5098ce85b784a3edcf8f48d2cc828aabd8e161..99b10254a7961bf7b27b256acaece573a71c4115 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -96,10 +96,20 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { + std::lock_guard l(exception_mu_); if (exception_) { - auto exp = *exception_; - exception_.reset(); - throw exp; + std::exception *exp = exception_.get(); + if (dynamic_cast(exp)) { + auto e = *static_cast(exp); + exception_.reset(); + throw e; + } else if (dynamic_cast(exp)) { + auto e = *static_cast(exp); + exception_.reset(); + throw e; + } else { + LOG(FATAL) << "Unknown exception."; + } } else { continue; } @@ -198,7 +208,14 @@ void ThreadedSSAGraphExecutor::RunOp( running_ops_--; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; + } catch (platform::EOFException ex) { + std::lock_guard l(exception_mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(ex)); + } } catch (platform::EnforceNotMet ex) { + std::lock_guard l(exception_mu_); exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { LOG(FATAL) << "Unknown exception catched"; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 4a2075f1cccb3211316567197da56c01d26f35ce..c69e0487e2e503a0d445300aa2fd6bb9c30b06c9 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -56,7 +56,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::unique_ptr exception_; + std::mutex exception_mu_; + std::unique_ptr exception_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e15232a77bb9c3e325b55737ea7abc55e3121708..84f67fafa19ac545ebb7a1019059e3c74c363c56 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,9 +20,7 @@ limitations under the License. 
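The ThreadedSSAGraphExecutor hunk above widens the stored exception from EnforceNotMet to a plain std::exception guarded by a mutex, so worker threads can record either an EOFException or an EnforceNotMet and the scheduling loop can rethrow whichever concrete type was captured; an EOF never overwrites a previously recorded hard error. A self-contained sketch of that hand-off with stand-in exception types (EofError and EnforceError are not the Paddle classes):

#include <iostream>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <thread>

// Stand-ins for platform::EOFException and platform::EnforceNotMet.
struct EofError : std::runtime_error { using std::runtime_error::runtime_error; };
struct EnforceError : std::runtime_error { using std::runtime_error::runtime_error; };

class ExceptionHolder {
 public:
  void CatchEof(const EofError &e) {
    std::lock_guard<std::mutex> l(mu_);
    // EOF must not cover up an already recorded hard error.
    if (exception_ == nullptr) exception_.reset(new EofError(e));
  }

  void CatchEnforce(const EnforceError &e) {
    std::lock_guard<std::mutex> l(mu_);
    exception_.reset(new EnforceError(e));
  }

  // Scheduler side: rethrow with the original concrete type.
  void RethrowIfAny() {
    std::lock_guard<std::mutex> l(mu_);
    if (exception_ == nullptr) return;
    std::exception *exp = exception_.get();
    if (auto *eof = dynamic_cast<EofError *>(exp)) {
      EofError e = *eof;
      exception_.reset();
      throw e;
    }
    if (auto *enf = dynamic_cast<EnforceError *>(exp)) {
      EnforceError e = *enf;
      exception_.reset();
      throw e;
    }
  }

 private:
  std::mutex mu_;
  std::unique_ptr<std::exception> exception_;
};

int main() {
  ExceptionHolder holder;
  std::thread worker([&] { holder.CatchEof(EofError("reader reached EOF")); });
  worker.join();
  try {
    holder.RethrowIfAny();
  } catch (const EofError &e) {
    std::cout << "caught: " << e.what() << "\n";
  }
  return 0;
}

A std::exception_ptr with std::rethrow_exception would avoid the dynamic_cast ladder; the explicit casts here simply mirror the shape of the patched code.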
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#ifdef PADDLE_WITH_DISTRIBUTE -#include "paddle/fluid/operators/detail/grpc_client.h" -#endif +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -48,10 +46,16 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { Executor::Executor(const platform::Place& place) : place_(place) {} #ifdef PADDLE_WITH_DISTRIBUTE -void Executor::Complete() { - ::paddle::operators::detail::RPCClient::GetInstance< - ::paddle::operators::detail::GRPCClient>() - ->SendComplete(); +void Executor::BeginPass() { + ::paddle::operators::distributed::RPCClient::GetInstance< + ::paddle::operators::distributed::GRPCClient>() + ->SendBeginPass(); +} + +void Executor::EndPass() { + ::paddle::operators::distributed::RPCClient::GetInstance< + ::paddle::operators::distributed::GRPCClient>() + ->SendEndPass(); } #endif @@ -295,13 +299,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, std::unique_ptr Executor::Prepare( const ProgramDesc& program, int block_id) { - auto* ctx = new ExecutorPrepareContext(program, block_id); + std::unique_ptr ctx( + new ExecutorPrepareContext(program, block_id)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } - return std::unique_ptr(ctx); + return ctx; } std::vector> Executor::Prepare( @@ -320,7 +325,8 @@ std::vector> Executor::Prepare( } void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, - bool create_local_scope, bool create_vars) { + bool create_local_scope, bool create_vars, + bool keep_kids) { Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -343,12 +349,20 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } } platform::DeviceContextPool::Instance().Get(place_)->Wait(); - if (create_vars && create_local_scope) { + if (local_scope != scope) { scope->DeleteScope(local_scope); } else { - // Delete the local scopes created in operators. - scope->DropKids(); + if (!keep_kids) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + scope->DropKids(); + } } + if (FLAGS_benchmark) { VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "Memory used after deleting local scope: " @@ -406,6 +420,9 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } } } +#else + LOG(WARNING) + << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; #endif } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 67a0761dac2a9adcdd0ce2b218c4aa505d688d56..563a4b2bb65dad481a755f67c7f23939816ce8e8 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -46,9 +46,14 @@ class Executor { #ifdef PADDLE_WITH_DISTRIBUTE /* - * Sending signal to pserver to mark current trainer stop. + * Sending signal to pserver to mark current pass started. 
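The new keep_kids argument of RunPreparedContext exists because while_op runs its sub-block through a nested executor: dropping the child scopes immediately would destroy variables that while_grad_op still reads, so the inner run keeps its kids and the outer executor drops them later. A toy scope tree illustrating the flag (Scope and RunOnce here are simplified stand-ins, not the framework classes):

#include <iostream>
#include <memory>
#include <vector>

// Minimal scope tree: each run creates a scratch child scope and, by default,
// drops every child it created when it finishes.
class Scope {
 public:
  Scope *NewChild() {
    kids_.emplace_back(new Scope);
    return kids_.back().get();
  }
  void DropKids() { kids_.clear(); }
  size_t KidCount() const { return kids_.size(); }

 private:
  std::vector<std::unique_ptr<Scope>> kids_;
};

void RunOnce(Scope *scope, bool keep_kids) {
  Scope *local = scope->NewChild();  // scratch space for this run
  (void)local;                       // ... execute ops inside `local` ...
  if (!keep_kids) {
    scope->DropKids();  // default behaviour: free everything created here
  }
  // keep_kids == true: leave the children alive for the outer executor.
}

int main() {
  Scope global;
  RunOnce(&global, /*keep_kids=*/false);
  std::cout << global.KidCount() << "\n";  // 0
  RunOnce(&global, /*keep_kids=*/true);
  std::cout << global.KidCount() << "\n";  // 1
  return 0;
}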
*/ - void Complete(); + void BeginPass(); + + /* + * Sending signal to pserver to mark current pass finished. + */ + void EndPass(); #endif /* @Brief @@ -78,7 +83,7 @@ class Executor { void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope = true, - bool create_vars = true); + bool create_vars = true, bool keep_kids = false); void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, std::map* feed_targets, diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 68fcc104d48b2b39929ed2198a2dd2eabae10e94..2cf14bd371831ab682166f4256d6966b5ab278c8 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -27,6 +27,7 @@ enum AttrType { BOOLEANS = 7; BLOCK = 8; LONG = 9; + BLOCKS = 10; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -46,6 +47,7 @@ message OpDesc { repeated bool bools = 11; optional int32 block_idx = 12; optional int64 l = 13; + repeated int32 blocks_idx = 14; }; message Var { diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc deleted file mode 100644 index a1094976f6c0965ac0a601d7e37575969146fdab..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/init.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include // for strdup -#include -#include -#include - -#include "paddle/fluid/framework/init.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/piece.h" - -namespace paddle { -namespace framework { - -std::once_flag gflags_init_flag; -std::once_flag p2p_init_flag; - -void InitGflags(std::vector argv) { - std::call_once(gflags_init_flag, [&]() { - argv.insert(argv.begin(), "dummy"); - int argc = argv.size(); - char **arr = new char *[argv.size()]; - std::string line; - for (size_t i = 0; i < argv.size(); i++) { - arr[i] = &argv[i][0]; - line += argv[i]; - line += ' '; - } - google::ParseCommandLineFlags(&argc, &arr, true); - VLOG(1) << "Init commandline: " << line; - }); -} - -void InitP2P(std::vector devices) { -#ifdef PADDLE_WITH_CUDA - std::call_once(p2p_init_flag, [&]() { - int count = devices.size(); - for (int i = 0; i < count; ++i) { - for (int j = 0; j < count; ++j) { - if (devices[i] == devices[j]) continue; - int can_acess = -1; - PADDLE_ENFORCE( - cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]), - "Failed to test P2P access."); - if (can_acess != 1) { - LOG(WARNING) << "Cannot enable P2P access from " << devices[i] - << " to " << devices[j]; - } else { - cudaSetDevice(devices[i]); - cudaDeviceEnablePeerAccess(devices[j], 0); - } - } - } - }); -#endif -} - -void InitDevices(bool init_p2p) { - /*Init all available devices by default */ - std::vector devices; -#ifdef PADDLE_WITH_CUDA - try { - int count = platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } catch (const std::exception &exp) { - LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; - } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; -#endif - InitDevices(init_p2p, devices); -} - -void InitDevices(bool init_p2p, const std::vector devices) { - std::vector places; - int count = 0; -#ifdef PADDLE_WITH_CUDA - try { - count = platform::GetCUDADeviceCount(); - } catch (const std::exception &exp) { - LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; - } -#else - LOG(WARNING) - << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; -#endif - - for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i] >= count || devices[i] < 0) { - LOG(WARNING) << "Invalid devices id."; - continue; - } - places.emplace_back(platform::CUDAPlace(devices[i])); - } - if (init_p2p) { - InitP2P(devices); - } - places.emplace_back(platform::CPUPlace()); - platform::DeviceContextPool::Init(places); -#ifndef PADDLE_WITH_MKLDNN - operators::math::SetNumThreads(1); -#endif -} - -void InitGLOG(const std::string &prog_name) { - // glog will not hold the ARGV[0] inside. - // Use strdup to alloc a new string. - google::InitGoogleLogging(strdup(prog_name.c_str())); - google::InstallFailureSignalHandler(); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/framework/init_test.cc deleted file mode 100644 index 928e2d14abea604cf483f4bc1e1c58fbae04dd21..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/init_test.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/init.h" -#include "paddle/fluid/platform/device_context.h" - -TEST(InitDevices, CPU) { - using paddle::framework::InitDevices; - using paddle::platform::DeviceContextPool; - -#ifndef PADDLE_WITH_CUDA - InitDevices(true); - DeviceContextPool& pool = DeviceContextPool::Instance(); - ASSERT_EQ(pool.size(), 1U); -#endif -} - -TEST(InitDevices, CUDA) { - using paddle::framework::InitDevices; - using paddle::platform::DeviceContextPool; - -#ifdef PADDLE_WITH_CUDA - int count = paddle::platform::GetCUDADeviceCount(); - InitDevices(true); - DeviceContextPool& pool = DeviceContextPool::Instance(); - ASSERT_EQ(pool.size(), 1U + static_cast(count)); -#endif -} diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index a56674cbe216e312c4394ef537140122352dc785..cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -51,8 +52,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { } std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code()); - if (!platform::is_cpu_place(t.place())) { LoDTensor tt; framework::TensorCopy(t, platform::CPUPlace(), &tt); @@ -70,7 +69,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? 
t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - os << t.data()[i] << " "; + if (IsType(t.type())) { + os << t.data()[i] << " "; + } else if (IsType(t.type())) { + os << t.data()[i] << " "; + } else { + PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); + } } return os; @@ -85,6 +90,7 @@ std::string LoDToString(const LoD &lod) { LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_begin, elem_end); PADDLE_ENFORCE_LT(elem_end, in[level].size()); LoD res; @@ -380,7 +386,7 @@ void LoDTensor::MergeLoDTensor( LoD new_lod = lod_tensors[0]->lod(); for (size_t i = 1; i < lod_tensors.size(); ++i) { auto *t = lod_tensors[i]; - PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code()); + PADDLE_ENFORCE_EQ(new_type, t->type()); PADDLE_ENFORCE_EQ(new_layout, t->layout()); PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], @@ -388,6 +394,7 @@ void LoDTensor::MergeLoDTensor( new_dim[0] += t->dims()[0]; auto &lod = t->lod(); + PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); for (size_t j = 0; j < lod.size(); ++j) { auto &sub_lod = new_lod[j]; auto &offset = sub_lod.back(); @@ -410,5 +417,38 @@ void LoDTensor::MergeLoDTensor( } } +LoD ConvertToLengthBasedLoD(const LoD &offset_lod) { + LoD length_lod; + length_lod.reserve(offset_lod.size()); + for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) { + std::vector level; + if (offset_lod[lvl].size() > 0) { + level.reserve(offset_lod[lvl].size() - 1); + } + for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) { + level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]); + } + length_lod.push_back(level); + } + return length_lod; +} + +LoD ConvertToOffsetBasedLoD(const LoD &length_lod) { + LoD offset_lod; + offset_lod.reserve(length_lod.size()); + for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) { + std::vector level; + level.reserve(length_lod[lvl].size() + 1); + size_t tmp = 0; + level.push_back(tmp); + for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) { + tmp += length_lod[lvl][idx]; + level.push_back(tmp); + } + offset_lod.push_back(level); + } + return offset_lod; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 1159fee39b0737402c60448dcbe69e7535c9d6e1..4a2729373b5c63176ed1e856f4acf29fd1e73254 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -226,5 +226,19 @@ extern void WriteToRecordIO(recordio::Writer* writer, extern std::vector ReadFromRecordIO( recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx); +/* + * Convert between length-based LoD and offset-based LoD. + * The implementation of LoDTensor class use offset-based LoD. + * However, we want to expose the more user-friendly length-based + * LoD to the Python side instead. 
+ * + * Example: + * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]] + * then length_lod = [[2, 1], [3, 2, 4]] + */ +LoD ConvertToLengthBasedLoD(const LoD& offset_lod); + +LoD ConvertToOffsetBasedLoD(const LoD& length_lod); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 2ceffc93319359683e87e7fec2d18784c9bf02f3..38d3cd96d65f0a54b0ea87b4c677013f3802adfb 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -26,6 +26,20 @@ namespace paddle { namespace framework { +TEST(LoD, PrintLoDTensor) { + LoDTensor tensor1; + tensor1.mutable_data(platform::CPUPlace()); + tensor1.data()[0] = 0.2; + tensor1.data()[1] = 0.5; + LOG(INFO) << tensor1; + + LoDTensor tensor2; + tensor2.mutable_data(platform::CPUPlace()); + tensor2.data()[0] = 1; + tensor2.data()[1] = 2; + LOG(INFO) << tensor2; +} + TEST(LoD, data) { LoD lod{{0, 1, 2}}; lod.push_back({0, 2, 4, 5}); @@ -37,7 +51,7 @@ TEST(LoD, data) { } } -TEST(LodExpand, test) { +TEST(LoD, ExpandLoD) { LoD lod{{0, 2}}; LoDTensor tensor; tensor.set_lod(lod); @@ -228,6 +242,38 @@ TEST(LoD, CheckAbsLoD) { ASSERT_FALSE(CheckAbsLoD(abs_lod0)); } +TEST(LoD, ConvertToLengthBasedLoD) { + LoD offset_lod; + offset_lod.push_back(std::vector({0, 2})); + offset_lod.push_back(std::vector({0, 1, 3})); + offset_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD length_lod = ConvertToLengthBasedLoD(offset_lod); + + LoD expected; + expected.push_back(std::vector({2})); + expected.push_back(std::vector({1, 2})); + expected.push_back(std::vector({2, 2, 1})); + + EXPECT_EQ(length_lod, expected); +} + +TEST(LoD, ConvertToOffsetBasedLoD) { + LoD length_lod; + length_lod.push_back(std::vector({2})); + length_lod.push_back(std::vector({1, 2})); + length_lod.push_back(std::vector({2, 2, 1})); + + LoD offset_lod = ConvertToOffsetBasedLoD(length_lod); + + LoD expected; + expected.push_back(std::vector({0, 2})); + expected.push_back(std::vector({0, 1, 3})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(offset_lod, expected); +} + template static void TestRecordIO() { LoDTensor tensor; diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index e3efbe4c464493af87e33510647d8c67d457a76d..b9950627ca378cb9607681799bd7fe5bfce2bf50 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -17,9 +17,9 @@ #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" __global__ void test(size_t* a, int size) { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f92769192c218eb7cdc2350ff6e4721b45005806..a190199f1cb1361f67f20c755b8e7ef52c284adc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { need_update_ = true; } +void OpDesc::SetBlocksAttr(const std::string &name, + std::vector blocks) { + this->attrs_[name] = blocks; + need_update_ = true; +} + void OpDesc::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; @@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } + 
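ConvertToLengthBasedLoD and ConvertToOffsetBasedLoD are plain prefix-sum transforms between the two LoD encodings documented above. A compact standalone version using the same worked example; the short function names are just for the sketch:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;

// Offset-based LoD stores cumulative boundaries; length-based LoD stores the
// size of each sequence at every level.
LoD ToLengthBased(const LoD &offset_lod) {
  LoD length_lod;
  for (const auto &level : offset_lod) {
    std::vector<size_t> lengths;
    for (size_t i = 0; i + 1 < level.size(); ++i) {
      lengths.push_back(level[i + 1] - level[i]);
    }
    length_lod.push_back(lengths);
  }
  return length_lod;
}

LoD ToOffsetBased(const LoD &length_lod) {
  LoD offset_lod;
  for (const auto &level : length_lod) {
    std::vector<size_t> offsets{0};
    for (size_t len : level) offsets.push_back(offsets.back() + len);
    offset_lod.push_back(offsets);
  }
  return offset_lod;
}

int main() {
  LoD offsets = {{0, 2, 3}, {0, 3, 5, 9}};
  LoD lengths = ToLengthBased(offsets);       // {{2, 1}, {3, 2, 4}}
  assert(ToOffsetBased(lengths) == offsets);  // the round trip is lossless
  std::cout << lengths[1][2] << "\n";         // 4
  return 0;
}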
void operator()(const std::vector &v) const { + std::vector blocks_idx; + for (auto blk : v) { + blocks_idx.push_back(blk->ID()); + } + VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); + } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(int64_t v) const { attr_->set_l(v); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index a02d3e269129596f65a2fb346e76c1af7fbead95..74dd8ec002005dd080424b48b5db1a2574a6974f 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -77,6 +77,8 @@ class OpDesc { void SetBlockAttr(const std::string &name, BlockDesc *block); + void SetBlocksAttr(const std::string &name, std::vector blocks); + Attribute GetAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const; diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index f1261dee0319440995951d1bee145404186a8ad4..af75baa5c4b98f7d092834c05eb57e9c7e131b29 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -21,8 +21,8 @@ namespace framework { // a static local variable is already being initialized. // https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex OpInfoMap& OpInfoMap::Instance() { - static OpInfoMap* g_op_info_map = new OpInfoMap(); - return *g_op_info_map; + static OpInfoMap g_op_info_map; + return g_op_info_map; } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index f51a184e7bae2283f335fe9462a77b9c5fb831a5..c59b232191c49ccb47bb9f51dcaf2fd9280fae19 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { return ret; } -inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) { +inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) { return (!platform::places_are_same_class(l.place_, r.place_)) || (l.data_type_ != r.data_type_) || NeedTransformLayout(l.data_layout_, r.data_layout_); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 43ab227a9478707445892c14723801992d0041aa..e7dfa608b48f89a2155e43c7e63e31154675cd38 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -76,6 +76,20 @@ class OpRegistry { template struct OpKernelRegistrarFunctor; +template +inline void RegisterKernelClass(const char* op_type, const char* library_type, + Func func) { + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + StringToDataLayout(data_layout), + StringToLibraryType(library_type)); + OperatorWithKernel::AllOpKernels()[op_type][key] = func; +} + template struct OpKernelRegistrarFunctor { using KERNEL_TYPE = @@ -83,16 +97,10 @@ struct OpKernelRegistrarFunctor { void operator()(const char* op_type, const char* library_type) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; - std::string library(library_type); - std::string data_layout = "ANYLAYOUT"; - if (library == "MKLDNN") { - data_layout = "MKLDNNLAYOUT"; - } - OpKernelType 
key(ToDataType(std::type_index(typeid(T))), PlaceType(), - StringToDataLayout(data_layout), - StringToLibraryType(library_type)); - OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); - + RegisterKernelClass( + op_type, library_type, [](const framework::ExecutionContext& ctx) { + KERNEL_TYPE().Compute(ctx); + }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; @@ -116,6 +124,47 @@ class OpKernelRegistrar : public Registrar { } }; +template +struct OpKernelRegistrarFunctorEx; + +template +class OpKernelRegistrarEx : public Registrar { + public: + explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { + OpKernelRegistrarFunctorEx + func; + func(op_type, library_type); + } +}; + +template +struct OpKernelRegistrarFunctorEx { + void operator()(const char* op_type, const char* library_type) const {} +}; + +template +struct OpKernelRegistrarFunctorEx { + using Functor = + typename std::tuple_element>::type; + using T = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type) const { + RegisterKernelClass(op_type, library_type, Functor()); + + constexpr auto size = + std::tuple_size>::value; + OpKernelRegistrarFunctorEx= size, I + 2, + DataTypeAndKernelType...> + func; + func(op_type, library_type); + } +}; + /** * check if MACRO is used in GLOBAL NAMESPACE. */ @@ -133,21 +182,15 @@ class OpKernelRegistrar : public Registrar { VarTypeInference InferShapeBase */ -#define REGISTER_OPERATOR(op_type, op_class, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op__##op_type, \ - "REGISTER_OPERATOR must be called in global namespace"); \ - class _OpClass_##op_type##_ : public op_class { \ - public: \ - DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \ - DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ - }; \ - static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \ - ##__VA_ARGS__> \ - __op_registrar_##op_type##__(#op_type); \ - int TouchOpRegistrar_##op_type() { \ - __op_registrar_##op_type##__.Touch(); \ - return 0; \ +#define REGISTER_OPERATOR(op_type, op_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_type, \ + "REGISTER_OPERATOR must be called in global namespace"); \ + static ::paddle::framework::OperatorRegistrar \ + __op_registrar_##op_type##__(#op_type); \ + int TouchOpRegistrar_##op_type() { \ + __op_registrar_##op_type##__.Touch(); \ + return 0; \ } #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ @@ -174,6 +217,25 @@ class OpKernelRegistrar : public Registrar { #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) +#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##__, \ + "REGISTER_OP_KERNEL_EX must be called in global namespace"); \ + static ::paddle::framework::OpKernelRegistrarEx \ + __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ + #library_type); \ + int TouchOpKernelRegistrar_##op_type##_##library_type() { \ + __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ + return 0; \ + } + +#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ + REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ + __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) 
\ + REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + /** * Macro to mark what Operator and Kernel * we will use and tell the compiler to diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index 18b1649cc71d5edd5b07740bbad1fe8f81128898..04996d7b09cecc3c330a47153c9b10310f1792f4 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -193,15 +193,10 @@ TEST(OpRegistry, CustomChecker) { ASSERT_EQ(test_attr, 4); } -class CosineOpComplete : public paddle::framework::CosineOp { - public: - DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp); - DEFINE_OP_CLONE_METHOD(CosineOpComplete); -}; - TEST(OperatorRegistrar, Test) { paddle::framework::OperatorRegistrar< - CosineOpComplete, paddle::framework::CosineOpProtoAndCheckerMaker> + paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker> reg("cos"); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 122ee1dab35b8c7d42392a983b5b15b7c1be7869..d1dc5fcd97b77fb7707c7d48f6eaeef140d3f306 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -592,8 +592,7 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (tensor.type().hash_code() != typeid(float).hash_code() && // NOLINT - tensor.type().hash_code() != typeid(double).hash_code()) { // NOLINT + if (!IsType(tensor.type()) && !IsType(tensor.type())) { return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -620,8 +619,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, "There are no kernels which are registered in the %s operator.", type_); } - ExecutionContext ctx(*this, scope, *dev_ctx); - OpKernelMap& kernels = kernels_iter->second; // TODO(dzhwinter) : kernel fallback mechanism will be added when all the @@ -631,65 +628,54 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // Do selection // } - auto expected_kernel_key = this->GetExpectedKernelType(ctx); + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif if (kernel_iter == kernels.end()) { PADDLE_THROW("op %s does not have kernel for %s", type_, KernelTypeToString(expected_kernel_key)); } - // do data transform - Scope& new_scope = scope.NewScope(); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - std::vector inplace_vars; - for (auto& var_name_item : this->Inputs()) { - for (auto& var_name : var_name_item.second) { - auto* var = scope.FindVar(var_name); - if (var && VarIsTensor(var)) { - auto* tensor_in = GetTensorFromVar(var); - if (tensor_in->IsInitialized()) { - auto kernel_type_for_var = this->GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); - if 
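The registry change above turns the OpKernelMap value type from a heap-allocated kernel object into a std::function, so the classic REGISTER_OP_*_KERNEL path keeps wrapping a kernel class in a lambda while the new *_KERNEL_FUNCTOR macros register bare functors directly. A toy registry showing why the type-erased value makes both styles interchangeable (ExecContext, ReluKernel and the map itself are illustrative):

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct ExecContext { std::string input; };

using KernelFunc = std::function<void(const ExecContext &)>;
using KernelRegistry = std::unordered_map<std::string, KernelFunc>;

KernelRegistry &Kernels() {
  static KernelRegistry registry;  // function-local static, the same idiom as
                                   // the OpInfoMap change above
  return registry;
}

// A "classic" kernel class; at registration time it is wrapped in a lambda.
struct ReluKernel {
  void Compute(const ExecContext &ctx) const {
    std::cout << "relu on " << ctx.input << "\n";
  }
};

int main() {
  Kernels()["relu"] = [](const ExecContext &ctx) { ReluKernel().Compute(ctx); };
  // A functor-style kernel needs no wrapper class at all.
  Kernels()["print"] = [](const ExecContext &ctx) {
    std::cout << "print " << ctx.input << "\n";
  };
  ExecContext ctx{"x"};
  Kernels().at("relu")(ctx);
  Kernels().at("print")(ctx);
  return 0;
}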
(TransFromNeeded(kernel_type_for_var, expected_kernel_key)) { - auto out_var_names = OutputVars(true); - if (std::find(out_var_names.begin(), out_var_names.end(), - var_name) != out_var_names.end()) { - inplace_vars.push_back(var_name); - } - VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; - auto* trans_var = new_scope.Var(var_name); - std::shared_ptr out(new Tensor); - DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in, - out.get()); - CopyVariableWithTensor(*var, *(out.get()), trans_var); - } - } - } - } + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? scope : *transfer_scope); + + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); } - auto* new_dev_ctx = pool.Get(expected_kernel_key.place_); - kernel_iter->second->Compute( - ExecutionContext(*this, new_scope, *new_dev_ctx)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - for (auto& var_name : inplace_vars) { - VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; - auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name)); - original_tensor->ShareDataWith(*transformed_tensor); + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); } /*For profiling/benchmark only*/ if (FLAGS_benchmark) { - new_dev_ctx->Wait(); + dev_ctx->Wait(); } if (FLAGS_check_nan_inf) { for (auto& vname : OutputVars(true)) { - auto* var = new_scope.FindVar(vname); + auto* var = exec_scope.FindVar(vname); if (var == nullptr) continue; if (var->IsType()) { CheckTensorNANOrInf(vname, var->Get()); @@ -697,6 +683,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } +void OperatorWithKernel::TransferInplaceVarsBack( + const Scope& scope, const std::vector& inplace_vars, + const Scope& transfer_scope) const { + for (auto& var_name : inplace_vars) { + VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); + auto* transformed_tensor = + GetTensorFromVar(transfer_scope.FindVar(var_name)); + original_tensor->ShareDataWith(*transformed_tensor); + } +} + +Scope* OperatorWithKernel::TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const { + Scope* new_scope = nullptr; + for (auto& var_name_item : Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope.FindVar(var_name); + // Only tensor can be tranfer to another device. 
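The PADDLE_WITH_MKLDNN block added to RunImpl retries the kernel lookup with a plain, any-layout key when the MKLDNN-specific kernel is missing, instead of failing outright. A small lookup-with-fallback sketch, using string-labelled keys in place of the real OpKernelType:

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <tuple>

struct KernelKey {
  std::string library;  // e.g. "MKLDNN" or "PLAIN"
  std::string layout;   // e.g. "MKLDNN" or "ANY"
  bool operator<(const KernelKey &o) const {
    return std::tie(library, layout) < std::tie(o.library, o.layout);
  }
};

std::string FindKernel(const std::map<KernelKey, std::string> &kernels,
                       KernelKey expected) {
  auto it = kernels.find(expected);
  if (it == kernels.end() && expected.library == "MKLDNN") {
    // No MKLDNN kernel registered for this op: fall back to the plain one.
    expected.library = "PLAIN";
    expected.layout = "ANY";
    it = kernels.find(expected);
  }
  if (it == kernels.end()) throw std::runtime_error("no kernel registered");
  return it->second;
}

int main() {
  std::map<KernelKey, std::string> kernels = {
      {{"PLAIN", "ANY"}, "plain conv2d kernel"}};
  std::cout << FindKernel(kernels, {"MKLDNN", "MKLDNN"}) << "\n";
  return 0;
}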
+ if (var == nullptr || !VarIsTensor(var)) { + continue; + } + + auto* tensor_in = GetTensorFromVar(var); + if (!tensor_in->IsInitialized()) { + continue; + } + + auto kernel_type_for_var = GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + + if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { + continue; + } + + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != + out_var_names.end()) { + transfered_inplace_vars->emplace_back(var_name); + } + + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + + if (new_scope == nullptr) { + new_scope = &scope.NewScope(); + } + + auto* trans_var = new_scope->Var(var_name); + Tensor out; + TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + SetTensorToVariable(*var, out, trans_var); + } + } + + return new_scope; +} proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39..1040eb882baea624e972faf4af3094119df72308 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -121,10 +121,6 @@ class OperatorBase { //! Get all outputs variable names virtual std::vector OutputVars(bool has_intermediate) const; - // Return a new operator instance, which is as same as this. - // Use unique_ptr to prevent caller forget to delete this pointer. - virtual std::unique_ptr Clone() const = 0; - protected: std::string type_; // NOTE: in case of OpGrad, inputs_ contains: @@ -145,37 +141,6 @@ class OperatorBase { const platform::Place& place) const = 0; }; -// Macro for define a clone method. -// If you are writing an kernel operator, `Clone` will be defined when you -// register it. i.e. `Clone` method is not needed to define by yourself. -#define DEFINE_OP_CLONE_METHOD(cls) \ - std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final { \ - return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \ - } - -// Macro for define a default constructor for Operator. -// You can also use -// using PARENT_CLASS::PARENT_CLASS; -// to use parent's constructor. -#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ - cls(const std::string& type, \ - const ::paddle::framework::VariableNameMap& inputs, \ - const ::paddle::framework::VariableNameMap& outputs, \ - const paddle::framework::AttributeMap& attrs) \ - : parent_cls(type, inputs, outputs, attrs) {} - -class NOP : public OperatorBase { - public: - using OperatorBase::OperatorBase; - std::unique_ptr Clone() const override { - return std::unique_ptr(new NOP(*this)); - } - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override {} -}; - class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, @@ -347,9 +312,9 @@ class OpKernel : public OpKernelBase { class OperatorWithKernel : public OperatorBase { public: + using OpKernelFunc = std::function; using OpKernelMap = - std::unordered_map, - OpKernelType::Hash>; + std::unordered_map; OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) @@ -384,6 +349,20 @@ class OperatorWithKernel : public OperatorBase { // same. 
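TryTransferData above creates the transfer scope lazily: inputs that are missing, uninitialized, or already in the expected kernel type are skipped, and nullptr is returned when nothing needed converting so the kernel runs directly on the original scope. A stripped-down sketch of that control flow (Input, TransferScope and TryTransfer are illustrative names, not the framework API):

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Input {
  std::string name;
  bool needs_transform;  // stands in for the NeedTransform(...) check
};

struct TransferScope {
  std::vector<std::string> vars;  // transformed copies would live here
};

std::unique_ptr<TransferScope> TryTransfer(const std::vector<Input> &inputs) {
  std::unique_ptr<TransferScope> scope;
  for (const auto &in : inputs) {
    if (!in.needs_transform) continue;           // skip inputs that are fine as-is
    if (!scope) scope.reset(new TransferScope);  // allocate on first use only
    scope->vars.push_back(in.name);              // transformed tensor goes here
  }
  return scope;  // nullptr: execute the kernel on the original scope
}

int main() {
  auto s1 = TryTransfer({{"x", false}, {"y", false}});
  std::cout << (s1 ? "transfer scope" : "original scope") << "\n";  // original
  auto s2 = TryTransfer({{"x", false}, {"w", true}});
  std::cout << s2->vars.size() << "\n";  // 1
  return 0;
}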
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + + /** + * Transfer data from scope to a transfered scope. If there is no data need to + * be tranfered, it returns nullptr. + * + * * transfered_inplace_vars is a output vector. + */ + Scope* TryTransferData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars) const; + + void TransferInplaceVarsBack(const Scope& scope, + const std::vector& inplace_vars, + const Scope& exec_scope) const; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 74043b5d7990178976baf2fad991ae03f9c8dd25..ac9dd8245ad4e0e8842f219b23d3866b03fdaedb 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/init.h" namespace paddle { namespace framework { @@ -247,26 +247,3 @@ TEST(OpKernel, multi_inputs) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); op->Run(scope, cpu_place); } - -class OperatorClone : public paddle::framework::OperatorBase { - public: - DEFINE_OP_CLONE_METHOD(OperatorClone); - OperatorClone(const std::string& type, - const paddle::framework::VariableNameMap& inputs, - const paddle::framework::VariableNameMap& outputs, - const paddle::framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const paddle::framework::Scope& scope, - const paddle::platform::Place& place) const override {} -}; - -TEST(Operator, Clone) { - paddle::framework::InitDevices(true); - OperatorClone a("ABC", paddle::framework::VariableNameMap{}, - paddle::framework::VariableNameMap{}, - paddle::framework::AttributeMap{}); - auto b = a.Clone(); - ASSERT_EQ(a.Type(), b->Type()); -} diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9406c6155da860c90739bddac1e81403b094e619..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -110,7 +110,6 @@ ParallelExecutor::ParallelExecutor( // Step 3. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - details::SSAGraphBuilderFactory builder_factory( member_->places_, loss_var_name, params, member_->local_scopes_, build_strategy); @@ -122,9 +121,10 @@ ParallelExecutor::ParallelExecutor( #endif } + builder_ = builder_factory.Create(); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, - builder_factory.Create()->Build(main_program))); + builder_->Build(main_program))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), @@ -133,10 +133,23 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { - auto *main_scope = member_->local_scopes_[0]; + // the the initializing bcast, all vars would be bcast from device(0), + // otherwise + // bcast from the specified device. + bool initializing = builder_.get() == nullptr ? true : false; for (auto &var : vars) { - auto *main_var = main_scope->FindVar(var); + int var_dev_id = + builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + if (!initializing && var_dev_id == -1) continue; + + framework::Variable *main_var = nullptr; + if (initializing) { + main_var = member_->local_scopes_[0]->FindVar(var); + } else { + main_var = member_->local_scopes_[var_dev_id]->FindVar(var); + } + if (main_var == nullptr || !main_var->IsType()) { continue; } @@ -151,7 +164,9 @@ void ParallelExecutor::BCastParamsToGPUs( for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; - if (i == 0) { + + if ((initializing && i == 0) || + (!initializing && static_cast(i) == var_dev_id)) { buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; @@ -168,8 +183,16 @@ void ParallelExecutor::BCastParamsToGPUs( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + if (initializing) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } else { + if (var_dev_id >= 0) { + platform::dynload::ncclBcast(buffers[i], numel, data_type, + var_dev_id, nccl_ctx.comm_, + nccl_ctx.stream()); + } + } } member_->nccl_ctxs_->WaitAll(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5247e790649e76567f4527d54499d6e95dac5c27..058f83f07c26224e3180d140630c08a24c40cd80 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -19,12 +19,14 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" + namespace paddle { namespace framework { @@ -68,6 +70,7 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; + std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 0b36f1116d15004b355e854e101abb9ad3297836..5897d320a8b7e5af541098cadff8e78f8324949c 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -13,29 +13,61 @@ // limitations under the License. #include "paddle/fluid/framework/reader.h" +#include namespace paddle { namespace framework { -ReaderBase::~ReaderBase() {} -FileReader::FileReader(const std::vector &dims) : dims_(dims) {} - -void FileReader::ReadNext(std::vector *out) { +void ReaderBase::ReadNext(std::vector *out) { + std::lock_guard lock(mu_); + PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning); ReadNextImpl(out); - if (out->empty()) { - return; - } +} - PADDLE_ENFORCE_EQ(out->size(), dims_.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - auto &actual = (*out)[i].dims(); - auto &expect = dims_[i]; +void ReaderBase::InsertDecoratedReader( + const std::shared_ptr &decorated_reader) { + std::lock_guard guard(mu_); + decorated_readers_.emplace_back(decorated_reader); +} - PADDLE_ENFORCE_EQ(actual.size(), expect.size()); - for (int j = 0; j < actual.size(); ++j) { - // PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1); +std::unordered_set ReaderBase::GetEndPoints() { + std::unordered_set result; + std::deque queue; + queue.emplace_back(this); + while (!queue.empty()) { // BFS search + auto *front = queue.front(); + queue.pop_front(); + if (front->decorated_readers_.empty()) { + result.emplace(front); + } else { + for (auto &reader : front->decorated_readers_) { + if (auto *reader_ptr = reader.lock().get()) { + queue.emplace_back(reader_ptr); + } + } } } + + return result; } + +void ReaderBase::Shutdown() { + std::lock_guard lock(mu_); + if (status_ != ReaderStatus::kStopped) { + ShutdownImpl(); + status_ = ReaderStatus::kStopped; + } +} + +void ReaderBase::Start() { + std::lock_guard lock(mu_); + if (status_ != ReaderStatus::kRunning) { + StartImpl(); + status_ = ReaderStatus::kRunning; + } +} + +ReaderBase::~ReaderBase() { Shutdown(); } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 64d4ceab624312ed366d7e835072899f1f033a88..6c4432cb7a70853e19460b1980d621c02caed970 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/ddim.h" @@ -24,61 +25,116 @@ namespace paddle { namespace framework { +enum ReaderStatus { kRunning, kStopped }; + class ReaderBase { public: - virtual void ReadNext(std::vector* out) = 0; + void ReadNext(std::vector* out); + + void Shutdown(); - virtual void ReInit() = 0; + void Start(); + + // Return the readers which are the end of decorating chain. Basically + // they are readers just before read op. 
+ std::unordered_set GetEndPoints(); virtual ~ReaderBase(); + + protected: + virtual void ReadNextImpl(std::vector* out) = 0; + + virtual void ShutdownImpl() {} + + virtual void StartImpl() {} + + ReaderStatus status_{kRunning}; + + mutable std::mutex mu_; + + private: + friend class DecoratedReader; + // These methods can be only invoked inside DecoratedReader to record the + // decorating chain. + void InsertDecoratedReader( + const std::shared_ptr& decorated_reader); + // A set of which readers that decorated this reader. + std::vector> decorated_readers_; }; -class DecoratedReader : public ReaderBase { +class DecoratedReader : public ReaderBase, + public std::enable_shared_from_this { public: explicit DecoratedReader(const std::shared_ptr& reader) : ReaderBase(), reader_(reader) { PADDLE_ENFORCE_NOT_NULL(reader_); } - void ReInit() override { reader_->ReInit(); } + void RegisterDecorateChain() { + reader_->InsertDecoratedReader(shared_from_this()); + } protected: - std::shared_ptr reader_; -}; - -class FileReader : public ReaderBase { - public: - explicit FileReader(const std::vector& dims); - - void ReadNext(std::vector* out) override; + void ShutdownImpl() override { reader_->Shutdown(); } - protected: - virtual void ReadNextImpl(std::vector* out) = 0; + void StartImpl() override { reader_->Start(); } - private: - std::vector dims_; + std::shared_ptr reader_; }; +// FileReader is just a conceptual class. +class FileReader : public ReaderBase {}; + // The ReaderHolder is used as reader' unified wrapper, // making it easier to access different type reader in Variables. class ReaderHolder { public: - void Reset(ReaderBase* reader) { reader_.reset(reader); } + template + void Reset(const std::shared_ptr& reader) { + auto reader_base = std::dynamic_pointer_cast(reader); + PADDLE_ENFORCE_NOT_NULL(reader_base); + reader_ = reader_base; + } - std::shared_ptr Get() const { return reader_; } + const std::shared_ptr& Get() const { return reader_; } void ReadNext(std::vector* out) { PADDLE_ENFORCE_NOT_NULL(reader_); reader_->ReadNext(out); } - void ReInit() { + + void ResetAll() { + auto end_readers = reader_->GetEndPoints(); + for (auto* reader : end_readers) { + reader->Shutdown(); + } + for (auto* reader : end_readers) { + reader->Start(); + } + } + + void Shutdown() { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->Shutdown(); + } + + void Start() { PADDLE_ENFORCE_NOT_NULL(reader_); - reader_->ReInit(); + reader_->Start(); } + operator const std::shared_ptr&() const { return this->reader_; } + private: std::shared_ptr reader_; }; +template +inline std::shared_ptr MakeDecoratedReader(ARGS&&... args) { + std::shared_ptr reader(new T(std::forward(args)...)); + reader->RegisterDecorateChain(); + return reader; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0d07cb7c1367576084b9494e7758103bb45d1e5 --- /dev/null +++ b/paddle/fluid/framework/reader_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/reader.h" +#include +#include "gtest/gtest.h" + +class StubDecoratedReader : public paddle::framework::DecoratedReader { + public: + explicit StubDecoratedReader(const std::shared_ptr &reader) + : DecoratedReader(reader) {} + + void ReadNextImpl(std::vector *out) override {} +}; + +class StubRootReader : public paddle::framework::ReaderBase { + public: + void ReadNextImpl(std::vector *out) override {} +}; + +TEST(READER, decorate_chain) { + auto root = std::make_shared(); + auto end_point1 = + paddle::framework::MakeDecoratedReader(root); + auto end_point2 = + paddle::framework::MakeDecoratedReader(root); + + { + auto endpoints = root->GetEndPoints(); + ASSERT_EQ(endpoints.size(), 2U); + ASSERT_NE(endpoints.count(end_point1.get()), 0); + ASSERT_NE(endpoints.count(end_point2.get()), 0); + } + + { + auto end_point3 = + paddle::framework::MakeDecoratedReader(root); + ASSERT_EQ(root->GetEndPoints().size(), 3U); + } + { ASSERT_EQ(root->GetEndPoints().size(), 2U); } +} diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index bb2d866c824e0fec1b241caea407a38c88a3cb51..50f374e3703a97f6c1fdb4b14fdeb0b603f9ac86 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -43,48 +43,29 @@ Scope& Scope::NewScope() const { } Variable* Scope::Var(const std::string& name) { - // acquire the lock when new var under this scope std::unique_lock lock(mutex_); - auto* v = FindVarLocally(name); - if (v != nullptr) return v; - - v = new Variable(); - vars_[name].reset(v); - VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); - return v; + return VarInternal(name); } Variable* Scope::Var(std::string* name) { - auto var_name = string::Sprintf("%p.%d", this, vars_.size()); + std::unique_lock lock(mutex_); + auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { - *name = var_name; + *name = new_name; } - return Var(var_name); + return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - // acquire the lock when find var std::unique_lock lock(mutex_); return FindVarInternal(name); } -Variable* Scope::FindVarInternal(const std::string& name) const { - auto var = FindVarLocally(name); - if (var != nullptr) { - return var; - } - return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name); -} - const Scope* Scope::FindScope(const Variable* var) const { - for (auto& kv : vars_) { - if (kv.second.get() == var) { - return this; - } - } - return (parent_ == nullptr) ? 
nullptr : parent_->FindScope(var); + std::unique_lock lock(mutex_); + return FindScopeInternal(var); } + void Scope::DropKids() { std::unique_lock lock(mutex_); for (Scope* s : kids_) delete s; @@ -92,6 +73,7 @@ void Scope::DropKids() { } std::vector Scope::LocalVarNames() const { + std::unique_lock lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -127,6 +109,39 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { + std::unique_lock lock(mutex_); + RenameInternal(origin_name, new_name); +} + +std::string Scope::Rename(const std::string& origin_name) const { + std::unique_lock lock(mutex_); + auto new_name = string::Sprintf("%p.%d", this, vars_.size()); + RenameInternal(origin_name, new_name); + return new_name; +} + +Variable* Scope::VarInternal(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + + v = new Variable(); + vars_[name].reset(v); + VLOG(3) << "Create variable " << name; + v->name_ = &(vars_.find(name)->first); + return v; +} + +const Scope* Scope::FindScopeInternal(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second.get() == var) { + return this; + } + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); +} + +void Scope::RenameInternal(const std::string& origin_name, + const std::string& new_name) const { auto origin_it = vars_.find(origin_name); PADDLE_ENFORCE(origin_it != vars_.end(), "Cannot find original variable with name %s", origin_name); @@ -137,10 +152,12 @@ void Scope::Rename(const std::string& origin_name, vars_.erase(origin_it); } -std::string Scope::Rename(const std::string& origin_name) const { - auto var_name = string::Sprintf("%p.%d", this, vars_.size()); - Rename(origin_name, var_name); - return var_name; +Variable* Scope::FindVarInternal(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); } Variable* Scope::FindVarLocally(const std::string& name) const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 95b4f7c5f66a4161058955c7666be34414f5074c..e246241c0abfbc7bdcaf38d073cc58fc36a4f737 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -88,12 +88,20 @@ class Scope { // Call Scope::NewScope for a sub-scope. explicit Scope(Scope const* parent) : parent_(parent) {} + // Called by Var. + Variable* VarInternal(const std::string& name); + + // Called by FindScope. + const Scope* FindScopeInternal(const Variable* var) const; + + // Called by Rename. + void RenameInternal(const std::string& origin_name, + const std::string& new_name) const; + // Called by FindVar recursively. - // Caller doesn't own the returned Variable. Variable* FindVarInternal(const std::string& name) const; // Called by FindVarInternal and Var. - // Caller doesn't own the returned Variable. Variable* FindVarLocally(const std::string& name) const; // Scope in `kids_` are owned by this class. 
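The Scope refactor above follows a common pattern: every public method acquires the mutex exactly once and then delegates to a private *Internal helper that assumes the lock is already held, so no public method ever calls another public method under the lock. A minimal standalone sketch of the idea; TinyScope and its members are illustrative names, not the real Scope interface:

#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>

class TinyScope {
 public:
  int* Var(const std::string& name) {
    std::unique_lock<std::mutex> lock(mutex_);
    return VarInternal(name);
  }

  // Generates a unique name and creates the variable under the same lock.
  // Calling the public Var(name) here instead would self-deadlock on the
  // non-recursive mutex, which is why the work is routed through VarInternal.
  int* Var(std::string* name) {
    std::unique_lock<std::mutex> lock(mutex_);
    std::string new_name = "var_" + std::to_string(vars_.size());
    if (name != nullptr) *name = new_name;
    return VarInternal(new_name);
  }

 private:
  // Assumes mutex_ is already held by the caller.
  int* VarInternal(const std::string& name) {
    return &vars_[name];  // creates the entry if it does not exist
  }

  mutable std::mutex mutex_;
  std::unordered_map<std::string, int> vars_;
};

int main() {
  TinyScope scope;
  std::string generated;
  scope.Var("w");
  scope.Var(&generated);
  std::cout << "generated name: " << generated << "\n";
  return 0;
}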
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 96114678a9992f2975c4173c7cc003114f04d8df..7f678f869aac4616c8bca440d0431f765da41dd6 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -23,9 +23,9 @@ namespace framework { template inline const T* Tensor::data() const { check_memory_size(); - PADDLE_ENFORCE(std::is_same::value || - holder_->type() == std::type_index(typeid(T)), - "Tensor holds the wrong type, it holds %s", + bool valid = std::is_same::value || + holder_->type() == std::type_index(typeid(T)); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast( @@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } template inline T* Tensor::data() { check_memory_size(); - PADDLE_ENFORCE(std::is_same::value || - holder_->type() == std::type_index(typeid(T)), - "Tensor holds the wrong type, it holds %s", + bool valid = std::is_same::value || + holder_->type() == std::type_index(typeid(T)); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e5bc74755f46449296a153e8b330968e6d9f1e1d..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } } #endif } @@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(src.place())) { - dev_ctx = pool.Get(src.place()); - } else { + if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -23,10 +23,25 @@ limitations under the License. */ namespace paddle { namespace framework { +// NOTE(zcd): Because TensorCopy is an async operation, when the src_place +// and dst_place are two different GPU, to ensure that the operation can +// be carried out correctly, there is a src_ctx wait operation in TensorCopy. 
+// If ctx_place and src_place are the same, src_ctx.Wait() is added +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is added before memory::Copy. void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If the src.place() and dst_place are two different GPU, +// the copy operation is carried out on the dst_place's stream. This is +// very important, because TensorCopy is an async operator, and in most +// case, once this copy operator returns, dst is to be used in dst_place's +// stream, if this copy operation is carried out on the src_place's stream, +// when dst is used in dst_place's stream the copy operation may be +// not completed. void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); + void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 4879209ece9fdfea91e484a4118c00a2a2a2b4f7..e099e40f121ff13657e563eb608feecbca0551be 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -35,7 +35,8 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDesc*, int64_t>; + std::vector, BlockDesc*, int64_t, + std::vector>; using AttributeMap = std::unordered_map; diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 2b646d78f0b23ec3e065c891826856c2341d4ac1..429997c8b89fef7aa164e878095ab3b5c9998e5b 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -24,18 +24,24 @@ limitations under the License. */ namespace paddle { namespace framework { + +template +bool IsType(const std::type_index& type_index) { + return type_index == std::type_index(typeid(T)); +} + inline proto::VarType::Type ToVarType(std::type_index type) { - if (type.hash_code() == typeid(LoDTensor).hash_code()) { + if (IsType(type)) { return proto::VarType_Type_LOD_TENSOR; - } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_LOD_RANK_TABLE; - } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_SELECTED_ROWS; - } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_READER; - } else if (type.hash_code() == typeid(ChannelHolder).hash_code()) { + } else if (IsType(type)) { return proto::VarType_Type_CHANNEL; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 14b81ddfecb8c996ae8709910c022a074e91eb3c..7842168f603885ce7dc87d2a01dfa4f544389faa 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -22,6 +22,17 @@ limitations under the License. 
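The IsType<T> helper introduced in var_type.h replaces raw hash_code comparisons with std::type_index equality. A standalone sketch of the same dispatch style; the VarKind enum and the sample types are illustrative only:

#include <iostream>
#include <typeindex>

struct LoDTensorLike {};
struct SelectedRowsLike {};

// Compare a type_index against a concrete type T.
template <typename T>
bool IsType(const std::type_index& type_index) {
  return type_index == std::type_index(typeid(T));
}

enum class VarKind { kLoDTensor, kSelectedRows, kUnknown };

VarKind ToVarKind(const std::type_index& type) {
  if (IsType<LoDTensorLike>(type)) return VarKind::kLoDTensor;
  if (IsType<SelectedRowsLike>(type)) return VarKind::kSelectedRows;
  return VarKind::kUnknown;
}

int main() {
  std::cout << (ToVarKind(typeid(LoDTensorLike)) == VarKind::kLoDTensor)
            << "\n";  // prints 1
  return 0;
}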
*/ namespace paddle { namespace framework { +class NOP : public OperatorBase { + public: + NOP(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope &scope, + const platform::Place &place) const override {} +}; + class SumOpMaker : public OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index ec16a1c600a3bafc1c4cbbd920360253c106e3a1..1895aea7f98cb1ad12b2ce16545339252349ea37 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,4 +1,4 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor ) # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? cc_library(paddle_fluid_api @@ -28,9 +28,10 @@ endif() if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) - add_subdirectory(analysis) endif() +add_subdirectory(analysis) + if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 50835784440bfa177e38f9760bb4a47ad335a9e1..cdd67fdc929851979fe0a38afe1af74ec7321b8a 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,23 +1,38 @@ -set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init) -cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc - DEPS paddle_fluid) +cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc + fluid_to_data_flow_graph_pass.cc + data_flow_graph_to_fluid_pass.cc + dfg_graphviz_draw_pass.cc + tensorrt_subgraph_pass.cc + tensorrt_subgraph_node_mark_pass.cc + analyzer.cc + helper.cc + DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) -cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) -set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec) +function (inference_analysis_test TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) -cc_test(test_subgraph_splitter - SRCS subgraph_splitter_tester.cc - DEPS analysis paddle_fluid tensor - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) -set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec) + cc_test(${TARGET} + SRCS "${analysis_test_SRCS}" + DEPS analysis + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) + set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) + endif(WITH_TESTING) +endfunction(inference_analysis_test) -cc_test(test_dfg_graphviz_draw_pass - SRCS dfg_graphviz_draw_pass_tester.cc - DEPS analysis - ARGS 
--inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) -set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec) +inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) +inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) +inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) +inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) +inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) +inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) +inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..70adb4a974cc5f9911cb302840bbef7ec2591505 --- /dev/null +++ b/paddle/fluid/inference/analysis/README.md @@ -0,0 +1,58 @@ +# Inference Analysis + +The `inference/analysis` module is used to analyze and optimize the inference program, +it references some philosophy from `LLVM/analysis`, +and make the various optimization features be pluggable and co-exist in a pipeline. + +We borrowed some concepts from LLVM, such as + +- [Pass](./pass.h)es to implement optimization that traverse the inference program, +- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program, +- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph. + +There are some other basic concepts here + +- [Node](./node.h), the node in a `DataFlowGraph`, + - `Function`, the Operator in Fluid, + - `Value`, the Variable in Fluid; +- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline, + +## How it works + +The `inference/analysis` module make all the passes in a pipeline, and works in such way: + +1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc, +2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes, +3. Transform a new ProgramDesc from the modified `DataFlowGraph`. + +The new optimization features can be added as an independent `Pass` and controlled by gflags, +each pass will generate unified debug information or visualization for better debugging. + +## Supported Passes + +### `FluidToDataFlowGraphPass` +Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes, +this should be the first pass of the pipeline. + +### `DataFlowGraphToFluidPass` +Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline. + +### `TensorRTSubgraphNodeMarkPass` +Mark the `Node` that are supported by TensorRT, +this pass will generate a visualization file which can be used for debugging. + +### `TensorRTSubGraphPass` +Split the sub-graph that are can be accelerated by TensorRT. + +### `DFG_GraphvizDrawPass` +This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool. + +It can be used as a helper class that draws the modified graph after each pass. + +## Utilities + +There is some helper legacy/function/class for analysis. 
+ +- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes, +- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes, +there are some implementations in [data_flow_graph.cc](./data_flow_graph.cc) , such as BFS and DFS.. diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4625f008c15300b88ef0bce71cd7d8aa473c9a8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, + "Enable subgraph to TensorRT engine for acceleration"); + +DEFINE_string(inference_analysis_graphviz_log_root, "./", + "Graphviz debuger for data flow graphs."); + +class DfgPassManagerImpl final : public DfgPassManager { + public: + DfgPassManagerImpl() { + // TODO(Superjomn) set the key with pass reprs. + AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { + auto trt_teller = [](const Node* node) { + if (!node->IsFunction()) return false; + return static_cast(node)->func_type() == "mul"; + }; + AddPass("tensorrt-subgraph-marker", + new TensorRTSubgraphNodeMarkPass(trt_teller)); + AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); + } + AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + } + + std::string repr() const override { return "dfg-pass-manager"; } + std::string description() const override { return "DFG pass manager."; } + + private: + void AddPass(const std::string& name, Pass* pass) { + LOG(INFO) << "Adding pass " << name; + Register(name, pass); + AddGraphvizDebugerPass(pass); + } + + // Add the graphviz debuger pass if the parent pass has one. 
+ void AddGraphvizDebugerPass(Pass* pass) { + auto* debuger_pass = pass->CreateGraphvizDebugerPass(); + if (debuger_pass) { + LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]"; + Register(debuger_pass->repr(), debuger_pass); + } + } +}; + +Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); } + +void Analyzer::Run(Argument* argument) { + for (auto& x : data_) { + PADDLE_ENFORCE(x->Initialize(argument)); + x->RunAll(); + PADDLE_ENFORCE(x->Finalize()); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h new file mode 100644 index 0000000000000000000000000000000000000000..e9e14fb1947da059c8d126d3da182ce446f6421e --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +/* + * This file contains Analyzer, an class that exposed as a library that analyze + * and optimize + * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to + * control whether + * an process is applied on the program. + * + * The processes are called Passes in analysis, the Passes are placed in a + * pipeline, the first + * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to + * a data flow + * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow + * graph to a + * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes + * which take a + * node or data flow graph as input. + * + * The Analyzer can be used in two methods, the first is a executable file which + * can be used to + * pre-process the inference model and can be controlled by passing difference + * command flags; + * the other way is to compose inside the inference API as a runtime pre-process + * phase in the + * inference service. + */ + +#include +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this +// flag if not available. +DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); +DECLARE_string(inference_analysis_graphviz_log_root); + +class Analyzer : public OrderedRegistry { + public: + // Register all the pass-managers. + Analyzer(); + + void Run(Argument* argument); + + DISABLE_COPY_AND_ASSIGN(Analyzer); +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..d7c1a72932a39f878add2bb884e280b91d3c38c0 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, main) { + Analyzer analyser; + analyser.Run(&argument); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.cc b/paddle/fluid/inference/analysis/argument.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb0263d5d98e86b612696ebde66d17fb2543809b --- /dev/null +++ b/paddle/fluid/inference/analysis/argument.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/argument.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h new file mode 100644 index 0000000000000000000000000000000000000000..6d316f20bff7a68754b0afec6463bd5d7579227f --- /dev/null +++ b/paddle/fluid/inference/analysis/argument.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines the class Argument, which is the input and output of the + * analysis module. All the fields that needed either by Passes or PassManagers + * are contained in Argument. + * + * TODO(Superjomn) Find some way better to contain the fields when it grow too + * big. + */ + +#pragma once + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * The argument definition of both Pass and PassManagers. + * + * All the fields should be registered here for clearness. + */ +struct Argument { + // The graph that process by the Passes or PassManagers. 
+ std::unique_ptr main_dfg; + + // The original program desc. + std::unique_ptr origin_program_desc; + + // The processed program desc. + std::unique_ptr transformed_program_desc; +}; + +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__) \ + if (UNLIKELY(!(field__))) { \ + LOG(ERROR) << "field " << #field__ << " should be set."; \ + return false; \ + } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 4220451e3caee62caa51af5bc33d6dd3fd891018..d09bf3ed161703b0cf273522921e157c7360a0bc 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -14,12 +14,13 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/inference/analysis/node.h" namespace paddle { namespace inference { namespace analysis { -// It is a better idea that the inputs and outputs of this graph is set manully +// It is a better idea that the inputs and outputs of this graph is set manually // before, but there must be a Pass that helps to prune the unnecessary ops that // do not contribute to the given targets, so in this pass, analysis and get the // inputs and outputs is OK. @@ -49,6 +50,25 @@ void DataFlowGraph::Build() { outputs.push_back(out); } } + + Clean(); +} + +void DataFlowGraph::Clean() { + for (auto &node : nodes.nodes()) { + std::unordered_set inlinks_set(node->inlinks.begin(), + node->inlinks.end()); + std::unordered_set outlinks_set(node->outlinks.begin(), + node->outlinks.end()); + if (inlinks_set.size() < node->inlinks.size()) { + LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; + node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); + } + if (outlinks_set.size() < node->outlinks.size()) { + LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; + node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); + } + } } std::string DataFlowGraph::DotString() const { @@ -57,19 +77,7 @@ std::string DataFlowGraph::DotString() const { // Add nodes for (size_t i = 0; i < nodes.size(); i++) { const Node &node = nodes.Get(i); - switch (node.type()) { - case Node::Type::kValue: - dot.AddNode(node.repr(), node.dot_attrs()); - break; - case Node::Type::kFunction: - dot.AddNode(node.repr(), node.dot_attrs()); - break; - case Node::Type::kFunctionBlock: - dot.AddNode(node.repr(), node.dot_attrs()); - break; - default: - PADDLE_THROW("unsupported Node type %d", static_cast(node.type())); - } + dot.AddNode(node.repr(), node.dot_attrs()); } // Add edges diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 913e344d371ddf3ea05a53c216e5b3bea8f11c7b..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -47,6 +47,10 @@ struct DataFlowGraph { // Output a DOT graph file for debug. std::string DotString() const; + + private: + // Remove duplicate edges and so on. + void Clean(); }; /* @@ -133,17 +137,24 @@ struct GraphTraits { // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. 
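ExtractInputAndOutputOfSubGraph now skips values that are written by nodes inside the sub-graph when collecting inputs (the inlink_in_subgraph check). A standalone sketch of that extraction rule on a toy graph; MiniNode and the wiring below are illustrative, not Paddle's Node, and the value/function type distinction is omitted:

#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

struct MiniNode {
  std::string name;
  std::vector<MiniNode*> inlinks;
  std::vector<MiniNode*> outlinks;
};

std::pair<std::vector<MiniNode*>, std::vector<MiniNode*>>
ExtractInputsAndOutputs(const std::vector<MiniNode*>& subgraph) {
  std::unordered_set<MiniNode*> nodes(subgraph.begin(), subgraph.end());
  std::unordered_set<MiniNode*> inputs, outputs;
  // A value produced by a node inside the sub-graph must not become an input.
  auto produced_inside = [&](MiniNode* n) {
    for (auto* in : n->inlinks)
      if (nodes.count(in)) return true;
    return false;
  };
  for (auto* node : subgraph) {
    for (auto* in : node->inlinks) {
      if (!nodes.count(in) && !produced_inside(in)) inputs.insert(in);
    }
    for (auto* out : node->outlinks) {
      if (!nodes.count(out)) outputs.insert(out);
    }
  }
  return {std::vector<MiniNode*>(inputs.begin(), inputs.end()),
          std::vector<MiniNode*>(outputs.begin(), outputs.end())};
}

int main() {
  MiniNode x{"x"}, op{"op"}, y{"y"};
  x.outlinks = {&op};
  op.inlinks = {&x};
  op.outlinks = {&y};
  y.inlinks = {&op};
  auto io = ExtractInputsAndOutputs({&op});
  std::cout << io.first.size() << " input(s), " << io.second.size()
            << " output(s)\n";  // 1 input(s), 1 output(s)
  return 0;
}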
-std::pair< - std::vector, - std::vector< - Node *>> static ExtractInputAndOutputOfSubGraph(std::vector - &graph) { +static std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::unordered_set nodes(graph.begin(), graph.end()); std::unordered_set inputs; std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; for (auto &node : graph) { for (auto *in : node->inlinks) { - if (!nodes.count(in) && in->type() == Node::Type::kValue) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { inputs.insert(in); } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..29ca008123addf07959b965a4b54bf55b18c401d --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::proto::ProgramDesc; + +std::vector ExtractParameters( + const std::vector>& nodes); + +bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument) + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) + PADDLE_ENFORCE(!argument->transformed_program_desc); + // The transformed_program_desc should inherit all the VarDesc and BlockDesc + // from the original program desc. The operators of the main block(the first + // block) should rewritten by data flow graph. 
+ argument->transformed_program_desc.reset( + new ProgramDesc(*argument->origin_program_desc)); + argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex) + ->clear_ops(); + desc_ = argument->transformed_program_desc.get(); + argument_ = argument; + return true; +} + +bool DataFlowGraphToFluidPass::Finalize() { return true; } + +void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { + auto traits = GraphTraits(graph); + for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { + if (it->deleted()) continue; + + switch (it->type()) { + case Node::Type::kFunction: { + LOG(INFO) << "add function " << it->repr(); + AddFluidOp(&(*it)); + } break; + case Node::Type::kFunctionBlock: { + LOG(INFO) << "add engine op " << it->repr() << " , " + << static_cast(&(*it))->subgraph.size(); + AddEngineOp(&(*it)); + } break; + default: + continue; + } + } +} + +void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { + auto* ori_op = static_cast(node->pb_desc()); + // currently only the main block is analyzed. + auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto* op = main_block->add_ops(); + *op = *ori_op; // copy the attributes, by default, these will not be changed + // by analysis phrase. + // The inputs and outputs of the existing ops are not changed by tensorrt + // subgraph pass. + // NOTE It might be changed by other passes in the long run. +} + +void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, + const framework::proto::BlockDesc& block) { + static int counter{0}; + PADDLE_ENFORCE(node->IsFunctionBlock()); + framework::OpDesc desc; + auto* func = static_cast(node); + + // collect inputs + std::vector io; + for (auto* x : func->inlinks) { + io.push_back(x->name()); + } + desc.SetInput("Xs", io); + + // collect outputs + io.clear(); + for (auto* x : func->outlinks) { + io.push_back(x->name()); + } + desc.SetOutput("Ys", io); + + desc.SetType("tensorrt_engine"); + // Set attrs + SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); + SetAttr(desc.Proto(), "engine_unique_key", + "trt-" + std::to_string(counter++)); + SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "max_workspace", + 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); + node->SetPbMsg(desc.Proto()->SerializeAsString()); +} + +std::vector ExtractParameters( + const std::vector>& nodes) { + std::vector parameters; + for (const auto& node : nodes) { + if (!node->IsValue()) continue; + PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); + framework::proto::VarDesc var; + var.ParseFromString(node->pb_msg()); + if (var.persistable()) { + parameters.push_back(var.name()); + } + } + return parameters; +} + +void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { + // TODO(Superjomn) Here need to expose some arguments for default setting. + PADDLE_ENFORCE(node->IsFunctionBlock()); + auto* block_node = static_cast(node); + framework::proto::BlockDesc proto; + framework::BlockDesc block_desc(nullptr, &proto); + // copy ops. 
+ for (auto* node : block_node->subgraph) { + auto* op = block_desc.AppendOp(); + PADDLE_ENFORCE(!node->pb_msg().empty()); + op->Proto()->ParseFromString(node->pb_msg()); + } + CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); + auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto* op = main_block->add_ops(); + PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); + op->ParseFromString(node->pb_msg()); +} + +namespace { +class DFG_DebuggerPass : public DFG_GraphvizDrawPass { + public: + using Config = DFG_GraphvizDrawPass::Config; + explicit DFG_DebuggerPass(const Config& config) + : DFG_GraphvizDrawPass(config) {} + + std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } + + bool Finalize() override { return true; } +}; +} // namespace + +Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { + return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( + FLAGS_inference_analysis_graphviz_log_root, + "data_flow_graph_to_fluid_graphviz_debugger")); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..edc84b02ed20991e3e7c6c437d2b1fac169bae03 --- /dev/null +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +/* + * This file implements the transformation from fluid ProgramDesc to data flow + * graph. + */ + +#pragma once + +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { +class DataFlowGraphToFluidPass final : public DataFlowGraphPass { + public: + DataFlowGraphToFluidPass() = default; + + bool Initialize(Argument *argument) override; + bool Finalize() override; + + void Run(DataFlowGraph *graph) override; + + std::string repr() const override { return "DFG to fluid"; } + std::string description() const override { + return "Transform a DFG to a Fluid ProgramDesc"; + } + + Pass *CreateGraphvizDebugerPass() const override; + + protected: + // Add a Fluid Op into the ProgramDesc. + void AddFluidOp(Node *node); + // Add a EngineOp into the ProgramDesc. 
+ void AddEngineOp(Node *node); + + private: + framework::proto::ProgramDesc *desc_; + Argument *argument_; +}; +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index dcee75cee50ede1d2b660e88e06544440bd5ef77..d8fc5e580a98f76233f01fdc4d7987311f78ee45 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -27,13 +27,12 @@ namespace inference { namespace analysis { TEST_F(DFG_Tester, Test) { - framework::proto::ProgramDesc new_desc; DataFlowGraph graph; FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; - pass0.Initialize(desc); - pass1.Initialize(&new_desc); + ASSERT_TRUE(pass0.Initialize(&argument)); + ASSERT_TRUE(pass1.Initialize(&argument)); pass0.Run(&graph); pass1.Run(&graph); diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..a6f85484756417e103cbb60bcb664e8b800b9f28 --- /dev/null +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +int DFG_GraphvizDrawPass::counter_{0}; + +void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { + auto content = Draw(graph); + auto dot_path = GenDotPath(); + std::ofstream file(dot_path); + file.write(content.c_str(), content.size()); + file.close(); + + auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; + std::string message; + LOG(INFO) << "draw to " << png_path; + ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); +} + +std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) { + Dot dot; + // Add nodes + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (config_.display_deleted_node || !node.deleted()) { + dot.AddNode(node.repr(), node.dot_attrs()); + } + } + // Add edges + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (!config_.display_deleted_node && node.deleted()) continue; + for (auto &in : node.inlinks) { + if (!config_.display_deleted_node && in->deleted()) continue; + dot.AddEdge(in->repr(), node.repr(), {}); + } + } + return dot.Build(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index 41d4475382befa1bdaf7473520d64005a472a459..17445ab4407a159ca11345bc9a9226b3ad0044f0 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/pass.h" namespace paddle { @@ -32,35 +33,44 @@ namespace analysis { */ class DFG_GraphvizDrawPass : public DataFlowGraphPass { public: - DFG_GraphvizDrawPass(const std::string& dir, const std::string& id) - : dir_(dir), id_(id) {} - - bool Initialize() override { return Pass::Initialize(); } - void Run(DataFlowGraph* graph) override { - auto content = Draw(graph); - std::ofstream file(GenDotPath()); - file.write(content.c_str(), content.size()); - file.close(); - LOG(INFO) << "draw dot to " << GenDotPath(); - } + struct Config { + Config(const std::string &dir, const std::string &id, + bool display_deleted_node = false) + : dir(dir), id(id), display_deleted_node(display_deleted_node) {} + + // The directory to store the .dot or .png files. + const std::string dir; + // The identifier for this dot file. + const std::string id; + // Whether to display deleted nodes, default false. + const bool display_deleted_node; + }; - bool Finalize() override { return Pass::Finalize(); } + explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} - Pass* CreatePrinterPass(std::ostream& os, - const std::string& banner) const override { - return nullptr; + bool Initialize(Argument *argument) override { return true; } + void Run(DataFlowGraph *graph) override; + bool Finalize() override { return true; } + + std::string repr() const override { return "DFG graphviz drawer"; } + std::string description() const override { + return "Debug a DFG by draw with graphviz"; } - private: + protected: + // A counter to add a number prefix to the debugger image output so that they + // will sort in the triggered order. + static int counter_; + // Path of the dot file to output. 
std::string GenDotPath() const { - return dir_ + "/" + "graph_" + id_ + ".dot"; + return config_.dir + "/" + std::to_string(counter_++) + "-graph_" + + config_.id + ".dot"; } - std::string Draw(DataFlowGraph* graph) { return graph->DotString(); } + virtual std::string Draw(DataFlowGraph *graph); - std::string dir_; - std::string id_; + Config config_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 3fc1cc18b855440c54c1ed6a9ab49a104c8c21f0..162455b9c4e06b7fbb4bdede30444faf6a8a1509 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -24,13 +24,14 @@ namespace inference { namespace analysis { TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(desc); - DFG_GraphvizDrawPass pass("./", "test"); - pass.Initialize(); + auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); + DFG_GraphvizDrawPass pass(config); + pass.Initialize(&argument); pass.Run(&dfg); // test content - std::ifstream file("./graph_test.dot"); + std::ifstream file("./0-graph_test.dot"); ASSERT_TRUE(file.is_open()); std::string line; @@ -38,6 +39,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { while (std::getline(file, line)) { no++; } + // DFG is sensitive to ProgramDesc, be careful to change the existing models. ASSERT_EQ(no, 82); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 9f67c989cca4a936cd320b73efaae277263fb3e2..e918622d74cfb11d83090555be2a768cc14e7742 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -15,25 +15,31 @@ limitations under the License. 
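DFG_GraphvizDrawPass now writes a counter-prefixed .dot file per invocation and shells out to graphviz to render it, so the debug images sort in the order the passes ran. A standalone sketch of that flow with a hard-coded DOT string; GenDotPath here only mirrors the naming idea, the real pass derives the DOT text from the DataFlowGraph:

#include <fstream>
#include <iostream>
#include <string>

static int counter = 0;

std::string GenDotPath(const std::string& dir, const std::string& id) {
  // The numeric prefix keeps the debug outputs sorted by trigger order.
  return dir + "/" + std::to_string(counter++) + "-graph_" + id + ".dot";
}

int main() {
  std::string dot =
      "digraph G {\n"
      "  feed -> mul;\n"
      "  mul -> fetch;\n"
      "}\n";
  std::string path = GenDotPath(".", "demo");
  std::ofstream file(path);
  file << dot;
  file.close();
  // The pass then renders the image with a shell command along the lines of:
  //   dot -Tpng ./0-graph_demo.dot -o ./0-graph_demo.png
  std::cout << "wrote " << path << "\n";
  return 0;
}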
*/ #include #include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" namespace paddle { namespace inference { namespace analysis { -FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {} - -bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); } - -bool FluidToDataFlowGraphPass::Initialize( - const framework::proto::ProgramDesc &desc) { - desc_ = &desc; +bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); + PADDLE_ENFORCE(argument); + if (!argument->main_dfg) { + LOG(INFO) << "Init DFG"; + argument->main_dfg.reset(new DataFlowGraph); + } + desc_ = argument->origin_program_desc.get(); return true; } -bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); } +bool FluidToDataFlowGraphPass::Finalize() { return true; } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(desc_); // insert vars std::unordered_map var2id; auto &main_block = desc_->blocks(framework::kRootBlockIndex); @@ -41,7 +47,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { const auto &var = main_block.vars(i); auto *v = graph->nodes.Create(Node::Type::kValue); v->SetName(var.name()); - v->SetExtraInfo(const_cast(static_cast(&var))); + v->SetPbDesc(const_cast(static_cast(&var))); + v->SetPbMsg(var.SerializeAsString()); var2id[var.name()] = v->id(); } for (int i = 0; i < main_block.ops_size(); i++) { @@ -51,7 +58,9 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { static_cast(o)->SetFuncType(op.type()); // Link to the original protobuf message's memory, make it easier to // generate from a data flow graph to fluid ProgramDesc. - o->SetExtraInfo(const_cast(static_cast(&op))); + o->SetPbDesc(const_cast(static_cast(&op))); + o->SetPbMsg(op.SerializeAsString()); + // set inputs and outputs // TODO(Superjomn) make sure the InputNames is the real variable name. 
for (int j = 0; j < op.inputs_size(); j++) { @@ -75,9 +84,20 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { graph->Build(); } -Pass *FluidToDataFlowGraphPass::CreatePrinterPass( - std::ostream &os, const std::string &banner) const { - return nullptr; +namespace { +class DFG_DebuggerPass : public DFG_GraphvizDrawPass { + public: + using Config = DFG_GraphvizDrawPass::Config; + explicit DFG_DebuggerPass(const Config &config) + : DFG_GraphvizDrawPass(config) {} + std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } + bool Finalize() override { return true; } +}; +} + +Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { + return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( + FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index 33517e57becdffc0416f204247eac5feadb7ed82..da8463b63bd0bb1633bfcb9d7d41a884ddd632c7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -34,15 +34,19 @@ namespace analysis { */ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: - FluidToDataFlowGraphPass(); - bool Initialize() override; - bool Initialize(const framework::proto::ProgramDesc &desc) override; + FluidToDataFlowGraphPass() = default; + + bool Initialize(Argument *argument) override; bool Finalize() override; void Run(DataFlowGraph *graph) override; - Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const override; + std::string repr() const override { return "fluid-to-data-flow-graph"; } + std::string description() const override { + return "transform a fluid ProgramDesc to a data flow graph."; + } + + Pass *CreateGraphvizDebugerPass() const override; private: framework::proto::ProgramDesc const *desc_; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index 817d32c92cdbdc234eef9ed5156891c2b11ced4c..cbca5abdd5fff1672ba5d47a8876489c54ad6947 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -23,11 +23,11 @@ namespace analysis { TEST_F(DFG_Tester, Init) { FluidToDataFlowGraphPass pass; - pass.Initialize(); - pass.Initialize(desc); + pass.Initialize(&argument); DataFlowGraph graph; pass.Run(&graph); - ASSERT_GT(graph.nodes.size(), 0); + // Analysis is sensitive to ProgramDesc, careful to change the original model. + ASSERT_EQ(graph.nodes.size(), 37UL); pass.Finalize(); LOG(INFO) << '\n' << graph.DotString(); } diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..ca40c01fc57dbcc2ca16770a1b7d798de8b5625b --- /dev/null +++ b/paddle/fluid/inference/analysis/helper.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace inference { +namespace analysis { + +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const std::string &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int64_t &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONG); + attr->set_l(data); +} +template <> +void SetAttr>(framework::proto::OpDesc *op, + const std::string &name, + const std::vector &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + for (const auto &s : data) { + attr->add_strings(s.c_str()); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 58eb0e715cb71d87179f3240de55021603cd7423..f1064cd20f28092d80d3fd23a862da080b6cc2f3 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -14,10 +14,13 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" @@ -26,6 +29,10 @@ namespace paddle { namespace inference { namespace analysis { +template +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const T &data); + template int AccuDims(Vec &&vec, int size) { int res = 1; @@ -35,7 +42,7 @@ int AccuDims(Vec &&vec, int size) { return res; } -#define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__; +#define SET_TYPE(type__) dic_[std::type_index(typeid(type__))] = #type__; /* * Map typeid to representation. 
*/ @@ -47,14 +54,14 @@ struct DataTypeNamer { template const std::string &repr() const { - auto x = typeid(T).hash_code(); + auto x = std::type_index(typeid(T)); PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); return dic_.at(x); } - const std::string &repr(size_t &hash) const { // NOLINT - PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation"); - return dic_.at(hash); + const std::string &repr(const std::type_index &type) const { // NOLINT + PADDLE_ENFORCE(dic_.count(type), "unknown type for representation"); + return dic_.at(type); } private: @@ -62,11 +69,10 @@ struct DataTypeNamer { SET_TYPE(int); SET_TYPE(bool); SET_TYPE(float); + SET_TYPE(void *); } - std::unordered_map - dic_; + std::unordered_map dic_; }; #undef SET_TYPE @@ -92,7 +98,7 @@ template class OrderedRegistry { public: T *Register(const std::string &name, T *x) { - PADDLE_ENFORCE(!dic_.count(name)); + PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); dic_[name] = data_.size(); data_.emplace_back(std::unique_ptr(x)); return data_.back().get(); @@ -116,6 +122,20 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { return *var->GetMutable(); } +static void ExecShellCommand(const std::string &cmd, std::string *message) { + char buffer[128]; + std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); + if (!pipe) { + LOG(ERROR) << "error running command: " << cmd; + return; + } + while (!feof(pipe.get())) { + if (fgets(buffer, 128, pipe.get()) != nullptr) { + *message += buffer; + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc index fe060526080b1ee01aa98f2ff06fb2191eddf9da..f2e918f3ff41d9db0c3ec38561015967bed26f4e 100644 --- a/paddle/fluid/inference/analysis/node.cc +++ b/paddle/fluid/inference/analysis/node.cc @@ -20,6 +20,17 @@ namespace paddle { namespace inference { namespace analysis { +template <> +std::string &NodeAttr::As() { + if (data_.empty()) { + type_index_ = std::type_index(typeid(std::string)); + } + PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string))); + return data_; +} + +std::string &NodeAttr::String() { return As(); } + std::vector Value::dot_attrs() const { return std::vector({Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), @@ -40,6 +51,9 @@ Node *NodeMap::Create(Node::Type type) { case Node::Type::kValue: nodes_.emplace_back(new Value); break; + case Node::Type::kFunctionBlock: + nodes_.emplace_back(new FunctionBlock); + break; default: PADDLE_THROW("Not supported node type."); } diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index 7972ca25c92186a8c55a76de645f4fdbb089e8d3..47e524bc5c4a6b1324d5f182053129311487522d 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -25,6 +25,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/inference/analysis/device.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/inference/analysis/helper.h" @@ -35,6 +36,44 @@ namespace analysis { class NodeMap; +// A helper class to maintain the status from Pass. +struct NodeAttr { + // NOTE T should be a primary type or a struct combined by several primary + // types. + // NOTE the STL containers should not use here. 
+ // Some usages + // Attr attr; + // attr.Bool() = true; + + bool &Bool() { return As(); } + float &Float() { return As(); } + int32_t &Int32() { return As(); } + int64_t &Int64() { return As(); } + void *&Pointer() { return As(); } + std::string &String(); + + private: + template + T &As() { + // init storage in the first usage. + if (data_.empty()) { + VLOG(4) << "resize data to " << sizeof(T); + type_index_ = std::type_index(typeid(T)); + data_.resize(sizeof(T)); + } + PADDLE_ENFORCE(framework::IsType(type_index_), + "type not matched, origin is %s, want %s", + DataTypeNamer::Global().repr(type_index_), + DataTypeNamer::Global().repr()); + PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); + return *reinterpret_cast(&data_[0]); + } + + private: + std::string data_; + std::type_index type_index_{typeid(NodeAttr)}; +}; + /* * Node Representation. * @@ -50,8 +89,6 @@ class Node { Node() = default; - struct Attr; - // Cast to a subclass type, Function for example. template Subclass &As() { @@ -71,12 +108,20 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - Attr &attr(const std::string &name) { return attrs_[name]; } + NodeAttr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } - bool deleted() const { return deleted_; } + // The Protobuf description is set/get with a void* to decouple Node interface + // from a specific kind of Protobuf message. + void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; } + void *pb_desc() const { return attr("pb_desc").Pointer(); } + + void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; } + const std::string &pb_msg() const { return attr("pb_msg").String(); } + void SetDeleted() { deleted_ = true; } + bool deleted() const { return deleted_; } void SetName(const std::string &name) { name_ = name; } const std::string &name() const { return name_; } @@ -84,52 +129,12 @@ class Node { void SetType(Type type) { type_ = type; } Type type() const { return type_; } - void *extra_info() const { return extra_info_; } - void SetExtraInfo(void *extra_info) { extra_info_ = extra_info; } - // Input links. std::vector inlinks; // Output links. std::vector outlinks; - // A helper class to maintain the status from Pass. - // TODO(superjomn) add a checker here to ensure the T is primary. - struct Attr { - // NOTE T should be a primary type or a struct combined by several primary - // types. - // NOTE the STL containers should not use here. - // Some usages - // Attr attr; - // T data; - // attr.data.assign((char*)data, sizeof(data)); - - bool &Bool() { return As(); } - float &Float() { return As(); } - int32_t &Int32() { return As(); } - int64_t &Int64() { return As(); } - - private: - template - T &As() { - // init storage in the first usage. - if (data_.empty()) { - VLOG(4) << "resize data to " << sizeof(T); - type_hash_ = typeid(T).hash_code(); - data_.resize(sizeof(T)); - } - PADDLE_ENFORCE(type_hash_ == typeid(T).hash_code(), - "type not matched, origin is %s, want %s", - DataTypeNamer::Global().repr(type_hash_), - DataTypeNamer::Global().repr()); - PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error"); - return *reinterpret_cast(&data_[0]); - } - - private: - std::string data_; - size_t type_hash_{std::numeric_limits::max()}; - }; - + // Type checks. 
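The storage idiom behind NodeAttr can be shown as a standalone, std-only sketch: a byte buffer sized on first access plus a std::type_index guard, so a slot first used as one primitive type cannot silently be reinterpreted as another. (In the patch itself, std::string values go through the dedicated As<std::string> specialization rather than this raw-buffer path.)

#include <cassert>
#include <string>
#include <typeindex>
#include <typeinfo>

class TypedSlot {
 public:
  template <typename T>
  T &As() {
    if (data_.empty()) {  // the first access fixes the stored type
      type_index_ = std::type_index(typeid(T));
      data_.resize(sizeof(T));  // zero-filled storage for exactly one T
    }
    assert(type_index_ == std::type_index(typeid(T)) && "type mismatch");
    assert(data_.size() == sizeof(T));
    return *reinterpret_cast<T *>(&data_[0]);
  }

 private:
  std::string data_;
  std::type_index type_index_{typeid(TypedSlot)};
};

// usage:
//   TypedSlot slot;
//   slot.As<bool>() = true;   // fine
//   slot.As<float>();         // trips the assert: the slot already holds a bool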
bool IsFunction() const { return type_ == Node::Type::kFunction; } bool IsValue() const { return type_ == Node::Type::kValue; } bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; } @@ -148,10 +153,7 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - - void *extra_info_; - - mutable std::unordered_map attrs_; + mutable std::unordered_map attrs_; }; class Function; @@ -214,6 +216,10 @@ class Function : public Node { struct FunctionBlock : public Node { std::string repr() const override { return "block-" + std::to_string(id()); } std::vector subgraph; + + protected: + FunctionBlock() { SetType(Node::Type::kFunctionBlock); } + friend class NodeMap; }; class NodeMap { @@ -228,7 +234,7 @@ class NodeMap { void Delete(size_t id); - const std::vector> &nodes() { return nodes_; } + const std::vector> &nodes() const { return nodes_; } size_t size() const { return nodes_.size(); } diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/node_attr_flags.h new file mode 100644 index 0000000000000000000000000000000000000000..a3f70e5419a66969e8fb20152a8a8ace39316f57 --- /dev/null +++ b/paddle/fluid/inference/analysis/node_attr_flags.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file contains all the flags that declared in Node::Attr. + * + * The Node::Attr is designed to share information between different passes, one + * can get other's attributes in a Node by the flags in this file. + */ +#pragma once +namespace paddle { +namespace inference { +namespace analysis { + +#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__; + +DECLARE_NODE_ATTR(supported_by_tensorrt) // bool + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index aa0e8667b5e4a9e6156c25fcad03bb8eee3287f6..6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/node.h" @@ -30,19 +31,11 @@ namespace analysis { class Pass { public: Pass() = default; - virtual ~Pass() {} - // Virtual method overridden by subclasses to do only necessary initialization - // before any pass is run. - virtual bool Initialize() { return false; } - // There is some passes such as FlowToDataFlowGraphPass that needs a - // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it - // only couple with the proto file. 
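The DECLARE_NODE_ATTR flags introduced in node_attr_flags.h above are just shared attribute names; a hypothetical producer/consumer handshake between two passes could look like the sketch below (the flag name marked_for_fusion is invented for illustration).

// In a header visible to both passes:
DECLARE_NODE_ATTR(marked_for_fusion)  // -> const char ATTR_marked_for_fusion[] = "marked_for_fusion";

// The producing pass marks a node ...
void MarkForFusion(Node *node) { node->attr(ATTR_marked_for_fusion).Bool() = true; }

// ... and a later pass reads the flag back. An unset flag reads as false because
// NodeAttr zero-fills its buffer on first access.
bool IsMarkedForFusion(const Node &node) {
  return node.attr(ATTR_marked_for_fusion).Bool();
}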
- virtual bool Initialize(const framework::proto::ProgramDesc &desc) { - return false; - } - // There are some Passes such as DataFlowGraphToFluidPass that will output a - // ProgramDesc. - virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; } + virtual ~Pass() = default; + // Mutable Pass. + virtual bool Initialize(Argument *argument) { return false; } + // Readonly Pass. + virtual bool Initialize(const Argument &argument) { return false; } // Virtual method overriden by subclasses to do any necessary clean up after // all passes have run. @@ -50,7 +43,12 @@ class Pass { // Get a Pass appropriate to print the Node this pass operates on. virtual Pass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const = 0; + const std::string &banner) const { + return nullptr; + } + + // Create a debugger Pass that draw the DFG by graphviz toolkit. + virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } @@ -60,6 +58,11 @@ class Pass { virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; } // Run on a single DataFlowGraph. virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; } + + // Human-readable short representation. + virtual std::string repr() const = 0; + // Human-readable long description. + virtual std::string description() const = 0; }; // NodePass process on any Node types. diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +bool PassManager::Initialize(Argument* argument) { + argument_ = argument; + for (auto& pass : data_) { + LOG(INFO) << "Initializing pass " << pass->repr(); + if (!pass->Initialize(argument)) { + LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; + return false; + } + } + return true; +} + +void DfgPassManager::RunAll() { + PADDLE_ENFORCE(argument_); + for (auto& pass : data_) { + VLOG(4) << "Running pass [" << pass->repr() << "]"; + pass->Run(argument_->main_dfg.get()); + } +} + +void NodePassManager::RunAll() { + PADDLE_ENFORCE(argument_); + PADDLE_ENFORCE(argument_->main_dfg.get()); + auto trait = + GraphTraits(argument_->main_dfg.get()).nodes_in_DFS(); + for (auto& node : trait) { + for (auto& pass : data_) { + pass->Run(&node); + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..81a17e0287a5aef8a328e43380ee3691f5a32379 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass_manager.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the logic of pass management. The analysis for inference is + * a pipeline of Passes, a PassManager is a agency that helps to manage the + * executation of the Passes. + * + * There are two modes of Passes, the first one is called NodePass and takes + * an Node as input and output; the second one is called DFGPass and takes a + * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in + * the same pipeline, there are two kinds of PassManagers, both takes a DFG as + * input and output a DFG, but the Passes inside are different: + * + * 1. NodePassManager: the passes inside are all NodePasses, it can have + * different graph trivial algorithm, for example, DFS_NodePassManager will + * trigger the passes in depth first order; + * 2. DfgPassManager: the passes inside are all DfgPasses. + */ + +#pragma once + +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * PassManager is the base class for all pass managers, a pass manager has + * several Pass-es registered, and execute them in the linear order. + */ +class PassManager : public OrderedRegistry { + public: + PassManager() = default; + // Call all the passes' Initialize methods. The desc and data_flow_graph are + // globally shared, so pass them as the arguemnts for all the pass managers. 
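To make the split between the two manager kinds concrete, here is a condensed sketch of a DfgPassManager pipeline; the subclass name is invented, the registered passes are ones this patch adds, and the full pattern appears in pass_manager_tester.cc below.

#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"

class FluidAnalysisPipeline : public DfgPassManager {
 public:
  FluidAnalysisPipeline() {
    Register("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
    Register("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
  }
  std::string repr() const override { return "fluid-analysis-pipeline"; }
  std::string description() const override {
    return "example DFG pass pipeline";
  }
};

// Driving it, given an Argument with origin_program_desc filled in:
//   FluidAnalysisPipeline pipeline;
//   if (pipeline.Initialize(&argument)) pipeline.RunAll();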
+ virtual bool Initialize(const Argument& argument) { return false; } + + virtual bool Initialize(Argument* argument); + + // Call all the passes' Finalize methods. + virtual bool Finalize() { + for (auto& pass : data_) { + if (!pass->Finalize()) { + LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]"; + return false; + } + } + return true; + } + + // Run all the passes. + virtual void RunAll() = 0; + + // Short identifier. + virtual std::string repr() const = 0; + // Long description. + virtual std::string description() const = 0; + + virtual ~PassManager() = default; + + protected: + Argument* argument_{nullptr}; +}; + +/* + * A pass manager that process a DFG. + */ +class DfgPassManager : public PassManager { + public: + DfgPassManager() = default; + + void RunAll() override; + + virtual ~DfgPassManager() = default; +}; + +/* + * A pass manager that process a Node each time. + */ +class NodePassManager : public PassManager { + public: + NodePassManager() = default; + + void RunAll() override; + + virtual ~NodePassManager() = default; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..dac1c509d728114bd24a2ea1150c407646026fd4 --- /dev/null +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class TestDfgPassManager final : public DfgPassManager { + public: + TestDfgPassManager() = default; + virtual ~TestDfgPassManager() = default; + // Short identifier. + std::string repr() const override { return "test-pass-manager"; } + // Long description. 
+ std::string description() const override { return "test doc"; } +}; + +class TestNodePassManager final : public NodePassManager { + public: + virtual ~TestNodePassManager() = default; + + std::string repr() const override { return "test-node-pass-manager"; } + std::string description() const override { return "test doc"; } +}; + +class TestNodePass final : public NodePass { + public: + virtual ~TestNodePass() = default; + + bool Initialize(Argument* argument) override { return true; } + + void Run(Node* node) override { + LOG(INFO) << "- Processing node " << node->repr(); + } + + std::string repr() const override { return "test-node"; } + std::string description() const override { return "some doc"; } +}; + +TEST_F(DFG_Tester, DFG_pass_manager) { + TestDfgPassManager manager; + DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); + + manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass); + manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); + manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + + ASSERT_TRUE(&argument); + ASSERT_TRUE(manager.Initialize(&argument)); + manager.RunAll(); +} + +TEST_F(DFG_Tester, Node_pass_manager) { + // Pre-process: initialize the DFG with the ProgramDesc first. + FluidToDataFlowGraphPass pass0; + pass0.Initialize(&argument); + pass0.Run(argument.main_dfg.get()); + + TestNodePassManager manager; + manager.Register("test-node-pass", new TestNodePass); + ASSERT_TRUE(manager.Initialize(&argument)); + manager.RunAll(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 43ccac96c84e987ad1f494af3e314c810fc1ffe3..389f9e1a9148a4daf0e5b751cce5cb6325252a4e 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -119,10 +119,12 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { + std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. - auto *block_node = graph_->nodes.Create(Node::Type::kFunctionBlock); + auto *block_node = static_cast( + graph_->nodes.Create(Node::Type::kFunctionBlock)); auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); @@ -130,21 +132,25 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { // TODO(Superjomn) need a unified mechanism to treat deleted node in each // pass. node->SetDeleted(); + block_node->subgraph.push_back(node); } - std::unordered_map - delelte_node_map; // deleted node to BlockNode - for (auto *n : block_node->inlinks) { - n->inlinks.clear(); - } - for (auto *n : block_node->outlinks) { - n->outlinks.clear(); - } - for (auto *n : block_node->inlinks) { - n->outlinks.push_back(block_node); + // Change all the sub-graph's inputs and outputs corresponding inlink and + // outlink to this sub-graph node. 
+ auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (subgraph_uniq.count(n)) { + n = block_node; + } + } + std::unordered_set uniq(nodes.begin(), nodes.end()); + nodes.assign(uniq.begin(), uniq.end()); + }; + for (auto *i : block_node->inlinks) { + inlink_or_outlink_cleaner(i->outlinks); } - for (auto *n : block_node->outlinks) { - n->inlinks.push_back(n); + for (auto *&o : block_node->outlinks) { + inlink_or_outlink_cleaner(o->inlinks); } } } diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 0644c0db12e3daabba76dbaad33847f5624b157a..67dd4da54b95add703428e1fded61065f60353e8 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -19,22 +19,23 @@ namespace paddle { namespace inference { namespace analysis { +SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { + if (node->type() != Node::Type::kFunction) return false; + const auto* func = static_cast(node); + if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || + func->func_type() == "conv2d" || func->func_type() == "mul" || + func->func_type() == "sigmoid" || func->func_type() == "softmax") { + LOG(INFO) << "sub-graph marked " << node->repr(); + return true; + } + return false; +}; + TEST_F(DFG_Tester, Split) { auto desc = LoadProgramDesc(); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); - SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { - if (node->type() != Node::Type::kFunction) return false; - const auto* func = static_cast(node); - if (func->func_type() == "elementwise_add" || func->func_type() == "relu" || - func->func_type() == "conv2d" || func->func_type() == "mul" || - func->func_type() == "sigmoid" || func->func_type() == "softmax") { - LOG(INFO) << "sub-graph marked " << node->repr(); - return true; - } - return false; - }; ASSERT_GT(dfg.nodes.size(), 5UL); auto subgraphs = SubGraphSplitter(&dfg, teller)(); @@ -62,6 +63,28 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } +TEST_F(DFG_Tester, Fuse) { + auto desc = LoadProgramDesc(); + auto dfg = ProgramDescToDFG(desc); + + size_t count0 = dfg.nodes.size(); + + SubGraphFuse fuse(&dfg, teller); + fuse(); + + int count1 = 0; + for (auto& node : dfg.nodes.nodes()) { + if (node->deleted()) { + LOG(INFO) << "deleted " << node->repr(); + } + count1 += node->deleted(); + } + + // At least one nodes should be deleted. + ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock + ASSERT_EQ(6, count1); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f736e385c11add152dc9ab9485bf1de40f80b2f3 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
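Read as a standalone idiom, the rewiring above does two things in each neighbour's link list: replace every pointer that belongs to the fused sub-graph with the new block node, then drop the duplicates that replacement introduces. A std-only sketch, independent of the Node types used here:

#include <unordered_set>
#include <vector>

template <typename NodePtr>
void RedirectLinks(std::vector<NodePtr> *links,
                   const std::unordered_set<NodePtr> &subgraph, NodePtr block) {
  for (auto &n : *links) {
    if (subgraph.count(n)) n = block;  // point at the fused block instead
  }
  // several sub-graph members may have collapsed onto `block`; de-duplicate
  std::unordered_set<NodePtr> uniq(links->begin(), links->end());
  links->assign(uniq.begin(), uniq.end());
}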
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { + for (auto &node : graph->nodes.nodes()) { + node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get()); + } +} + +class DfgDebuggerPass : public DFG_GraphvizDrawPass { + public: + explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) + : DFG_GraphvizDrawPass(config) {} + + std::string repr() const override { + return "tensorrt-subgraph-node-mark-debugger"; + } + + bool Finalize() override { return true; } + + protected: + std::string Draw(DataFlowGraph *graph) override { + Dot dot; + // Add nodes + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (config_.display_deleted_node || !node.deleted()) { + auto dot_attr = node.dot_attrs(); + if (node.attr(ATTR_supported_by_tensorrt).Bool()) { + dot_attr.assign( + {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}}); + } + dot.AddNode(node.repr(), dot_attr); + } + } + // Add edges + for (size_t i = 0; i < graph->nodes.size(); i++) { + const Node &node = graph->nodes.Get(i); + if (!config_.display_deleted_node && node.deleted()) continue; + for (auto &in : node.inlinks) { + if (!config_.display_deleted_node && in->deleted()) continue; + dot.AddEdge(in->repr(), node.repr(), {}); + } + } + return dot.Build(); + } +}; + +Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { + DFG_GraphvizDrawPass::Config config( + FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); + return new DfgDebuggerPass(config); +} +bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c558a6ebbde371071c7330a14cc986bf764d1773 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/* + * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops + * that supported by TensorRT engine. + */ + +#pragma once + +#include +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Mark the operators that TensorRT engine supports. + */ +class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { + public: + using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; + + explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller) + : teller_(teller) {} + + bool Initialize(Argument* argument) override { return true; } + + // This class get a sub-graph as input and determine whether to transform this + // sub-graph into TensorRT. + void Run(DataFlowGraph* graph) override; + + std::string repr() const override { return "tensorrt-sub-subgraph-mark"; } + std::string description() const override { + return "tensorrt sub-graph mark pass"; + } + + Pass* CreateGraphvizDebugerPass() const override; + bool Finalize() override; + + private: + teller_t teller_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a6c15e848b99ca318f4583e3d4b88345fe8e5ebc --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
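One possible way, not spelled out in this patch, to chain the mark pass with the sub-graph fusion added earlier: run TensorRTSubgraphNodeMarkPass with an op-type teller (here the "mul"-only predicate used by the tester below), then hand SubGraphFuse a second teller that simply reads the flag back. This assumes an Argument whose main_dfg has already been built.

#include "paddle/fluid/inference/analysis/node_attr_flags.h"
#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"

TensorRTSubgraphNodeMarkPass::teller_t is_trt_op = [](const Node *node) {
  return node->IsFunction() &&
         static_cast<const Function *>(node)->func_type() == "mul";
};
TensorRTSubgraphNodeMarkPass mark_pass(is_trt_op);
mark_pass.Initialize(&argument);
mark_pass.Run(argument.main_dfg.get());  // writes ATTR_supported_by_tensorrt on every node

SubGraphSplitter::NodeInsideSubgraphTeller read_flag = [](const Node *node) {
  return node->attr(ATTR_supported_by_tensorrt).Bool();
};
SubGraphFuse(argument.main_dfg.get(), read_flag)();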
+ +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" + +#include +#include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { + // init + FluidToDataFlowGraphPass pass; + ASSERT_TRUE(pass.Initialize(&argument)); + argument.main_dfg.reset(new DataFlowGraph); + pass.Run(argument.main_dfg.get()); + + TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { + return node->IsFunction() && + static_cast(node)->func_type() == "mul"; + }; + TensorRTSubgraphNodeMarkPass pass1(teller); + ASSERT_TRUE(pass1.Initialize(&argument)); + pass1.Run(argument.main_dfg.get()); + + int counter{0}; + for (auto& node : argument.main_dfg->nodes.nodes()) { + counter += node->attr(ATTR_supported_by_tensorrt).Bool(); + } + + LOG(INFO) << counter << " nodes marked"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..9993de22800bc0aafdcbf46618e6b479ac1eb187 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TensorRTSubGraphPass::TensorRTSubGraphPass( + const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller) + : node_inside_subgraph_teller_(teller) {} + +void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { + SubGraphFuse(graph, node_inside_subgraph_teller_)(); +} + +} // namespace analysis +} // namespace inference + +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c6741a92095d33d261a4e1667c87a8ca02e51a9f --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/inference/analysis/node.h" +#include "paddle/fluid/inference/analysis/pass.h" +#include "paddle/fluid/inference/analysis/subgraph_splitter.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Parse the graph and replace TensorRT supported nodes with SubGraphNode + */ +class TensorRTSubGraphPass : public DataFlowGraphPass { + public: + // Tell whether to transform a sub-graph into TensorRT. + using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; + + explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); + + bool Initialize(Argument* argument) override { return true; } + + // This class get a sub-graph as input and determine whether to transform this + // sub-graph into TensorRT. + void Run(DataFlowGraph* graph) override; + + bool Finalize() override { return true; } + + std::string repr() const override { return "tensorrt-sub-graph"; } + std::string description() const override { return "tensorrt sub graph pass"; } + + private: + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d749d3fa3f39b351ccee6ebeb82467f7220a0b6 --- /dev/null +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(dot_dir, "./", ""); + +TEST_F(DFG_Tester, tensorrt_single_pass) { + std::unordered_set teller_set( + {"elementwise_add", "mul", "sigmoid"}); + SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { + if (node->type() != Node::Type::kFunction) return false; + const auto* func = static_cast(node); + if (teller_set.count(func->func_type())) return true; + return false; + }; + + LOG(INFO) << "init"; + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; + DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; + + DFG_GraphvizDrawPass dfg_pass(config); + DFG_GraphvizDrawPass dfg_pass1(config1); + FluidToDataFlowGraphPass pass0; + TensorRTSubGraphPass trt_pass(std::move(teller)); + + LOG(INFO) << "Initialize"; + dfg_pass.Initialize(&argument); + dfg_pass1.Initialize(&argument); + pass0.Initialize(&argument); + trt_pass.Initialize(&argument); + + LOG(INFO) << "Run"; + argument.main_dfg.reset(new DataFlowGraph); + pass0.Run(argument.main_dfg.get()); + dfg_pass.Run(argument.main_dfg.get()); + trt_pass.Run(argument.main_dfg.get()); + dfg_pass1.Run(argument.main_dfg.get()); + + // Check the TRT op's block desc + for (auto& node : argument.main_dfg->nodes.nodes()) { + if (node->IsFunctionBlock()) { + LOG(INFO) << "get function block"; + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 722fa99a48a5f2b0e778904de0c35977d0ee3cc0..ce1191a567a4198f003520c40bf02487c48c56eb 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -15,33 +15,46 @@ limitations under the License. 
*/ #pragma once #include #include +#include #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/io.h" namespace paddle { namespace inference { + +// Read ProgramDesc from a __model__ file, defined in io.cc +extern void ReadBinaryFile(const std::string& filename, std::string* contents); + namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); static framework::proto::ProgramDesc LoadProgramDesc( const std::string& model_dir = FLAGS_inference_model_dir) { - paddle::platform::CPUPlace place; - paddle::framework::Executor executor(place); - paddle::framework::Scope scope; - auto program = Load(&executor, &scope, model_dir); - return *program->Proto(); + std::string msg; + std::string net_file = FLAGS_inference_model_dir + "/__model__"; + std::ifstream fin(net_file, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); + fin.seekg(0, std::ios::end); + msg.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(msg.at(0)), msg.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(msg); + return program_desc; } static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; - pass.Initialize(desc); + Argument argument; + argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); + pass.Initialize(&argument); pass.Run(&graph); pass.Finalize(); return graph; @@ -49,9 +62,12 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: - void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); } + void SetUp() override { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); + } - framework::proto::ProgramDesc desc; + Argument argument; }; } // namespace analysis diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 6b03ac7119b117e442e6af34c719c8a4f736bde9..181868977dd8f2568486ed0c4e1f260a69795896 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); @@ -33,7 +33,7 @@ namespace inference { void Init(const std::vector argv) { framework::InitGflags(argv); - operators::math::SetNumThreads(FLAGS_math_num_threads); + platform::SetNumThreads(FLAGS_math_num_threads); // init devices std::vector devices; std::string token; diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index caf599b1a68783f155cd134c2a29e9ffa49a0895..01b50b3670cb9da2e0be232a61ea6129dd83aa20 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/init.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index c7a5a49dd02d0db022fabff5c3ae1c7800bac25c..6697952051c4b1997ca6b550da17a52e64cb3454 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -64,7 +64,8 @@ class OpConverter { (*it)(op, scope, test_mode); } - // convert fluid block to tensorrt network + // Convert a fluid block to tensorrt network, NOTE it just convert operators, + // the INetwork's inputs and outputs should specified in some other modules. void ConvertBlock(const framework::proto::BlockDesc& block, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b60f00de9fa5fc8f8f4537379bf9ee9c8bb6f31c..b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream, + TensorRTEngine(int max_batch, int max_workspace, + cudaStream_t* stream = nullptr, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream), + stream_(stream ? stream : &default_stream_), logger_(logger) {} virtual ~TensorRTEngine(); @@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; cudaStream_t* stream_; + // If stream_ is not set from outside, hold its own stream. + cudaStream_t default_stream_; nvinfer1::ILogger& logger_; std::vector buffers_; @@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase { */ class TRT_EngineManager { public: - TensorRTEngine* Create(int max_batch, int max_workspace, - cudaStream_t* stream) { - engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream)); - return engines_.back().get(); + bool HasEngine(const std::string& name) const { + return engines_.count(name) != 0; + } + + // Get an engine called `name`. + TensorRTEngine* Get(const std::string& name) const { + return engines_.at(name).get(); + } + + // Create or get an engine called `name` + TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream, + const std::string& name) { + auto* p = new TensorRTEngine(max_batch, max_workspace, stream); + engines_[name].reset(p); + return p; } void DeleteALl() { - for (auto& ptr : engines_) { - ptr.reset(nullptr); + for (auto& item : engines_) { + item.second.reset(nullptr); } } private: - std::vector> engines_; + std::unordered_map> engines_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 9dcd79c3bb9ed713ff0f12024969cc5798750988..5cc1db12bb71e428d493e7c6f718b1c6ed431858 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,8 +19,8 @@ limitations under the License. 
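A hypothetical use of the now name-keyed engine registry; the engine name, batch and workspace sizes below are made up, and the explicit CUDA stream only illustrates that a caller may still supply its own even though the constructor now falls back to an internal default_stream_ when given nullptr.

TRT_EngineManager manager;
cudaStream_t stream;
cudaStreamCreate(&stream);

const std::string name = "trt_block_0";
TensorRTEngine *engine =
    manager.HasEngine(name)
        ? manager.Get(name)
        : manager.Create(/*max_batch=*/1, /*max_workspace=*/1 << 20, &stream, name);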
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/platform/cpu_helper.h" #ifdef PADDLE_WITH_MKLML -#include #include #endif @@ -29,6 +29,7 @@ DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); +DECLARE_bool(use_mkldnn); inline double GetCurrentMs() { struct timeval time; @@ -103,9 +104,9 @@ void ThreadRunInfer( const int tid, paddle::framework::Scope* scope, const std::vector>& jobs) { // maybe framework:ProgramDesc is not thread-safe + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); auto& sub_scope = scope->NewScope(); - auto place = paddle::platform::CPUPlace(); - auto executor = paddle::framework::Executor(place); auto inference_program = paddle::inference::Load(&executor, scope, FLAGS_model_path); @@ -163,7 +164,7 @@ TEST(inference, nlp) { // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); - mkl_set_num_threads(1); + paddle::platform::SetNumThreads(1); #endif double start_ms = 0, stop_ms = 0; @@ -182,8 +183,8 @@ TEST(inference, nlp) { stop_ms = GetCurrentMs(); } else { // 1. Define place, executor, scope - auto place = paddle::platform::CPUPlace(); - auto executor = paddle::framework::Executor(place); + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 4194ba197948b47003863196efdac1c08a7ae4f6..01a8501dd4abe73cbc71dc4c08734cae66df08ef 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -19,8 +19,9 @@ namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, - size_t min_chunk_size, size_t max_chunk_size) +BuddyAllocator::BuddyAllocator( + std::unique_ptr system_allocator, size_t min_chunk_size, + size_t max_chunk_size) : min_chunk_size_(min_chunk_size), max_chunk_size_(max_chunk_size), cache_(system_allocator->UseGpu()), diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 2f39d774d6fb6a2bc37877eb2f8b90bebd3cda28..f0c83efc23ce39c4fc89296d672e1e55751851bf 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include // NOLINT #include #include @@ -32,8 +33,8 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, - size_t max_chunk_size); + BuddyAllocator(std::unique_ptr system_allocator, + size_t min_chunk_size, size_t max_chunk_size); ~BuddyAllocator(); @@ -103,7 +104,7 @@ class BuddyAllocator { private: /*! 
Allocate CPU/GPU memory from system */ - SystemAllocator* system_allocator_; + std::unique_ptr system_allocator_; std::mutex mutex_; }; diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index d5390529163491c2711e50ffad236534e88b73ee..9b1ab1e228dd758b52975abc4c4aa0bdeadbe2de 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -43,14 +43,16 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { *index = 0; // unlock memory - void* p; + void* p = nullptr; #ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment - PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); + PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!", + size); #else - PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0); + PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!", + size); #endif PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c74f62de5c6f5d432ee928945db6dcf385ca209..7c800b3c164049244770ceb2070b177d8307e85e 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/fluid/memory/malloc.h" #include "glog/logging.h" @@ -20,6 +22,12 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +DEFINE_bool(init_allocated_mem, false, + "It is a mistake that the values of the memory allocated by " + "BuddyAllocator are always zeroed in some op's implementation. 
" + "To find this error in time, we use init_allocated_mem to indicate " + "that initializing the allocated memory with a small value " + "during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -28,12 +36,15 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new detail::BuddyAllocator(new detail::CPUAllocator, - platform::CpuMinChunkSize(), - platform::CpuMaxChunkSize()); - } + + std::call_once(init_flag, []() { + a = new detail::BuddyAllocator( + std::unique_ptr(new detail::CPUAllocator), + platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + }); + return a; } @@ -41,6 +52,9 @@ template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } VLOG(10) << " pointer=" << p; return p; } @@ -59,27 +73,33 @@ size_t Used(platform::CPUPlace place) { #ifdef PADDLE_WITH_CUDA BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { + static std::once_flag init_flag; + static detail::BuddyAllocator** a_arr = nullptr; + + std::call_once(init_flag, [gpu_id]() { int gpu_num = platform::GetCUDADeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = nullptr; + PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, + gpu_num); + + a_arr = new BuddyAllocator*[gpu_num]; + for (int i = 0; i < gpu_num; i++) { + a_arr[i] = nullptr; + platform::SetDeviceId(i); + a_arr[i] = new BuddyAllocator( + std::unique_ptr(new detail::GPUAllocator(i)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; } - } + }); + platform::SetDeviceId(gpu_id); - if (!as[gpu_id]) { - as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id), - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); - VLOG(10) << "\n\nNOTE: each GPU device use " - << FLAGS_fraction_of_gpu_memory_to_use * 100 - << "% of GPU memory.\n" - << "You can set GFlags environment variable '" - << "FLAGS_fraction_of_gpu_memory_to_use" - << "' to change the fraction of GPU usage.\n\n"; - } - return as[gpu_id]; + return a_arr[gpu_id]; } template <> @@ -104,6 +124,9 @@ void* Alloc(platform::CUDAPlace place, size_t size) { LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } return ptr; } @@ -113,12 +136,16 @@ void Free(platform::CUDAPlace place, void* p) { } BuddyAllocator* GetCUDAPinnedBuddyAllocator() { - static BuddyAllocator* ba = NULL; - if (ba == NULL) { - ba = new BuddyAllocator(new detail::CUDAPinnedAllocator, + static std::once_flag init_flag; + static BuddyAllocator* ba = nullptr; + + std::call_once(init_flag, []() { + ba = new BuddyAllocator(std::unique_ptr( + new detail::CUDAPinnedAllocator), platform::CUDAPinnedMinChunkSize(), platform::CUDAPinnedMaxChunkSize()); - } + }); + return ba; } @@ -137,6 +164,9 @@ void* Alloc(platform::CUDAPinnedPlace place, LOG(WARNING) << 
"cudaMallocHost Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } return ptr; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e4bb04295bc8bcb34f8a4d0c6cd0372cb04d255c..b35a32a553d0d5d0e4f3ad65d6f2cfcde7bada0d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -184,35 +184,41 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() -add_subdirectory(detail) +set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) - + add_subdirectory(distributed) + set(DISTRIBUTE_DEPS "") if(WITH_GRPC) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) else() set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + if(WITH_BRPC_RDMA) + find_library(IBVERBS_LIBRARY NAMES ibverbs) + ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) + + + find_library(RDMACM_LIBRARY NAMES rdmacm) + ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) + + set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm) + endif() endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) - op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(fetch_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op") + op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endforeach() + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) else() @@ -223,7 +229,7 @@ if(WITH_DISTRIBUTE) set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op 
send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) @@ -233,7 +239,8 @@ op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) op_library(tensorrt_engine_op DEPS tensorrt_engine) nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter) + DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() @@ -304,6 +311,7 @@ foreach(src ${DETECTION_LIBRARY}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 46ed99bcf2234f7621d9f00eb48c846d8a355795..137bca5e2b8e2754aed274970e08b03ee816a7f2 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -12,16 +12,20 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "mkldnn.hpp" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/mkldnn_activation_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::stream; +using platform::GetMKLDNNFormat; +using platform::MKLDNNDeviceContext; +using platform::to_void_cast; namespace { std::string gethash(const mkldnn::memory::dims &operand_dims, @@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims, }; return dim2str(operand_dims) + std::to_string(algorithm); } +} // namespace + +template +class MKLDNNActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for Input x tensor"); + + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(ctx); + } +}; -template -void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, - const T alpha = 0, const T beta = 0) { +template +class MKLDNNActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *diff_y = ctx.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && + diff_y->format() != memory::format::format_undef, + "Wrong layout/format set for Input OutGrad tensor"); + + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(ctx); + } +}; + +template +void eltwise_forward(const framework::ExecutionContext &ctx, + mkldnn::algorithm algorithm, const T alpha = 0, + const T beta = 0) { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - // get 
buffers - const auto *src = ctx.template Input("X"); - const auto *src_data = src->template data(); + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); - auto *dst = ctx.template Output("Out"); - T *dst_data = dst->template mutable_data(ctx.GetPlace()); + const T *x_data = x->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); - // get memory dim - PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4, + PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4, "Input dim must be with 2 or 4"); - std::vector src_tz = framework::vectorize2int(src->dims()); + + std::vector src_tz = framework::vectorize2int(x->dims()); + + auto src_format = + src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format(); const std::string key = gethash(src_tz, algorithm); const std::string key_src_data = key + ctx.op().Output("Out") + "@eltwise_fwd_src_data"; - const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; - const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem"; - const std::string key_fwd = key + "@eltwise_fwd"; + const std::string key_src_layout = + key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout"; + const std::string key_with_layout = key + std::to_string(src_format); + const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem"; + const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem"; + const std::string key_fwd = key_with_layout + "@eltwise_fwd"; + const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd"; + + // save input data and layout to be referred in backward path + auto p_src_data = std::make_shared(x_data); + dev_ctx.SetBlob(key_src_data, p_src_data); + auto p_src_layout = std::make_shared(src_format); + dev_ctx.SetBlob(key_src_layout, p_src_layout); auto p_fwd = std::static_pointer_cast( dev_ctx.GetBlob(key_fwd)); - // save input data to be referred in backward path - auto p_src_data = std::make_shared(src_data); - dev_ctx.SetBlob(key_src_data, p_src_data); + std::shared_ptr dst_memory; if (p_fwd == nullptr) { - // create memory description - auto data_md = src_tz.size() == 2 - ? 
platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nc) - : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - // create memory primitives - auto p_src_mem = std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(src_data))); - dev_ctx.SetBlob(key_src_mem, p_src_mem); - - auto p_dst_mem = std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(dst_data))); - dev_ctx.SetBlob(key_dst_mem, p_dst_mem); - - auto fwd_desc = mkldnn::eltwise_forward::desc( - mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); - auto p_fwd_pd = std::make_shared( - fwd_desc, mkldnn_engine); - const std::string key_fwd_pd = key + "eltwise_fwd_pd"; - dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd); - p_fwd = std::make_shared( - *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get())); + // create mkldnn memory for input X + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), src_format); + auto src_memory = std::shared_ptr( + new memory({src_md, mkldnn_engine}, to_void_cast(x_data))); + // save src_memory to be referred in backward path + dev_ctx.SetBlob(key_src_mem, src_memory); + + // create primitive descriptor for activation forward and save it + auto forward_desc = mkldnn::eltwise_forward::desc( + mkldnn::prop_kind::forward_training, algorithm, + src_memory->get_primitive_desc().desc(), alpha, beta); + auto forward_pd = std::make_shared( + forward_desc, mkldnn_engine); + + // save prim desc into global device context to be referred in backward path + dev_ctx.SetBlob(key_fwd_pd, forward_pd); + + // create mkldnn memory for output y + dst_memory = + std::make_shared(forward_pd->dst_primitive_desc(), y_data); + + dev_ctx.SetBlob(key_dst_mem, dst_memory); + + // create activation primitive + p_fwd = std::make_shared(*forward_pd, *src_memory, + *dst_memory); dev_ctx.SetBlob(key_fwd, p_fwd); } else { // primitives already exist - auto p_src_mem = + auto src_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - PADDLE_ENFORCE(p_src_mem != nullptr, - "Fail to find eltwise p_src_mem in device context."); - auto p_dst_mem = + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find eltwise src_memory in device context."); + dst_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); - PADDLE_ENFORCE(p_dst_mem != nullptr, - "Fail to find eltwise p_src_mem in device context."); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find eltwise dst_memory in device context."); - p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data)); - p_dst_mem->set_data_handle(dst_data); + src_memory->set_data_handle(platform::to_void_cast(x_data)); + dst_memory->set_data_handle(y_data); } // push primitive to stream and wait until it's executed - std::vector pipeline = {*(p_fwd.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + pipeline.push_back(*p_fwd); + stream(stream::kind::eager).submit(pipeline).wait(); + + y->set_layout(DataLayout::kMKLDNN); + y->set_format(GetMKLDNNFormat(*dst_memory)); } -template -void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, - const T alpha = 0, const T beta = 0) { +template +void eltwise_grad(const framework::ExecutionContext &ctx, + mkldnn::algorithm algorithm, const T alpha = 0, + const T beta = 0) { auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); - // get buffers - const auto *out = ctx.template 
Input("Out"); - - auto *dout = ctx.template Input(framework::GradVarName("Out")); - const auto *diff_dst = dout->template data(); + const auto *diff_y = ctx.Input(framework::GradVarName("Out")); + auto *diff_x = ctx.Output(framework::GradVarName("X")); - auto *dx = - ctx.template Output(framework::GradVarName("X")); - const T *diff_src = dx->template mutable_data(ctx.GetPlace()); + const T *diff_y_data = diff_y->data(); + T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); - // get memory dim - std::vector src_tz = framework::vectorize2int(out->dims()); + std::vector diff_dst_tz = framework::vectorize2int(diff_y->dims()); - const std::string key = gethash(src_tz, algorithm); - const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem"; - const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem"; - const std::string key_grad = key + "@eltwise_grad"; + auto diff_y_format = + diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format(); + const std::string key = gethash(diff_dst_tz, algorithm); const std::string key_src_data = key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; + const std::string key_src_layout = + key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout"; + const auto p_src_layout = + std::static_pointer_cast(dev_ctx.GetBlob(key_src_layout)); + const std::string key_src_mem = + key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; + const std::string key_fwd_pd = + key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; + const std::string key_with_layouts = + key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format); + const std::string key_diff_src_mem = + key_with_layouts + "@eltwise_diff_src_mem"; + const std::string key_diff_dst_mem = + key_with_layouts + "@eltwise_diff_dst_mem"; + const std::string key_grad = key_with_layouts + "@eltwise_grad"; + const auto p_src_data = std::static_pointer_cast(dev_ctx.GetBlob(key_src_data)); - const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; - auto p_src_mem = + auto src_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - p_src_mem->set_data_handle(*p_src_data.get()); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); + src_memory->set_data_handle(*p_src_data.get()); + + std::shared_ptr diff_src_memory; - auto p_grad = std::static_pointer_cast( + auto p_grad = std::static_pointer_cast( dev_ctx.GetBlob(key_grad)); if (p_grad == nullptr) { - // create memory description - auto data_md = src_tz.size() == 2 - ? 
platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nc) - : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - // create memory primitives - std::shared_ptr p_diff_src_mem = - std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(diff_src))); - dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem); - std::shared_ptr p_diff_dst_mem = - std::make_shared(mkldnn::memory( - {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst))); - dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem); - - auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, - alpha, beta); - - const std::string key_fwd_pd = key + "eltwise_fwd_pd"; - auto *p_fwd_pd = static_cast( - dev_ctx.GetBlob(key_fwd_pd).get()); - - auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc( - bwd_desc, mkldnn_engine, *p_fwd_pd); - + // create mkldnn memory for input diff_y + auto diff_dst_md = platform::MKLDNNMemDesc( + diff_dst_tz, platform::MKLDNNGetDataType(), diff_y_format); + auto diff_dst_memory = std::shared_ptr( + new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data))); + dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); + + // retrieve eltwise primitive desc from device context + auto forward_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_fwd_pd)); + PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_fwd_pd in device context"); + + // ceate primitive descriptor for activation backward + auto backward_desc = mkldnn::eltwise_backward::desc( + algorithm, diff_dst_memory->get_primitive_desc().desc(), + src_memory->get_primitive_desc().desc(), alpha, beta); + auto backward_pd = mkldnn::eltwise_backward::primitive_desc( + backward_desc, mkldnn_engine, *forward_pd); + + // create mkldnn memory for output diff_src + diff_src_memory = std::make_shared( + backward_pd.diff_src_primitive_desc(), diff_x_data); + dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory); + + // create activation backward primitive p_grad = std::make_shared( - eltwise_bwd_prim_desc, *static_cast(p_src_mem.get()), - *(static_cast(p_diff_dst_mem.get())), - *(static_cast(p_diff_src_mem.get()))); + backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory); + dev_ctx.SetBlob(key_grad, p_grad); } else { // primitives already exist - auto p_diff_src_mem = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_diff_src_mem)); - auto p_diff_dst_mem = std::static_pointer_cast( + auto diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_diff_dst_mem)); - p_diff_src_mem->set_data_handle( - platform::to_void_reinterpret_cast(diff_src)); - p_diff_dst_mem->set_data_handle( - platform::to_void_reinterpret_cast(diff_dst)); + diff_src_memory->set_data_handle( + platform::to_void_reinterpret_cast(diff_x_data)); + diff_dst_memory->set_data_handle( + platform::to_void_reinterpret_cast(diff_y_data)); } // push primitive to stream and wait until it's executed - std::vector pipeline = {*(p_grad.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + pipeline.push_back(*p_grad); + stream(stream::kind::eager).submit(pipeline).wait(); + + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format(GetMKLDNNFormat(*diff_src_memory)); } -} // anonymous namespace template struct MKLDNNActivationFunc : public BaseActivationFunctor { - template - void operator()(const ExecContext &ctx) const { + void operator()(const 
framework::ExecutionContext &ctx) const { eltwise_forward(ctx, algorithm); } }; template struct MKLDNNActivationGradFunc : public BaseActivationFunctor { - template - void operator()(const ExecContext &ctx) const { + void operator()(const framework::ExecutionContext &ctx) const { eltwise_grad(ctx, algorithm); } }; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index af1d85047e519df6766b2139a0445ae9dc5945e2..286b03d7b7d11a50f33f0190c1a5b9097ed0f4a2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -19,6 +19,8 @@ limitations under the License. */ namespace paddle { namespace operators { +using paddle::framework::Tensor; + #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ class OP_NAME##OpMaker \ : public ::paddle::framework::OpProtoAndCheckerMaker { \ @@ -29,7 +31,7 @@ namespace operators { AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ - AddComment(OP_COMMENT); \ + AddComment(#OP_COMMENT); \ } \ } @@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { framework::LibraryType library{framework::LibraryType::kPlain}; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN auto it = oper.Attrs().find("use_mkldnn"); @@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ "Out"); } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return GetKernelType(ctx, *this, "X"); @@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return GetKernelType(ctx, *this, "Out"); @@ -112,7 +115,7 @@ $$out = \frac{1}{1 + e^{-x}}$$ __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator -$$out = \log \frac{1}{1 + e^{-x}}$$ +$$out = \\log \\frac{1}{1 + e^{-x}}$$ )DOC"; @@ -133,14 +136,14 @@ $out = \max(x, 0)$ __attribute__((unused)) constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. -$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; __attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. -$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ +$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; @@ -196,7 +199,7 @@ $out = [x]$ __attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( Reciprocal Activation Operator. -$$out = \frac{1}{x}$$ +$$out = \\frac{1}{x}$$ )DOC"; @@ -252,15 +255,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "Output of Softshrink operator"); AddAttr("lambda", "non-negative offset").SetDefault(0.5f); AddComment(R"DOC( -Softshrink Activation Operator. +:strong:`Softshrink Activation Operator` -$$ -out = \begin{cases} - x - \lambda, \text{if } x > \lambda \\ - x + \lambda, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} -$$ +.. 
math:: + out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} )DOC"); } @@ -271,18 +273,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input of HardShrink operator"); AddOutput("Out", "Output of HardShrink operator"); - AddAttr("threshold", "The value of threshold for HardShrink") + AddAttr("threshold", + "The value of threshold for HardShrink. [default: 0.5]") .SetDefault(0.5f); AddComment(R"DOC( -HardShrink Activation Operator. +:strong:`HardShrink activation operator` -$$ -out = \begin{cases} - x, \text{if } x > \lambda \\ - x, \text{if } x < -\lambda \\ - 0, \text{otherwise} - \end{cases} -$$ +.. math:: + out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} )DOC"); } @@ -383,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( STanh Activation Operator. -$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ +$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ )DOC"); } @@ -394,18 +396,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "Input of ThresholdedRelu operator"); AddOutput("Out", "Output of ThresholdedRelu operator"); - AddAttr("threshold", "The threshold location of activation") + AddAttr("threshold", + "The threshold location of activation. [default 1.0].") .SetDefault(1.0f); AddComment(R"DOC( -ThresholdedRelu Activation Operator. +:strong:`ThresholdedRelu activation operator` -$$ -out = \begin{cases} - x, \text{if } x > threshold \\ - 0, \text{otherwise} - \end{cases} -$$ +.. math:: + out = \begin{cases} + x, \text{if } x > threshold \\ + 0, \text{otherwise} + \end{cases} )DOC"); } }; @@ -444,7 +446,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Swish Activation Operator. -$$out = \frac{x}{1 + e^{- \beta x}}$$ +$$out = \\frac{x}{1 + e^{- \beta x}}$$ )DOC"); } diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 6ee73c3000fb45b4e1cd5bbb730da7d61b494b6f..5d670fe3b9d99a31a628ff707ff860564eca952e 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -56,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel { "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + } PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Moment1"), "Param and Moment1 input of AdamOp should have same dimension"); diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index f82ff47b52490c354f383515d430d14e24cbf6af..a7a28b02b67f2ef180ec0e273dbe7ef555f88ce2 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -282,6 +282,10 @@ class AdamOpKernel : public framework::OpKernel { } else if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } // merge duplicated rows if any. 
scatter::MergeAdd merge_func; auto grad_merge = diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a2f5a2545701991263c1ef842e9275b1edbfd2ca --- /dev/null +++ b/paddle/fluid/operators/argsort_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/argsort_op.h" + +namespace paddle { +namespace operators { + +class ArgsortOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ArgsortOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ArgsortOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of ArgsortOp should not be null."); + + auto in_dims = ctx->GetInputDim("X"); + int axis = ctx->Attrs().Get("axis"); + + auto num_dims = in_dims.size(); + PADDLE_ENFORCE(axis < num_dims, + "Attr(axis) %d of ArgsortOp is out of bounds for Input(X)'s " + "rank %d.", + axis, num_dims); + PADDLE_ENFORCE(axis >= -num_dims, + "Attr(axis) %d of ArgsortOp must be not less than " + "-rank(Input(X)) (%d).", + axis, num_dims); + + ctx->SetOutputDim("Out", in_dims); + ctx->SetOutputDim("Indices", in_dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } +}; + +class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Argsort op."); + AddOutput("Out", + "(Tensor) The sorted tensor of Argsort op, with the same " + "shape as Input(X)."); + AddOutput("Indices", + "(Tensor) The indices of a tensor giving the sorted order, with " + "the same shape as Input(X)."); + AddComment(R"DOC( +Argsort operator + +Performs sorting on the input tensor along the given axis and outputs two +tensors, Output(Out) and Output(Indices). They reserve the same shape +with Input(X), and Output(Out) represents the sorted tensor while +Output(Indices) gives the sorted order along the given axis Attr(axis). + + )DOC"); + AddAttr("axis", + "(int, default -1) The axis along which to sort the tensor. " + "When axis < 0, the actual axis will be the |axis|'th " + "counting backwards. 
Default -1, the last dimension.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(argsort, + ops::ArgsortKernel, + ops::ArgsortKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d5199aae7da4eed5afa6b8bd64c04a540b915d4 --- /dev/null +++ b/paddle/fluid/operators/argsort_op.cu @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::PADDLE_CUDA_NUM_THREADS; + +const int kMaxRank = 9; // The max rank of a tensor allowed in Fluid + +__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size, + int axis, int64_t n, int64_t* trg_idx, + int64_t* med_ids) { + int64_t index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + int64_t shape_out_axis[kMaxRank - 1] = {0}; + int64_t dims_out_axis[kMaxRank - 1] = {0}; + int64_t tmp = index; + int64_t pos_in_axis = 0; + int64_t i = dims_size - 2; + int64_t dim_axis = 0; + for (int64_t j = dims_size - 1; j >= 0; --j) { + int64_t dim = in_dims[j]; + if (j != axis) { + shape_out_axis[i] = tmp % dim; + dims_out_axis[i] = dim; + i--; + } else { + dim_axis = dim; + pos_in_axis = tmp % dim_axis; + } + tmp /= dim; + } + int64_t group = (dims_size > 1) ? 
shape_out_axis[0] : 0; + for (int64_t j = 0; j < dims_size - 2; ++j) { + group = group * dims_out_axis[j + 1] + shape_out_axis[j + 1]; + } + + int64_t traget_idx = group * dim_axis + pos_in_axis; + trg_idx[index] = traget_idx; + med_ids[traget_idx] = pos_in_axis; + } +} + +template +__global__ void PermuteInData(const T* in, const int64_t* trg_idx, int64_t n, + T* med_out) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + med_out[trg_idx[index]] = in[index]; + } +} + +template +__global__ void Sort(int64_t axis_dim, int64_t groups, T* med_out, + int64_t* med_ids) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < groups) { + thrust::sort_by_key(thrust::device, med_out + index * axis_dim, + med_out + axis_dim * (1 + index), + med_ids + index * axis_dim); + } +} + +template +__global__ void PermuteMediateData(const T* med_out, const int64_t* med_ids, + const int64_t* trg_idx, int64_t n, T* out, + int64_t* indices) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + out[index] = med_out[trg_idx[index]]; + indices[index] = med_ids[trg_idx[index]]; + } +} + +template +class ArgsortOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + const T* in_data = input->data(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + int64_t numel = input->numel(); + int64_t groups = numel / in_dims[axis]; + + std::vector in_dims_vec = vectorize(in_dims); + thrust::device_vector in_dims_dev(in_dims_vec.begin(), + in_dims_vec.end()); + int64_t* in_dims_data = thrust::raw_pointer_cast(in_dims_dev.data()); + // Mediate tensor for sorting data and indices + Tensor mediate_output, mediate_indices; + T* med_out_data = + mediate_output.mutable_data(input->dims(), ctx.GetPlace()); + int64_t* med_ids_data = + mediate_indices.mutable_data(in_dims, ctx.GetPlace()); + // Target index of each element along the given axis in the mediate tensors + Tensor trg_idx_t; + int64_t* trg_idx = trg_idx_t.mutable_data(in_dims, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); + const int num_threads = PADDLE_CUDA_NUM_THREADS; + + ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data); + + PermuteInData<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_data, trg_idx, numel, med_out_data); + + Sort<<<(groups - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_dims[axis], groups, med_out_data, med_ids_data); + + PermuteMediateData<<<(numel - 1) / num_threads + 1, num_threads, 0, + stream>>>(med_out_data, med_ids_data, trg_idx, numel, + out_data, ids_data); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(argsort, paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7e9112cfb7cbe5f783b04729fb4dff3676c922bc --- /dev/null +++ b/paddle/fluid/operators/argsort_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ArgsortKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + const T* in_data = input->data(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + int64_t groups = input->numel() / in_dims[axis]; + int64_t stride = (axis == in_dims.size() - 1) + ? 1 + : framework::product(framework::slice_ddim( + in_dims, axis + 1, in_dims.size())); + + for (int64_t i = 0; i < groups; ++i) { + int64_t idx = i; + std::vector shape_vec(in_dims.size(), 0); + for (int64_t dim = in_dims.size() - 1; dim >= 0; --dim) { + if (dim != axis) { + shape_vec[dim] = idx % in_dims[dim]; + idx /= in_dims[dim]; + } + } + + int64_t start_index = shape_vec[0]; + for (int64_t dim = 0; dim < in_dims.size() - 1; ++dim) { + start_index = start_index * in_dims[dim + 1] + shape_vec[dim + 1]; + } + + std::vector org_index_vec(in_dims[axis], start_index); + for (int64_t j = 1; j < in_dims[axis]; ++j) { + org_index_vec[j] += j * stride; + } + + std::sort(org_index_vec.begin(), org_index_vec.end(), + [in_data](const int64_t v1, const int64_t v2) { + return in_data[v1] < in_data[v2]; + }); + + for (size_t j = 0; j < org_index_vec.size(); ++j) { + int64_t index = start_index + j * stride; + out_data[index] = in_data[org_index_vec[j]]; + ids_data[index] = (org_index_vec[j] - start_index) / stride; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index 4ad6f3443db33fd14b67091d14fd877b951730ff..a757916be7f6ece9b783d51d1051aac6a276795b 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -70,6 +70,7 @@ $$Out = values$$ namespace ops = paddle::operators; -REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker); +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, ops::AssignValueKernel); diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 25864e95d7e290c7f684501893e99c828c511979..f389eab605e087c535b9918264e6502217062505 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -19,28 +19,28 @@ namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t* num_updates_, - int64_t* num_accumulates_, int64_t* 
old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates, + int64_t* num_accumulates, int64_t* old_num_accumulates) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); - *old_num_accumulates_ = in_old_num_accumulates->data()[0]; - *num_accumulates_ = in_num_accumulates->data()[0]; - *num_updates_ = in_num_updates->data()[0]; + *old_num_accumulates = in_old_num_accumulates->data()[0]; + *num_accumulates = in_num_accumulates->data()[0]; + *num_updates = in_num_updates->data()[0]; } template <> void SetAccumulators( - const framework::ExecutionContext& ctx, int64_t num_updates_, - int64_t num_accumulates_, int64_t old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t num_updates, + int64_t num_accumulates, int64_t old_num_accumulates) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); - out_old_num_accumulates->data()[0] = old_num_accumulates_; - out_num_accumulates->data()[0] = num_accumulates_; - out_num_updates->data()[0] = num_updates_; + out_old_num_accumulates->data()[0] = old_num_accumulates; + out_num_accumulates->data()[0] = num_accumulates; + out_num_updates->data()[0] = num_updates; } class AverageAccumulatesOp : public framework::OperatorWithKernel { @@ -177,7 +177,7 @@ class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( AverageAccumulates Operator. -Accumulate the sum of parameter whtin sliding window. The size of sliding window is +Accumulate the sum of parameter within sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'. Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. 'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. 
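Editor's note: the AverageAccumulates doc above describes keeping running sums of a parameter within a sliding window controlled by average_window, max_average_window and min_average_window, together with the num_accumulates / old_num_accumulates / num_updates counters renamed in this hunk. The following is only a minimal standalone sketch of that idea; the single-sum simplification and the window-restart rule are assumptions for illustration, not the operator's actual implementation.

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative sketch of sliding-window parameter averaging. The restart
// rule below is an assumption, not Paddle's logic.
struct ParamAverager {
  std::vector<double> sum_1;    // running sum of the parameter ("sum_1")
  int64_t num_accumulates = 0;  // steps folded into sum_1 so far
  int64_t num_updates = 0;      // total optimizer updates seen

  explicit ParamAverager(std::size_t n) : sum_1(n, 0.0) {}

  void Accumulate(const std::vector<double>& param,
                  int64_t max_average_window) {
    for (std::size_t i = 0; i < param.size(); ++i) sum_1[i] += param[i];
    ++num_accumulates;
    ++num_updates;
    if (num_accumulates >= max_average_window) {
      // Restart the window so only recent updates dominate the average.
      for (double& s : sum_1) s = 0.0;
      num_accumulates = 0;
    }
  }

  // Averaged parameter = accumulated sum / number of accumulated steps.
  std::vector<double> Average() const {
    std::vector<double> avg(sum_1.size(), 0.0);
    if (num_accumulates == 0) return avg;
    for (std::size_t i = 0; i < sum_1.size(); ++i)
      avg[i] = sum_1[i] / static_cast<double>(num_accumulates);
    return avg;
  }
};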
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 07ac5ced11605f6d0d5164d1c0f69acbd7bbed60..3958d3f685470f2505abf0e8bfd269d3834970ae 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -54,8 +54,9 @@ class AverageAccumulatesKernel : public framework::OpKernel { float average_window = ctx.Attr("average_window"); int64_t max_average_window = ctx.Attr("max_average_window"); int64_t min_average_window = ctx.Attr("min_average_window"); - min_average_window = - std::min(min_average_window, max_average_window); + PADDLE_ENFORCE_LE(min_average_window, max_average_window, + "min_average_window shouldn't be larger than " + "max_average_window"); // Get inputs auto* param = ctx.Input("param"); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 8206cc9890160da756efb13c991020f09b20126a..9ab2179b5fe689762704039c5f67dd080e530aa5 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -21,8 +21,6 @@ namespace operators { using batch_norm_bwd = mkldnn::batch_normalization_backward; using batch_norm_fwd = mkldnn::batch_normalization_forward; -using framework::DataLayout; -using framework::Tensor; using mkldnn::memory; using mkldnn::primitive; using mkldnn::reorder; @@ -31,18 +29,6 @@ using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; using platform::to_void_cast; -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - namespace { template struct bn_type_traits { @@ -80,6 +66,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool fuse_with_relu = ctx.Attr("fuse_with_relu"); const auto *x = ctx.Input("X"); const auto *mean = ctx.Input("Mean"); @@ -125,11 +112,15 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -263,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; // create mkldnn memory from input diff_y tensor - auto user_diff_dst_memory = - memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, - mkldnn_engine}, - to_void_cast(diff_y_data)); + + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, 
memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // for diff_dst, try to use same format as dst in forward pass auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 625ca2d7c4c70d1098b0fb28380d8d1eb24cb338..5912a1a17cbd29c3ebd83f37133c044f0905c8bd 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -22,22 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class BatchNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -171,6 +155,9 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_with_relu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Batch Normalization. @@ -229,6 +216,18 @@ class BatchNormKernel saved_mean_e.setZero(); saved_variance_e.setZero(); + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + + if ((N * sample_size) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopySync(*x, ctx.GetPlace(), y); + return; + } + switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -260,10 +259,6 @@ class BatchNormKernel PADDLE_THROW("Unknown storage order: %s", data_layout_str); } - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); running_mean_arr = running_mean_arr * momentum + saved_mean_e * (1. 
- momentum); running_var_arr = @@ -440,6 +435,11 @@ class BatchNormGradKernel d_bias_arr.setZero(); d_scale_arr.setZero(); + if ((N * sample_size) == 1) { + framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + return; + } + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); switch (data_layout) { diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index 550dd32d36767f90e880415bfffaf01aeb623609..ca6cd8669352fd5814f25a04433ca97fe4abe9ff 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -72,6 +72,9 @@ class BatchNormKernel int N, C, H, W, D; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -93,7 +96,7 @@ class BatchNormKernel mode_ = CUDNN_BATCHNORM_SPATIAL; #endif - VLOG(1) << "Setting descriptors."; + VLOG(3) << "Setting descriptors."; std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { @@ -113,11 +116,6 @@ class BatchNormKernel const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); - auto *y = ctx.Output("Y"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -162,22 +160,28 @@ class BatchNormKernel functor(dev_ctx, saved_mean, static_cast>(0)); functor(dev_ctx, saved_variance, static_cast>(0)); - double this_factor = 1. - momentum; - - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), - data_desc_, x->template data(), data_desc_, - y->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); + if ((N * H * W * D) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopySync(*x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); + } } // clean when exit. 
@@ -209,6 +213,25 @@ class BatchNormGradKernel int N, C, H, W, D; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + if ((N * H * W * D) == 1) { + framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -247,21 +270,11 @@ class BatchNormGradKernel CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); - auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 9e5fc41598f29336074335f3624a2300ad018d09..5e3d630d6889e445c5e84fa836d2d81bb7266779 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -19,6 +19,22 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + template class BatchNormKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index c3dd22119ddab8ecf9213ee274e4cbd4f05e78fd..10d678111f5325e495b24286e6ecf651230393fe 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/beam_search_decode_op.h" +#include #include + +#include "paddle/fluid/operators/beam_search_decode_op.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -22,8 +24,11 @@ namespace operators { struct BeamSearchDecodeFunctor { BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, const LoDTensorArray& step_scores, - LoDTensor* id_tensor, LoDTensor* score_tensor) - : step_ids_origin_(step_ids), + LoDTensor* id_tensor, LoDTensor* score_tensor, + size_t beam_size, int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), step_scores_origin_(step_scores), id_tensor_(id_tensor), score_tensor_(score_tensor) { @@ -37,9 +42,11 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_id : step_ids_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_id.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } out.set_lod(step_id.lod()); step_ids_.push_back(out); @@ -53,9 +60,12 @@ struct BeamSearchDecodeFunctor { // Copy all tensors in the input tensor array for (auto& step_score : step_scores_origin_) { framework::LoDTensor out; - dev_ctx->Wait(); - framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, &out); - dev_ctx->Wait(); + if (step_score.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, + &out); + dev_ctx->Wait(); + } out.set_lod(step_score.lod()); step_scores_.push_back(out); @@ -67,6 +77,8 @@ struct BeamSearchDecodeFunctor { void operator()() const; bool tensor_on_gpu_; + size_t beam_size_; + int end_id_; const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); @@ -77,14 +89,14 @@ struct BeamSearchDecodeFunctor { template void BeamSearchDecodeFunctor::operator()() const { - BeamSearchDecoder beam_search_decoder; + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); // Check if the tensor is on GPU. 
If so, use the CPU copy instead if (tensor_on_gpu_) { - beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_, - score_tensor_); + beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, + score_tensor_); } else { - beam_search_decoder.PackAllSteps(step_ids_origin_, step_scores_origin_, - id_tensor_, score_tensor_); + beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); } } @@ -122,13 +134,17 @@ class BeamSearchDecodeOp : public framework::OperatorBase { "Level of LodTensor should be 2"); } + size_t beam_size = ctx.Attr("beam_size"); + int end_id = ctx.Attr("end_id"); + // prepare output LoDTensor* sentenceIds = ctx.Output("SentenceIds"); LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( framework::ToDataType(scores->at(0).type()), - BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores)); + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, + beam_size, end_id)); } }; @@ -137,18 +153,32 @@ class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Ids", "(LodTensorArray)" - "score of the candidate words in each step"); + "The LodTensorArray containing the selected ids of all steps"); AddInput("Scores", "(LodTensorArray)" - "score of the candidate words in each step"); - AddOutput("SentenceIds", - "(LodTensor)" - "All possible result sentences of word ids"); - AddOutput("SentenceScores", - "(LodTensor)" - "All possible result sentences of word scores"); + "The LodTensorArray containing the selected scores of all steps"); + AddOutput( + "SentenceIds", + "(LodTensor)" + "An LodTensor containing all generated id sequences for all source " + "sentences"); + AddOutput( + "SentenceScores", + "(LodTensor)" + "An LodTensor containing scores corresponding to Output(SentenceIds)"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); AddComment(R"DOC( -Pack the result of Beam search op into SentenceIds and SentenceScores. +Beam Search Decode Operator. This Operator constructs the full hypotheses for +each source sentence by walking back along the LoDTensorArray Input(ids) +whose lods can be used to restore the path in the beam search tree. + +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each +hypothesis has. 
)DOC"); } }; @@ -172,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc& op_desc, framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_ids = block->FindRecursiveOrCreateVar(o); + sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto& sentence_scores = block->FindRecursiveOrCreateVar(o); + sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index 3c01f81c83555b985bb6b7a9e3330ab594a62863..6aefc5446f167eebb0da673b3fbdf7ed128daa98 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include + #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,42 +27,12 @@ using LoDTensor = framework::LoDTensor; using LoDTensorArray = framework::LoDTensorArray; // all the lod have 2 levels. -// The First is source level, the second is sentence level. -// source level describe how many candidate words for this source. -// sentence level describe these candidates belong to which prefix +// The first is source level, the second is sentence level. +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. const size_t kSourceLevel = 0; const size_t kSentenceLevel = 1; -template -struct BeamNode { - BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {} - - ~BeamNode() { - if (parent_) { - parent_->DropKid(this); - if (parent_->kids_.size() == 0UL) { - delete parent_; - } - } - VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_; - } - - void AppendTo(BeamNode* parent) { - parent_ = parent; - parent->kids_.insert(this); - } - - void DropKid(BeamNode* kid) { kids_.erase(kid); } - - BeamNode* parent_ = nullptr; - std::unordered_set kids_; - int64_t word_id_; - T score_; -}; - -template -using BeamNodeVector = std::vector>>; - template struct Sentence { std::vector word_ids; @@ -72,24 +44,8 @@ using SentenceVector = std::vector>; template struct BeamSearchDecoder { - /** - * make a BeamNode and all it's related prefix BeanNode into a Sentence. - */ - Sentence MakeSentence(const BeamNode* node) const; - - /** - * Param: - * cur_ids: LoDTensor of One step for word ID - * cur_scores: LoDTensor of One Step for word score - * prefixes_list: prefixes for each source sentence. - * sentence_vector_list: result sentence_vector for each source sentence. - * Return: - * a new prefixes list for each source of current step - */ - std::vector> PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const; + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} /** * convert the result sentence_vector for each source sentence into two @@ -100,107 +56,30 @@ struct BeamSearchDecoder { * sentence_vector_list: sentence_vector for each source sentence. * id_tensor: result LoDTensor for sentences of id. 
* score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. */ void ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; + LoDTensor* score_tensor, bool reverse = true, + bool sort_by_score = true) const; /** - * Pack all steps of id/score LodTensor into sentence LoDTensor - * it's main logic is: - * ```python - * prefix - * result_sentence - * result_lod_tensor - * - * for (step in steps): - * prefix = PackTwoSteps(prefix, step, &result_sentence) - * ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor) - * ``` + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. */ - void PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, LoDTensor* id_tensor, - LoDTensor* score_tensor) const; -}; - -template -Sentence BeamSearchDecoder::MakeSentence(const BeamNode* node) const { - Sentence sentence; - while (node != nullptr) { - sentence.word_ids.emplace_back(node->word_id_); - sentence.scores.emplace_back(node->score_); - node = node->parent_; - } - - std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids)); - std::reverse(std::begin(sentence.scores), std::end(sentence.scores)); - - return sentence; -} - -template -std::vector> BeamSearchDecoder::PackTwoSteps( - const LoDTensor& cur_ids, const LoDTensor& cur_scores, - std::vector>* prefixes_list, - std::vector>* sentence_vector_list) const { - std::vector> result; + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; - for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1; - ++src_idx) { - size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - - BeamNodeVector beam_nodes; - - // if prefixes size is 0, it means this is the first step. In this step, - // all candidate id is the start of candidate sentences. 
- if (prefixes_list->empty()) { - PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(), - cur_ids.lod().at(kSentenceLevel).back(), - "in the first step"); - for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) { - beam_nodes.push_back(std::unique_ptr>(new BeamNode( - cur_ids.data()[id_idx], cur_scores.data()[id_idx]))); - } - } else { - BeamNodeVector& prefixes = prefixes_list->at(src_idx); - SentenceVector& sentence_vector = (*sentence_vector_list)[src_idx]; - - PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(), - "prefix and candidate set number should be the same"); - - auto candidate_offset = cur_ids.lod()[kSentenceLevel]; - for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) { - std::unique_ptr>& prefix = prefixes[prefix_idx]; - size_t candidate_start = candidate_offset[src_start + prefix_idx]; - size_t candidate_end = candidate_offset[src_start + prefix_idx + 1]; - if (candidate_start == candidate_end) { - VLOG(3) << "this sentence has no more candidate, " - "add to result sentence and rm it from beam tree"; - sentence_vector.push_back(MakeSentence(prefix.get())); - prefix.reset(); - } else { - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; ++candidate_idx) { - auto* candidate = - new BeamNode(cur_ids.data()[candidate_idx], - cur_scores.data()[candidate_idx]); - candidate->AppendTo(prefix.get()); - beam_nodes.push_back(std::unique_ptr>(candidate)); - } - prefix.release(); - } - } - } - result.push_back(std::move(beam_nodes)); - } - return result; -} + size_t beam_size_; + int end_id_; +}; template void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector> sentence_vector_list, LoDTensor* id_tensor, - LoDTensor* score_tensor) const { + LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { size_t src_num = sentence_vector_list.size(); PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); @@ -211,11 +90,29 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } for (Sentence& sentence : sentence_vector_list[src_idx]) { - id_data.insert(id_data.end(), sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert(score_data.end(), sentence.scores.begin(), - sentence.scores.end()); + if (reverse) { + id_data.insert(id_data.end(), sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert(score_data.end(), sentence.scores.rbegin(), + sentence.scores.rend()); + } else { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + } + sentence_level_lod.push_back(sentence_level_lod.back() + sentence.word_ids.size()); } @@ -243,39 +140,75 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( } template -void BeamSearchDecoder::PackAllSteps(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger 
than 0"); PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), "step_ids and step_scores should be the same"); const size_t step_num = step_ids.size(); const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list(src_num); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } - PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0"); - - // previous prefixes for each step, - // the init length is 0, means this is the first step. 
- std::vector> beamnode_vector_list(0); - std::vector> sentence_vector_list(src_num); - - // pack all steps for one batch first, then another batch - for (size_t step_id = 0; step_id < step_num; ++step_id) { - beamnode_vector_list = - PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id), - &beamnode_vector_list, &sentence_vector_list); - } - // append last beam_node to result - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - for (auto& beam_node : beamnode_vector_list.at(src_idx)) { - sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get())); - beam_node.reset(); + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } } } ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, - score_tensor); + score_tensor, true, true); } } // namespace operators diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc index 36f9594969c416c694928811012baf94332bbd91..88339e38d89db3f79abf232d6b0d035b759739a6 100644 --- a/paddle/fluid/operators/beam_search_decode_op_test.cc +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD; using LoDTensor = paddle::framework::LoDTensor; using LoDTensorArray = paddle::framework::LoDTensorArray; -template -using BeamNode = paddle::operators::BeamNode; template using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; template using Sentence = paddle::operators::Sentence; template -using BeamNodeVector = paddle::operators::BeamNodeVector; -template using SentenceVector = paddle::operators::SentenceVector; namespace paddle { @@ -77,138 +73,50 @@ void GenerateExample(const std::vector& level_0, } // namespace test } // namespace paddle -TEST(BeamSearchDecodeOp, DeleteBeamNode) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* b2 = new BeamNode(2, 2); - auto* b3 = new BeamNode(3, 3); - - b1->AppendTo(root); - b2->AppendTo(root); - b3->AppendTo(b1); - - delete b3; - delete b2; -} - -TEST(BeamSearchDecodeOp, MakeSentence) { - auto* root = new BeamNode(0, 0); - auto* b1 = new BeamNode(1, 1); - auto* end = new BeamNode(2, 2); - b1->AppendTo(root); - end->AppendTo(b1); - - BeamSearchDecoder helper; - Sentence sentence = helper.MakeSentence(end); - delete end; - - std::vector expect_ids = {0, 1, 2}; - ASSERT_EQ(sentence.word_ids, expect_ids); - - std::vector expect_scores = {0, 1, 2}; - ASSERT_EQ(sentence.scores, expect_scores); -} - -TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) { - CPUPlace place; - - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample( - std::vector{0, 2, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - BeamSearchDecoder helper; - beamnode_vector_list = helper.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - ASSERT_EQ(beamnode_vector_list.size(), 2UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 2UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 4UL); -} - -TEST(BeamSearchDecodeOp, PackTwoSteps) { - CPUPlace place; - - // first source has three prefix - BeamNodeVector source0_prefixes; - source0_prefixes.push_back( - std::unique_ptr>(new 
BeamNode(1, 1))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(0, 0))); - source0_prefixes.push_back( - std::unique_ptr>(new BeamNode(3, 3))); - - // second source has two prefix - BeamNodeVector source1_prefixes; - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(4, 4))); - source1_prefixes.push_back( - std::unique_ptr>(new BeamNode(5, 5))); - - std::vector> beamnode_vector_list; - std::vector> sentence_vector_list( - 2, SentenceVector()); - - beamnode_vector_list.push_back(std::move(source0_prefixes)); - beamnode_vector_list.push_back(std::move(source1_prefixes)); - - // generate data for one step - LoDTensorArray ids; - LoDTensorArray scores; - - paddle::test::GenerateExample(std::vector{0, 3, 5}, - std::vector{0, 1, 1, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); - - BeamSearchDecoder helper1; - beamnode_vector_list = helper1.PackTwoSteps( - ids[0], scores[0], &beamnode_vector_list, &sentence_vector_list); - - ASSERT_EQ(sentence_vector_list[0].size(), 1UL); - ASSERT_EQ(sentence_vector_list[1].size(), 0UL); - ASSERT_EQ(beamnode_vector_list[0].size(), 3UL); - ASSERT_EQ(beamnode_vector_list[1].size(), 2UL); -} - -TEST(BeamSearchDecodeOp, PackAllSteps) { +TEST(BeamSearchDecodeOp, Backtrace) { CPUPlace place; - // we will constuct a sample data with 3 steps and 2 source sentences + // Construct sample data with 5 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 LoDTensorArray ids; LoDTensorArray scores; paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 2, 3, 4, 5, 6}, - std::vector{1, 2, 3, 4, 5, 6}, &ids, &scores); + std::vector{0, 1, 2}, std::vector{0, 1, 2}, + std::vector{0, 0}, &ids, &scores); // start with start_id + paddle::test::GenerateExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, &ids, &scores); paddle::test::GenerateExample( - std::vector{0, 3, 6}, std::vector{0, 1, 1, 3, 5, 5, 6}, - std::vector{0, 1, 2, 3, 4, 5}, &ids, &scores); - paddle::test::GenerateExample(std::vector{0, 3, 6}, - std::vector{0, 0, 1, 2, 3, 4, 5}, - std::vector{0, 1, 2, 3, 4}, &ids, &scores); + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, + 2}, // the branchs of the first source sentence + // are pruned since finished + std::vector{5, 1}, + &ids, &scores); - ASSERT_EQ(ids.size(), 3UL); - ASSERT_EQ(scores.size(), 3UL); + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); - BeamSearchDecoder helper; + BeamSearchDecoder helper(2, 1); // beam_size = 2, end_id = 1 LoDTensor id_tensor; LoDTensor score_tensor; - helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor); + helper.Backtrace(ids, scores, &id_tensor, &score_tensor); LoD lod = id_tensor.lod(); - std::vector expect_source_lod = {0, 4, 8}; + std::vector expect_source_lod = {0, 2, 4}; EXPECT_EQ(lod[0], expect_source_lod); - std::vector expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19}; + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; EXPECT_EQ(lod[1], expect_sentence_lod); - // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4 - std::vector expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5, - 4, 3, 2, 4, 4, 3, 6, 5, 4}; + std::vector expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4, + 5, 3, 5, 0, 4, 5, 3, 1}; ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); 
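  // Reading of the expected output: lod[0] = {0, 2, 4} means both source
  // sentences keep beam_size = 2 hypotheses; lod[1] = {0, 4, 7, 12, 17} gives
  // their lengths (4, 3, 5 and 5 ids). Every hypothesis starts with start_id 0,
  // and the finished ones end with a single end_id 1 because redundant end
  // tokens are skipped during the backtrace.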
for (size_t i = 0; i < expect_data.size(); ++i) { ASSERT_EQ(id_tensor.data()[i], diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index df0b50881f4e3ec6f57bdb2b63033931059c486e..62771d09f112785ca1ba741a0ba239b1f0234633 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -12,25 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/beam_search_op.h" - #include #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/beam_search_op.h" namespace paddle { namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { auto abs_lod = framework::ToAbsOffset(ids_->lod()); auto &high_level = abs_lod[lod_level_]; - auto items = SelectTopBeamSizeItems(); + auto items = SelectTopBeamSizeItems(pre_ids, pre_scores); auto selected_items = ToMap(items, high_level.back()); VLOG(3) << "selected_items:"; for (size_t i = 0; i < selected_items.size(); ++i) { @@ -39,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, VLOG(3) << ItemToString(item); } } - PruneEndidCandidates(pre_ids, &selected_items); + + PruneEndBeams(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( std::begin(selected_items), std::end(selected_items), 0, @@ -61,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, size_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); - sort(items.begin(), items.end(), [](const Item &a, const Item &b) { - if (a.offset < b.offset) { - return true; - } - return a.id < b.id; - }); for (auto &item : items) { ids_data[low_offset] = item.id; scores_data[low_offset] = item.score; @@ -86,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, selected_scores->set_lod(lod); } -int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, - std::vector> *items) { +void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids, + std::vector> *items) { auto *pre_ids_data = pre_ids.data(); - - int res = 0; - for (size_t offset = 0; offset < items->size(); offset++) { - auto prefix_id = pre_ids_data[offset]; - if (prefix_id == end_id_) { - items->at(offset).clear(); - } else { - res++; + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { + size_t src_prefix_start = high_level[src_idx]; + size_t src_prefix_end = high_level[src_idx + 1]; + bool finish_flag = true; + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) { + for (auto &item : items->at(offset)) { + if (item.id != static_cast(end_id_) || + pre_ids_data[offset] != end_id_) { + finish_flag = false; + break; + } + } + if (!finish_flag) break; + } + if (finish_flag) { // all branchs of the beam (source sentence) end and + // prune this beam + for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) + items->at(offset).clear(); } } - - return res; } std::vector> BeamSearch::ToMap( @@ -115,19 +121,17 @@ std::vector> BeamSearch::ToMap( return 
result; } -std::vector> -BeamSearch::SelectTopBeamSizeItems() { +std::vector> BeamSearch::SelectTopBeamSizeItems( + const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores) { std::vector> result; std::vector items; // for each source sentence, select the top beam_size items across all // candidate sets. - while (NextItemSet(&items)) { - std::nth_element(std::begin(items), std::begin(items) + beam_size_, - std::end(items), [](const Item &a, const Item &b) { - // TODO(superjom) make score's comparation customizable. - // partial sort in descending order - return a.score > b.score; - }); + while (NextItemSet(pre_ids, pre_scores, &items)) { + std::nth_element( + std::begin(items), std::begin(items) + beam_size_, std::end(items), + [](const Item &a, const Item &b) { return a.score > b.score; }); // prune the top beam_size items. if (items.size() > beam_size_) { items.resize(beam_size_); @@ -146,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() { } // the candidates of a source -bool BeamSearch::NextItemSet(std::vector *items) { +bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids, + const framework::LoDTensor &pre_scores, + std::vector *items) { if (sent_offset_ >= ids_->NumElements(lod_level_)) { return false; } @@ -164,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector *items) { instance_dim *= ids.dims()[i]; } + auto *pre_ids_data = pre_ids.data(); + auto *pre_scores_data = pre_scores.data(); items->clear(); items->reserve(framework::product(ids.dims())); for (size_t offset = abs_lod[lod_level_][sent_offset_]; offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) { - for (size_t d = 0; d < instance_dim; d++) { - const size_t dim_offset = offset * instance_dim + d; - items->emplace_back(offset, ids_data[dim_offset], - scores_data[dim_offset]); + auto pre_id = pre_ids_data[offset]; + auto pre_score = pre_scores_data[offset]; + if (pre_id == end_id_) { + // Allocate all probability mass to eos_id for finished branchs and the + // other candidate ids can be ignored. + items->emplace_back(offset, end_id_, pre_score); + } else { + for (size_t d = 0; d < instance_dim; d++) { + const size_t dim_offset = offset * instance_dim + d; + items->emplace_back(offset, ids_data[dim_offset], + scores_data[dim_offset]); + } } } @@ -199,15 +215,27 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // inputs and outputs stored in proto - AddInput("pre_ids", "ids in previous step"); - AddInput("ids", "a LoDTensor of shape of [None,k]"); + AddInput("pre_ids", + "(LoDTensor) The LoDTensor containing the selected ids at the " + "previous step. It should be a tensor with shape (batch_size, 1) " + "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " + "thefirst step."); + AddInput("pre_scores", + "(LoDTensor) The LoDTensor containing the accumulated " + "scores corresponding to the selected ids at the previous step."); + AddInput("ids", + "(LoDTensor) The LoDTensor containing the candidates ids. 
Its " + "shape should be (batch_size * beam_size, K), where K supposed to " + "be beam_size."); AddInput("scores", - "a LoDTensor that has the same shape and LoD with `ids`"); + "(LoDTensor) The LodTensor containing the accumulated scores " + "corresponding to Input(ids) and its shape is the same as the " + "shape of Input(ids)."); AddOutput("selected_ids", - "a LoDTensor that stores the IDs selected by beam search"); - AddOutput( - "selected_scores", - "a LoDTensor that has the same shape and LoD with `selected_ids`"); + "A LodTensor that stores the IDs selected by beam search."); + AddOutput("selected_scores", + "A LoDTensor containing the accumulated scores corresponding to " + "Output(selected_ids)."); // Attributes stored in AttributeMap AddAttr("level", "the level of LoDTensor"); @@ -215,8 +243,21 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("end_id", "the token id which indicates the end of a sequence"); - AddComment( - "This is a beam search operator that help to generate sequences."); + AddComment(R"DOC( +This operator does the search in beams for one time step. +Specifically, it selects the top-K candidate word ids of current step from +Input(ids) according to their Input(scores) for all source sentences, +where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results +from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores) +are the output of beam_search at previous step, they are needed for special use +to handle ended candidate translations. The paths linking prefixes and selected +candidates are organized and reserved in lod. + +Note that the Input(scores) passed in should be accumulated scores, and +length penalty should be done with extra operators before calculating the +accumulated scores if needed, also suggest finding top-K before it and +using the top-K candidates following. +)DOC"); } }; @@ -253,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("selected_ids")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_ids = block->FindRecursiveOrCreateVar(o); + selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); } for (auto &o : op_desc.Output("selected_scores")) { - block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR); + auto &selected_scores = block->FindRecursiveOrCreateVar(o); + selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 46bc4f6f936929050276e8b3b93f1eddd62ac638..b5e2ed05924cc8b7bc06058b9b1103ba10be486e 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -132,6 +132,7 @@ class BeamSearch { * that means no candidates is provided, and the task will stop running. */ void operator()(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); /* @@ -153,14 +154,16 @@ class BeamSearch { protected: /* - * Delete all the records that follows the end token. + * Prune the source sentences all branchs finished, and it is optional. + * Pruning must one step later than finishing (thus pre_ids is needed here), + * since the end tokens must be writed out. 
*/ - int PruneEndidCandidates(const framework::LoDTensor& pre_ids, - std::vector>* items); + void PruneEndBeams(const framework::LoDTensor& pre_ids, + std::vector>* items); /* * Transform the items into a map whose key is offset, value is the items. - * NOTE low performance + * NOTE low performance. */ std::vector> ToMap( const std::vector>& inputs, size_t element_num); @@ -168,12 +171,16 @@ class BeamSearch { /* * For each source, select top beam_size records. */ - std::vector> SelectTopBeamSizeItems(); + std::vector> SelectTopBeamSizeItems( + const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores); /* * Get the items of next source sequence, return false if no remaining items. */ - bool NextItemSet(std::vector* items); + bool NextItemSet(const framework::LoDTensor& pre_ids, + const framework::LoDTensor& pre_scores, + std::vector* items); private: size_t beam_size_; @@ -192,24 +199,25 @@ template class BeamSearchOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* ids_var = context.Input("ids"); - auto* scores_var = context.Input("scores"); - auto* pre_ids_var = context.Input("pre_ids"); - PADDLE_ENFORCE_NOT_NULL(ids_var); - PADDLE_ENFORCE_NOT_NULL(scores_var); - PADDLE_ENFORCE_NOT_NULL(pre_ids_var); + auto* ids = context.Input("ids"); + auto* scores = context.Input("scores"); + auto* pre_ids = context.Input("pre_ids"); + auto* pre_scores = context.Input("pre_scores"); + PADDLE_ENFORCE_NOT_NULL(ids); + PADDLE_ENFORCE_NOT_NULL(scores); + PADDLE_ENFORCE_NOT_NULL(pre_ids); + PADDLE_ENFORCE_NOT_NULL(pre_scores); size_t level = context.Attr("level"); size_t beam_size = context.Attr("beam_size"); int end_id = context.Attr("end_id"); - BeamSearch alg(*ids_var, *scores_var, level, beam_size, end_id); - auto selected_ids_var = - context.Output("selected_ids"); - auto selected_scores_var = + BeamSearch alg(*ids, *scores, level, beam_size, end_id); + auto selected_ids = context.Output("selected_ids"); + auto selected_scores = context.Output("selected_scores"); - PADDLE_ENFORCE_NOT_NULL(selected_ids_var); - PADDLE_ENFORCE_NOT_NULL(selected_scores_var); - alg(*pre_ids_var, selected_ids_var, selected_scores_var); + PADDLE_ENFORCE_NOT_NULL(selected_ids); + PADDLE_ENFORCE_NOT_NULL(selected_scores); + alg(*pre_ids, *pre_scores, selected_ids, selected_scores); } }; } // namespace operators diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index ec666359aa2bd81f1323b54f9a03235740c3a696..c4f4b478fbfc87e4178155132781214575c1e6b0 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,7 +30,7 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 1, 4}); + vector level0({0, 2, 4}); vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); @@ -64,17 +64,22 @@ TEST(beam_search_op, run) { for (int i = 0; i < 4; i++) { pre_ids.mutable_data(place)[i] = i + 1; } + LoDTensor pre_scores; + pre_scores.Resize(framework::make_ddim(vector(4, 1))); + for (int i = 0; i < 4; i++) { + pre_scores.mutable_data(place)[i] = 0.1 * (i + 1); + } - BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0); + BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0); LoDTensor sids, sscores; - beamsearch(pre_ids, &sids, &sscores); + beamsearch(pre_ids, pre_scores, &sids, &sscores); LOG(INFO) << "score: " << sscores << endl; ASSERT_EQ(sids.lod(), 
sscores.lod()); - vector tids({2, 4, 3, 8}); - vector tscores({0.3, 0.5, 0.9, 0.7}); + vector tids({4, 2, 3, 8}); + vector tscores({0.5, 0.6, 0.9, 0.7}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 2572e813d656353a2187c29da89266733a32f3ce..2dc3399da183fbcf7664066f6f7ce12db3dc6d5e 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -110,6 +110,7 @@ REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp, ops::BilinearInterpOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad); -REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel); +REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel, + ops::BilinearInterpKernel); REGISTER_OP_CPU_KERNEL(bilinear_interp_grad, ops::BilinearInterpGradKernel); diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h index 8b03cd5a0635584a45782fe5a4823c37fe4fa8e8..70847cb8c1abe2e94bc844ab8117d1f23fea533b 100644 --- a/paddle/fluid/operators/bilinear_interp_op.h +++ b/paddle/fluid/operators/bilinear_interp_op.h @@ -46,8 +46,10 @@ class BilinearInterpKernel : public framework::OpKernel { int in_chw = channels * in_hw; int out_chw = channels * out_hw; - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; if (in_h == out_h && in_w == out_w) { memcpy(output, input, input_t->numel() * sizeof(T)); @@ -56,24 +58,24 @@ class BilinearInterpKernel : public framework::OpKernel { for (int i = 0; i < out_h; ++i) { // loop for images int h = ratio_h * i; int hid = (h < in_h - 1) ? 1 : 0; - T h1lambda = ratio_h * i - h; - T h2lambda = 1 - h1lambda; + float h1lambda = ratio_h * i - h; + float h2lambda = 1.f - h1lambda; for (int j = 0; j < out_w; ++j) { int w = ratio_w * j; int wid = (w < in_w - 1) ? 1 : 0; - T w1lambda = ratio_w * j - w; - T w2lambda = 1 - w1lambda; + float w1lambda = ratio_w * j - w; + float w2lambda = 1.f - w1lambda; // calculate four position for bilinear interpolation const T* in_pos = &input[k * in_chw + h * in_w + w]; T* out_pos = &output[k * out_chw + i * out_w + j]; for (int c = 0; c < channels; ++c) { // loop for channels // bilinear interpolation - out_pos[0] = + out_pos[0] = static_cast( h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + h1lambda * (w2lambda * in_pos[hid * in_w] + - w1lambda * in_pos[hid * in_w + wid]); + w1lambda * in_pos[hid * in_w + wid])); in_pos += in_hw; out_pos += out_hw; } @@ -117,8 +119,10 @@ class BilinearInterpGradKernel : public framework::OpKernel { int in_chw = channels * in_hw; int out_chw = channels * out_hw; - T ratio_h = (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; - T ratio_w = (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? 
static_cast(in_w - 1) / (out_w - 1) : 0.f; if (in_h == out_h && in_w == out_w) { memcpy(d_input, d_output, d_input_t->numel() * sizeof(T)); @@ -127,22 +131,24 @@ class BilinearInterpGradKernel : public framework::OpKernel { for (int i = 0; i < out_h; ++i) { // loop for images int h = ratio_h * i; int hid = (h < in_h - 1) ? 1 : 0; - T h1lambda = ratio_h * i - h; - T h2lambda = 1 - h1lambda; + float h1lambda = ratio_h * i - h; + float h2lambda = 1 - h1lambda; for (int j = 0; j < out_w; ++j) { int w = ratio_w * j; int wid = (w < in_w - 1) ? 1 : 0; - T w1lambda = ratio_w * j - w; - T w2lambda = 1 - w1lambda; + float w1lambda = ratio_w * j - w; + float w2lambda = 1 - w1lambda; T* in_pos = &d_input[k * in_chw + h * in_w + w]; const T* out_pos = &d_output[k * out_chw + i * out_w + j]; for (int c = 0; c < channels; ++c) { // loop for channels - in_pos[0] += h2lambda * w2lambda * out_pos[0]; - in_pos[wid] += h2lambda * w1lambda * out_pos[0]; - in_pos[hid * in_w] += h1lambda * w2lambda * out_pos[0]; - in_pos[hid * in_w + wid] += h1lambda * w1lambda * out_pos[0]; + in_pos[0] += static_cast(h2lambda * w2lambda * out_pos[0]); + in_pos[wid] += static_cast(h2lambda * w1lambda * out_pos[0]); + in_pos[hid * in_w] += + static_cast(h1lambda * w2lambda * out_pos[0]); + in_pos[hid * in_w + wid] += + static_cast(h1lambda * w1lambda * out_pos[0]); in_pos += in_hw; out_pos += out_hw; } diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4219a429a53eb4869426a2674109555fb784b85 --- /dev/null +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/send_recv_util.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +class CheckpointNotifyOp : public framework::OperatorBase { + public: + CheckpointNotifyOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + std::vector epmap = Attr>("epmap"); + std::string dir = Attr("dir"); + std::string lookup_table_name = Attr("lookup_table"); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); + for (size_t i = 0; i < epmap.size(); i++) { + auto lookup_table_save_dir = + string::Sprintf("%s/%s_%d", dir, lookup_table_name, i); + rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir); + VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name + << " and dir:" << dir << " to " << epmap[i]; + } + rpc_client->Wait(); + } +}; + +class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Parameter Server endpoints in the order") + .SetDefault({"127.0.0.1:6164"}); + AddAttr( + "dir", "(string, default '') indicate the folder checkpoint will use"); + AddAttr("lookup_table", + "(string, default '') the lookup table name"); + AddComment(R"DOC( +CheckpointNotify operator + +This operator will send lookup table and it's checkpoint direcoty to listen_and_serve op at +the parameter server. +)DOC"); + } +}; + +class CheckpointNotifyOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(checkpoint_notify, ops::CheckpointNotifyOp, + paddle::framework::EmptyGradOpMaker, + ops::CheckpointNotifyOpMaker, + ops::CheckpointNotifyOpShapeInference); diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc index 62636bb2f9078768180ab1e0016e3565617d24d2..dc43c69be0bcea2b82e1d61a9a5b2e03129d4f8e 100644 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -91,32 +91,31 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { "(int64_t). The number of chunks both in Inference and Label on the " "given mini-batch."); AddAttr("num_chunk_types", - "(int). The number of chunk type. See below for details."); - AddAttr( - "chunk_scheme", - "(string, default IOB). The labeling scheme indicating " - "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below " - "for details.") + "The number of chunk type. See the description for details."); + AddAttr("chunk_scheme", + "The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or " + "plain. See the description" + "for details.") .SetDefault("IOB"); AddAttr>("excluded_chunk_types", - "(list) A list including chunk type ids " + "A list including chunk type ids " "indicating chunk types that are not counted. 
" - "See below for details.") + "See the description for details.") .SetDefault(std::vector{}); AddComment(R"DOC( For some basics of chunking, please refer to -‘Chunking with Support Vector Machines ’. +'Chunking with Support Vector Machines '. - -CheckEvalOp computes the precision, recall, and F1-score of chunk detection, +ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. Here is a NER example of labeling for these tagging schemes: - - Li Ming works at Agricultural Bank of China in Beijing. - IO: I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC - IOB: B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC - IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC - IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + + Li Ming works at Agricultural Bank of China in Beijing. + IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC There are three chunk types(named entity types) including PER(person), ORG(organization) and LOC(LOCATION), and we can see that the labels have the form -. @@ -124,31 +123,31 @@ and LOC(LOCATION), and we can see that the labels have the form -("force_cpu", - "(bool, default false) Force fill output variable to cpu " + "Force fill output variable to cpu " "memory. Otherwise, fill output variable to the running " - "device") - .SetDefault(false); - AddOutput("Out", string::Sprintf( - "(LoDTensor) n-dim bool tensor. Each element is %s", - comment.equation)); - AddComment(string::Sprintf(R"DOC(%s Operator - + "device [default true].") + .SetDefault(true); + AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC( It operates element-wise on X and Y, and returns the Out. Each of them is a N-dim tensor. X and Y could be any type. The each element of the Out tensor is -calculated by %s +calculated by $%s$ )DOC", - comment.type, comment.equation)); - AddAttr("axis", - "(int, default -1). The start dimension index " - "for broadcasting Y onto X.") + comment.equation)); + AddAttr( + "axis", + "The start dimension index for broadcasting Y onto X. [default -1]") .SetDefault(-1) .EqualGreaterThan(-1); } diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 1b1b8bf5ed959dd9c2ce8c9f5c905a75b81865fd..a496301526f58875ff51aeaa5b2094c3c656531c 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -60,34 +60,45 @@ template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input(framework::GradVarName("Out")); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.Outputs(framework::GradVarName("X")); auto outs = ctx.MultiOutput(framework::GradVarName("X")); int64_t axis = static_cast(ctx.Attr("axis")); + // get output tensor that the name is not kEmptyVarName + std::vector outputs; + for (size_t j = 0; j < outs.size(); ++j) { + if (out_var_names[j] != framework::kEmptyVarName) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs.push_back(outs[j]); + } else { + outputs.push_back(nullptr); + } + } + // Sometimes direct copies will be faster, this maybe need deeply analysis. 
if (axis == 0 && outs.size() < 10) { size_t input_offset = 0; - auto in_stride = framework::stride_numel(in->dims()); + const auto in_stride = framework::stride_numel(out_grad->dims()); - for (auto& out : outs) { - out->mutable_data(ctx.GetPlace()); - auto out_stride = framework::stride_numel(out->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), - out_stride, in->data() + input_offset, - in_stride, out_stride[axis]); + for (size_t i = 0; i < outs.size(); ++i) { + auto out_stride = framework::stride_numel(ins[i]->dims()); + auto* out = outputs[i]; + if (out != nullptr) { + StridedNumelCopyWithAxis( + ctx.device_context(), axis, out->data(), out_stride, + out_grad->data() + input_offset, in_stride, out_stride[axis]); + } input_offset += out_stride[axis]; } } else { - std::vector outputs(outs.size()); - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - outputs[j] = *outs[j]; - } - auto& dev_ctx = ctx.template device_context(); paddle::operators::math::ConcatGradFunctor concat_grad_functor; - concat_grad_functor(dev_ctx, *in, static_cast(axis), &outputs); + concat_grad_functor(dev_ctx, *out_grad, ins, static_cast(axis), + &outputs); } } }; diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index 5984f80d04bdeb232f8e24264ae979725af24ef4..580fde753816c30b188b8a99cc63fcbafde64e25 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" namespace paddle { namespace operators { @@ -47,7 +48,7 @@ class ConditionalOp : public framework::OperatorBase { if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { PADDLE_THROW("should have one initialized input as condition"); } - if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() && // NOLINT + if (!(framework::IsType(ips[0]->type()) && // NOLINT ips[0]->numel() == 1)) { PADDLE_THROW( "condition input's data type should be bool, " @@ -204,9 +205,10 @@ class ConditionalBlockGradInferShape : public framework::InferShapeBase { context->SetOutputsDim(framework::GradVarName("Params"), context->GetInputsDim("Params")); } - PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X"))); - context->SetOutputsDim(framework::GradVarName("X"), - context->GetInputsDim("X")); + if (context->HasOutputs(framework::GradVarName("X"))) { + context->SetOutputsDim(framework::GradVarName("X"), + context->GetInputsDim("X")); + } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 0b363f5c43f9fc191790e5cca629ffc46eb9388c..eeb98ee44f206dbfbe1f61689aa9843122ae3f92 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -156,7 +156,7 @@ Parameters(strides, paddings) are two elements. These two elements represent hei and width, respectively. The input(X) size and output(Out) size may be different. 
-Example: +For an example: Input: Input shape: $(N, C_{in}, H_{in}, W_{in})$ Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ @@ -302,6 +302,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( namespace ops = paddle::operators; +// conv2d_transpose REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -317,6 +318,7 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel); +// conv3d_transpose REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, paddle::framework::DefaultGradOpDescMaker); @@ -331,3 +333,19 @@ REGISTER_OP_CPU_KERNEL( ops::GemmConvTransposeGradKernel, ops::GemmConvTransposeGradKernel); + +// depthwise conv2d_transpose +REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, + ops::Conv2DTransposeOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc index 640fa7d14a079debeceb54d8775c4ede7da1b536..a6d5665df83ae5c89d42840e91a6abd853fedd12 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -15,25 +15,28 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_transpose_op.h" namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL( - conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); +// conv2d +REGISTER_OP_CUDA_KERNEL(conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// conv3d +REGISTER_OP_CUDA_KERNEL(conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// depthwise conv2d +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, + ops::DepthwiseConvTransposeKernel, + ops::DepthwiseConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, + ops::DepthwiseConvTransposeGradKernel, + ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 1dcfc651fdd79aed50736d05d38ec8576b183d41..0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" @@ -316,5 +317,74 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } } }; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ(groups, filter.dims()[0]); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1); + } + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, + output); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, + input_grad); + } + + if (filter_grad) { + math::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, + filter_grad); + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index 046dd11910bb0ff46b567c3b89883582782205d3..8f3644039f9950a8a70e2fd66c20837a5f52bd7f 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -76,9 +76,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddComment(R"DOC( -Cosine Similarity Operator. 
+**Cosine Similarity Operator** -$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$ +$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$ The input X and Y must have the same shape, except that the 1st dimension of input Y could be just 1 (different from input X), which will be diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 40f43936db662f2b18ffa540da4794755b5d6fc7..c27befe1143baa68add4b56f3572eab75272c3a5 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -53,21 +53,18 @@ sequence of observed tags. The output of this operator changes according to whether Input(Label) is given: 1. Input(Label) is given: - -This happens in training. This operator is used to co-work with the chunk_eval -operator. - -When Input(Label) is given, the crf_decoding operator returns a row vector -with shape [N x 1] whose values are fixed to be 0, indicating an incorrect -prediction, or 1 indicating a tag is correctly predicted. Such an output is the -input to chunk_eval operator. + This happens in training. This operator is used to co-work with the chunk_eval + operator. + When Input(Label) is given, the crf_decoding operator returns a row vector + with shape [N x 1] whose values are fixed to be 0, indicating an incorrect + prediction, or 1 indicating a tag is correctly predicted. Such an output is the + input to chunk_eval operator. 2. Input(Label) is not given: - -This is the standard decoding process. + This is the standard decoding process. The crf_decoding operator returns a row vector with shape [N x 1] whose values -range from 0 to maximum tag number - 1. Each element indicates an index of a +range from 0 to maximum tag number - 1, Each element indicates an index of a predicted tag. )DOC"); } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 91cfbbda7352c9b1676aae99e2bd57ccc9e10069..772e80bbea4f2db654cefd0dcb404bc33803bd7a 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -52,7 +52,7 @@ static std::vector GetOffsets(const framework::ExecutionContext& ctx) { } else { res = ctx.Attr>("offsets"); PADDLE_ENFORCE_EQ( - rank, res.size(), + rank, static_cast(res.size()), "Offsets size should be equal to dimension size of input tensor."); } return res; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a..a3bec3da45136bca5cb2763e7ffd6b67703a1813 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -124,8 +124,7 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss.") - .Reuse("X"); + "[N x 1]. 
The cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 92bb835e8f18e17ae1355fdec29f43b8ffb70460..5302b822d6b9f232e9ccd0d03cc549d7d5044ebf 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel { class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input of Cumsum operator"); - AddOutput("Out", "Output of Cumsum operator"); + AddInput("X", "Input of cumsum operator"); + AddOutput("Out", "Output of cumsum operator"); AddAttr("axis", - "(int, default -1). The dimenstion to accumulate along. " - "-1 means the last dimenstion") + "The dimenstion to accumulate along. -1 means the last " + "dimenstion [default -1].") .SetDefault(-1) .EqualGreaterThan(-1); AddAttr("exclusive", - "bool, default false). Whether to perform exclusive cumsum") + "Whether to perform exclusive cumsum. [default false].") .SetDefault(false); AddAttr("reverse", - "bool, default false). If true, the cumsum is performed in " - "the reversed direction") + "If true, the cumsum is performed in the reversed direction. " + "[default false].") .SetDefault(false); AddComment(R"DOC( The cumulative sum of the elements along a given axis. diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt deleted file mode 100644 index abc5aad0430e71928a441c9488dda16dfdd63b9c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -if(NOT WITH_DISTRIBUTE) - return() -endif() - - -if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor - selected_rows memory) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr - cares zlib protobuf sendrecvop_grpc SERIAL) - cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc - grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - proto_desc lookup_table_op SERIAL) - return() -endif() - - -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) - -find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) -ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) - - -find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) -ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) - -cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS 
sendrecvop_brpc - brpc protobuf leveldb gflags glog - protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/detail/brpc_client.cc deleted file mode 100644 index 9a4e410f1d83e93883438fae116c38eb60787673..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/brpc_client.cc +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/detail/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" - -namespace paddle { -namespace operators { -namespace detail { - -DEFINE_int32(brpc_channel_num, 24, - "Number of channels to send requests connected to one server"); -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); - return; - } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; -} - -bool BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - framework::AsyncIO( - [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - google::protobuf::Closure* done = - brpc::NewCallback(&HandleSendResponse, cntl, response); - - sendrecv::VariableMessage request; - ch_ctx->stub->SendVariable(cntl, &request, response, done); - }); - req_count_++; - - return true; -} - -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. 
- std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); - return; - } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; - - // framework::Variable* outvar = nullptr; - // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); -} - -bool BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - framework::AsyncIO( - [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); - - req_count_++; - - return true; -} - -bool BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] {}); - - req_count_++; - return true; -} - -void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; -} - -void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; -} - -void BRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; - options.protocol = "baidu_std"; - options.connection_type = "pooled"; - options.connect_timeout_ms = 100; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - LOG(ERROR) << "Fail to initialize channel"; - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - return q; -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/detail/brpc_client.h deleted file mode 100644 index 1e953ea431d51a9586bfd0b352c7f27d079ff1a8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/brpc_client.h +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/rpc_client.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace detail { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; - - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; - - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = RPCClient::rpc_time_out) override; - - void AsyncSendBatchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; - - void AsyncSendFetchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; - - void Wait() override; - - private: - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/detail/brpc_server.cc deleted file mode 100644 index 2170abe679f9ededff3b53e3139e56f8aad227cb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/brpc_server.cc +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/detail/brpc_server.h" -#include "paddle/fluid/operators/detail/request_handler.h" - -namespace sendrecv { - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) - : request_send_h_(nullptr), - request_get_h_(nullptr), - request_prefetch_h_(nullptr) { - auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - } - - it = rpc_call_map.find(paddle::operators::detail::kRequestSend); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - } - - it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE(request_send_h_ != nullptr, - "RequestSend handler should be registed first!"); - brpc::ClosureGuard done_guard(done); - - paddle::framework::Scope* local_scope = request_send_h_->scope(); - paddle::framework::Variable* outvar = nullptr; - paddle::framework::Variable* invar = nullptr; - - std::string varname = request->varname(); - - if (!request_send_h_->sync_mode()) { - local_scope = &request_send_h_->scope()->NewScope(); - invar = local_scope->Var(varname); - } else { - invar = local_scope->FindVar(varname); - } - - request_send_h_->Handle(varname, local_scope, invar, &outvar); - - if (!request_send_h_->sync_mode()) { - request_send_h_->scope()->DeleteScope(local_scope); - } - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE(request_get_h_ != nullptr, - "RequestGet handler should be registed first!"); - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE(request_prefetch_h_ != nullptr, - "kRequestPrefetch handler should be registed first!"); - } - - private: - paddle::operators::detail::RequestHandler* request_send_h_; - paddle::operators::detail::RequestHandler* request_get_h_; - paddle::operators::detail::RequestHandler* request_prefetch_h_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace detail { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - LOG(FATAL) << "Fail to add service"; - return; - } - - brpc::ServerOptions options; - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - LOG(FATAL) << "Fail to start EchoServer" << bind_address_; - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace detail -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/detail/brpc_server.h deleted file mode 100644 index 0105c8074a46849031d8fa9c21a5507a982ec3c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace detail { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace detail -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/detail/bytebuffer_stream.cc deleted file mode 100644 index a14171563edb0ac9a22b7ae493c965de3efb7823..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/bytebuffer_stream.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. - -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" - -namespace paddle { -namespace operators { -namespace detail { - -GrpcByteBufferSource::GrpcByteBufferSource() {} - -bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { - cur_ = -1; - left_ = 0; - ptr_ = nullptr; - byte_count_ = 0; - bool ok = src.Dump(&slices_).ok(); - if (!ok) { - slices_.clear(); - } - return ok; -} - -bool GrpcByteBufferSource::Next(const void** data, int* size) { - // Use loop instead of if in case buffer contained empty slices. - while (left_ == 0) { - // Advance to next slice. - cur_++; - if (cur_ >= slices_.size()) { - return false; - } - const ::grpc::Slice& s = slices_[cur_]; - left_ = s.size(); - ptr_ = reinterpret_cast(s.begin()); - } - - *data = ptr_; - *size = left_; - byte_count_ += left_; - ptr_ += left_; - left_ = 0; - return true; -} - -void GrpcByteBufferSource::BackUp(int count) { - ptr_ -= count; - left_ += count; - byte_count_ -= count; -} - -bool GrpcByteBufferSource::Skip(int count) { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; -} - -google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { - return byte_count_; -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h deleted file mode 100644 index 054dd4ff294414cca55d7e033f2c5403bbb85526..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ /dev/null @@ -1,188 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
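The NOTE above explains why these stream classes exist: they let the gRPC path parse a message straight out of a grpc::ByteBuffer instead of first copying the tensor bytes into a temporary string. Below is a minimal sketch of the ZeroCopyInputStream contract that GrpcByteBufferSource implements, using protobuf's stock ArrayInputStream as a stand-in for a ByteBuffer-backed source; the payload string and chunk size are illustrative assumptions, not part of the deleted code.

// Sketch only: exercises Next() and ByteCount() from the ZeroCopyInputStream
// contract implemented by GrpcByteBufferSource above (BackUp and Skip do the
// same pointer bookkeeping). ArrayInputStream is a stock protobuf stream used
// here as a stand-in for a grpc::ByteBuffer-backed source.
#include <iostream>
#include <string>

#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

int main() {
  const std::string payload = "serialized-tensor-bytes-0123456789";
  // Hand out the buffer in 8-byte chunks, the way a ByteBuffer is handed out
  // slice by slice.
  google::protobuf::io::ArrayInputStream src(
      payload.data(), static_cast<int>(payload.size()), /*block_size=*/8);

  const void* data = nullptr;
  int size = 0;
  std::string reassembled;
  while (src.Next(&data, &size)) {
    // Next() exposes the underlying storage directly; the append below is only
    // to verify that every byte is visited exactly once.
    reassembled.append(static_cast<const char*>(data), size);
  }

  std::cout << "ByteCount() = " << src.ByteCount() << ", round-trip "
            << (reassembled == payload ? "ok" : "failed") << std::endl;
  return 0;
}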
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace detail { -// Source provides a way for a particular RPC implementation to provide -// received data to ParseFrom. -class Source { - public: - virtual ~Source() {} - - // Return the stream that contains the data to be parsed. - // Note that this method might be invoked more than once if - // ParseFrom needs to fall back to a more expensive parsing method. - // Every call must return a stream pointing at the beginning of - // the serialized RecvTensorResponse. - // - // Note that a subsequent call to contents() invalidates previous - // results of contents(). - // - // Ownership of the returned stream is retained by the Source and - // should not be deleted by the caller. - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; -}; - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. 
- bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. - ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc deleted file mode 100644 index 02ffe3651e1deefcf6981c3d304d64b9a01661bf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detail/grpc_client.h" - -#include - -#include - -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace detail { - -void GRPCClient::InitImpl() { InitEventLoop(); } - -void GRPCClient::InitEventLoop() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); -} - -void GRPCClient::SendComplete() { - for (auto& it : channels_) { - this->AsyncSendComplete(it.first); - } -} - -GRPCClient::~GRPCClient() { - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - } - client_thread_->join(); -} - -bool GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, - this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); - - // varhandle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - - // stub context - SendProcessor* s = new SendProcessor(ch); - s->Prepare(var_h, time_out); - s->response_call_back_ = nullptr; - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - return true; -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - framework::Variable* outvar = nullptr; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -bool GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, - this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - - // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); - s->response_call_back_ = ProcGetResponse; - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - - req_count_++; - - 
return true; -} - -bool GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); - - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = out_var_name_val; - var_h.ctx = p_ctx; - - // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); - s->response_call_back_ = ProcGetResponse; - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - }); - - req_count_++; - return true; -} - -void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; -} - -void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; -} - -void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); - - sendrecv::VariableMessage req; - req.set_varname(COMPLETE_MESSAGE); - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; -} - -void GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - while (cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE(c); - if (c->status_.ok()) { - c->Process(); - } else { - LOG(ERROR) << "var: " << c->var_h_.String() - << " grpc error:" << c->status_.error_message(); - } - delete c; - { - std::lock_guard lk(sync_mutex_); - req_count_--; - } - sync_cond_.notify_all(); - } -} - -std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - // TODO(Yancey1989): make grpc client completely thread-safe - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - - grpc::ChannelArguments args; - args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); - args.SetMaxSendMessageSize(std::numeric_limits::max()); - 
args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - - auto ch = - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); - channels_[ep] = ch; - return ch; -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h deleted file mode 100644 index 44000c028b499d9ad1a0e0dd40a5e287cd61d143..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/grpc_client.h +++ /dev/null @@ -1,231 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/rpc_client.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace detail { - -struct VarHandle { - std::string ep; - const platform::DeviceContext* ctx; - const framework::Scope* scope; - std::string name; - - std::string String() const { - std::ostringstream s; - s << "name:[" << name << "] ep:[" << ep << "]"; - return s.str(); - } -}; - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - explicit BaseProcessor(std::shared_ptr ch) { - context_ = nullptr; - } - - virtual ~BaseProcessor() {} - - virtual void Prepare(const VarHandle& var_info, int64_t time_out) { - context_.reset(new grpc::ClientContext()); - var_h_ = var_info; - - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); - - context_->set_deadline(deadline); - } - - virtual void Prepare(int64_t time_out) { - context_.reset(new grpc::ClientContext()); - - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); - - context_->set_deadline(deadline); - } - - virtual void Process() = 0; - - std::unique_ptr context_; - grpc::Status status_; - VarHandle var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - virtual void Process() { - if (response_call_back_) { - response_call_back_(var_h_, reply_); - } - } - - ::grpc::GenericStub stub_g_; - 
::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - virtual void Process() { - if (response_call_back_) { - response_call_back_(var_h_, reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - virtual void Process() {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - virtual void Process() {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() {} - virtual ~GRPCClient(); - - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; - - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = RPCClient::rpc_time_out) override; - - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = RPCClient::rpc_time_out) override; - - void AsyncSendBatchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; - - void AsyncSendFetchBarrier( - const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out) override; - - void Wait() override; - - void SendComplete() override; - - protected: - void InitImpl() override; - - private: - // InitEventLoop should only be called by Init() - void InitEventLoop(); - - void Proceed(); - - void AsyncSendComplete(const std::string& ep, - int64_t time_out = RPCClient::rpc_time_out); - - std::shared_ptr GetChannel(const std::string& ep); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::unique_ptr client_thread_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_serde_test.cc b/paddle/fluid/operators/detail/grpc_serde_test.cc deleted file mode 100644 index 15892295e6901fe649788c9e34604008fc8cbdfa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/grpc_serde_test.cc +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::detail::VariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place 
place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::detail::VariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc deleted file mode 100644 index 5a87258901c6563fe793d4041f344011a56d9a01..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ /dev/null @@ -1,359 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/detail/grpc_server.h" - -using ::grpc::ServerAsyncResponseWriter; - -namespace paddle { -namespace operators { -namespace detail { -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE(cq_); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx(), - !request_handler->sync_mode())); - int method_id = static_cast(detail::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(3) << "RequestSend var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(detail::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return 
request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - VLOG(3) << "RequestGet " << varname; - - auto scope = request_handler_->scope(); - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new VariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! - framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -void AsyncGRPCServer::StartServer() { - ::grpc::ServerBuilder builder; - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - builder.RegisterService(&service_); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - LOG(INFO) << "Server listening on " << bind_address_ - << " selected port: " << selected_port_; - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new 
std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(3) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(3) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(3) << t.first << " shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(3) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "register send rpc_name:" << rpc_name - << ", handler:" << rpc_call_map_[kRequestSend]; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(&service_, cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(&service_, cq.get(), handler, req_id); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(&service_, cq.get(), handler, req_id); - } else { - PADDLE_ENFORCE(false, "not supported rpc"); - } - - reqs[req_id] = b; - - VLOG(4) << "Create RequestSend status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(3) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - // reference: - // https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event:argument name[" - << base->GetReqName() << "]"; - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id - << ", status:" << base->Status(); - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h deleted file mode 100644 index f1db7590f6f14d5d44acc12453861a446e278cd2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/grpc_server.h +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/grpc_service.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace detail { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. - void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - GrpcService::AsyncService service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace detail -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h deleted file mode 100644 index e0505c2b9d0903837713d7e0032b01ab091c2e04..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/grpc_service.h +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
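The HandleRequest loop deleted above drives every RPC slot through a small PROCESS/FINISH state machine keyed by an integer tag. The following is a minimal, framework-free sketch of that lifecycle, assuming a plain std::queue<int> in place of ::grpc::ServerCompletionQueue; RequestSlot and the other names here are illustrative, not Paddle or gRPC APIs.

// Sketch only: the queue of integer tags stands in for the completion queue.
#include <cassert>
#include <iostream>
#include <memory>
#include <queue>
#include <vector>

enum class CallStatus { kProcess, kFinish };

class RequestSlot {
 public:
  explicit RequestSlot(int id) : id_(id), status_(CallStatus::kProcess) {}
  CallStatus Status() const { return status_; }
  void Process(std::queue<int>* cq) {
    std::cout << "processing request in slot " << id_ << "\n";
    status_ = CallStatus::kFinish;
    cq->push(id_);  // Finishing the reply makes the tag appear on the queue again.
  }

 private:
  int id_;
  CallStatus status_;
};

int main() {
  constexpr int kRequestBufSize = 3;
  std::vector<std::unique_ptr<RequestSlot>> slots(kRequestBufSize);
  std::queue<int> completion_queue;

  // Pre-register one pending request per slot, as TryToRegisterNewOne does.
  for (int i = 0; i < kRequestBufSize; ++i) {
    slots[i] = std::make_unique<RequestSlot>(i);
    completion_queue.push(i);  // Simulate an incoming RPC landing on slot i.
  }

  while (!completion_queue.empty()) {
    int tag = completion_queue.front();
    completion_queue.pop();
    RequestSlot* slot = slots[tag].get();
    switch (slot->Status()) {
      case CallStatus::kProcess:
        slot->Process(&completion_queue);  // First event: run the handler.
        break;
      case CallStatus::kFinish:
        // Second event: the reply went out; recycle the slot for a new RPC.
        slots[tag] = std::make_unique<RequestSlot>(tag);
        break;
    }
  }
  assert(completion_queue.empty());
  return 0;
}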
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/detail/variable_response.h" - -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. -template <> -class SerializationTraits { - public: - static Status Serialize( - const paddle::operators::detail::VariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); - return Status(); - } - static Status Deserialize(grpc_byte_buffer* buffer, - paddle::operators::detail::VariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::detail::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace detail { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kPrefetchVariable) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - } - - // Shouldn't be reached. 
- PADDLE_ENFORCE(false, "Invalid id: not found valid method name"); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h index da1de72dad00db3ffe609e17bd198ef0a56bbfcd..6f4a15caa5542a45cd8e26a72b055ca8948069d0 100644 --- a/paddle/fluid/operators/detail/macros.h +++ b/paddle/fluid/operators/detail/macros.h @@ -14,14 +14,22 @@ #pragma once +#ifdef PADDLE_WITH_DISTRIBUTE + #ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" -#define RPCSERVER_T detail::AsyncGRPCServer -#define RPCCLIENT_T detail::GRPCClient -#else -#include "paddle/fluid/operators/detail/brpc_client.h" -#include "paddle/fluid/operators/detail/brpc_server.h" -#define RPCSERVER_T detail::AsyncBRPCServer -#define RPCCLIENT_T detail::BRPCClient -#endif + +#include "paddle/fluid/operators/distributed/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc_server.h" +#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer +#define RPCCLIENT_T paddle::operators::distributed::GRPCClient + +#else // PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc_server.h" +#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer +#define RPCCLIENT_T paddle::operators::distributed::BRPCClient + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/detail/proto_encoder_helper.h deleted file mode 100644 index d91d054b2507f32d1e948dde33da06a70cabe775..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/proto_encoder_helper.h +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
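proto_encoder_helper.h below hand-rolls protocol-buffer varints: seven payload bits per byte, with the high bit flagging that more bytes follow. A small standalone round-trip check of that base-128 scheme, independent of the Paddle helpers (EncodeVarint and DecodeVarint are illustrative names, not the deleted functions):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint8_t> EncodeVarint(uint64_t v) {
  std::vector<uint8_t> out;
  while (v >= 0x80) {
    out.push_back(static_cast<uint8_t>(v) | 0x80);  // low 7 bits + continuation bit
    v >>= 7;
  }
  out.push_back(static_cast<uint8_t>(v));  // final byte, continuation bit clear
  return out;
}

uint64_t DecodeVarint(const std::vector<uint8_t>& bytes) {
  uint64_t v = 0;
  int shift = 0;
  for (uint8_t b : bytes) {
    v |= static_cast<uint64_t>(b & 0x7f) << shift;
    shift += 7;
    if ((b & 0x80) == 0) break;  // last byte of this varint
  }
  return v;
}

int main() {
  // 300 encodes as 0xAC 0x02, the classic protobuf example.
  assert((EncodeVarint(300) == std::vector<uint8_t>{0xAC, 0x02}));
  for (uint64_t v : {0ull, 1ull, 127ull, 128ull, 300ull, 1ull << 28, 1ull << 40}) {
    assert(DecodeVarint(EncodeVarint(v)) == v);  // round trip is lossless
  }
  return 0;
}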
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace detail { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() { - // Make sure callers didn't do operations that went over max_size promised - PADDLE_ENFORCE_LE(p_, limit_); - } - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h deleted file mode 100644 index a2d08747d59220d30a5b8fd56074fd2739ae3bab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/request_handler.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -namespace paddle { -namespace operators { -namespace detail { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" - -class RPCServer; - -class RequestHandler { - public: - explicit RequestHandler(bool sync_mode) - : sync_mode_(sync_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - bool sync_mode() { return sync_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const std::string& out_var_name = "") = 0; - - protected: - const bool sync_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - - RPCServer* rpc_server_; -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc deleted file mode 100644 index 7425bee798cd9ba0af8cd777a6db63862c8a4031..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
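request_handler.h above defines the Handle(varname, scope, invar, outvar) contract, and the implementations that follow route on the variable name, treating names like BATCH_BARRIER@RECV and COMPLETE@RECV as control messages rather than data. A simplified sketch of that dispatch, with a plain map and counters standing in for Scope and RPCServer; only the message constants mirror the #defines above, the rest is illustrative.

#include <iostream>
#include <string>
#include <unordered_map>

constexpr char kBatchBarrier[] = "BATCH_BARRIER@RECV";
constexpr char kComplete[] = "COMPLETE@RECV";

// Returns true when the request was fully handled (control message or stored
// variable), mirroring the bool Handle(...) signature of the real handlers.
bool HandleSend(const std::string& varname,
                std::unordered_map<std::string, int>* store,
                int* barrier_counter, int* client_num) {
  if (varname == kBatchBarrier) {
    ++*barrier_counter;  // like rpc_server_->IncreaseBatchBarrier(kRequestSend)
    return true;
  }
  if (varname == kComplete) {
    --*client_num;       // like rpc_server_->DecreaseClientNum()
    return true;
  }
  (*store)[varname] += 1;  // ordinary variable: record the received payload
  return true;
}

int main() {
  std::unordered_map<std::string, int> store;
  int barrier = 0, clients = 2;
  HandleSend("w@grad", &store, &barrier, &clients);
  HandleSend(kBatchBarrier, &store, &barrier, &clients);
  HandleSend(kComplete, &store, &barrier, &clients);
  std::cout << "vars=" << store.size() << " barrier=" << barrier
            << " clients=" << clients << "\n";  // vars=1 barrier=1 clients=1
  return 0;
}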
- -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" -#include "paddle/fluid/operators/detail/rpc_server.h" - -namespace paddle { -namespace operators { -namespace detail { - -bool RequestSendHandler::Handle(const std::string& varname, - framework::Scope* scope, - framework::Variable* invar, - framework::Variable** outvar, - const std::string& out_var_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Async - if (!sync_mode_) { - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } - return true; - } - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv batch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - rpc_server_->DecreaseClientNum(); - } else { - VLOG(3) << "sync: received var_name: " << varname; - if (sync_mode_) { - rpc_server_->WaitCond(kRequestSend); - } - - if (invar == nullptr) { - LOG(ERROR) << "sync: Can not find server side var: " << varname; - PADDLE_THROW("sync: Can not find server side var"); - return false; - } - if (invar->IsType()) { - std::unique_lock lock(mutex_sparse_vars_); - sparse_vars_.push_back(invar); - } - } - return true; -} - -void RequestSendHandler::ResetSparseVarRecorder() { - std::unique_lock lock(mutex_sparse_vars_); - for (auto* var : sparse_vars_) { - var->GetMutable()->mutable_rows()->clear(); - } - sparse_vars_.clear(); -} - -bool RequestGetHandler::Handle(const std::string& varname, - framework::Scope* scope, - framework::Variable* invar, - framework::Variable** outvar, - const std::string& out_var_name) { - VLOG(4) << "RequestGetHandler:" << varname; - - if (varname != FETCH_BARRIER_MESSAGE) { - if (sync_mode_) { - rpc_server_->WaitCond(kRequestGet); - } - *outvar = scope_->FindVar(varname); - return true; - } - - // FETCH_BARRIER_MESSAGE - if (sync_mode_) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } - - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string& varname, - framework::Scope* scope, - framework::Variable* invar, - framework::Variable** outvar, - const std::string& out_var_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - auto var_desc = program_->Block(0).FindVar(out_var_name); - InitializeVariable(*outvar, var_desc->GetType()); - executor_->RunPreparedContext( - (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); - - return true; -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h deleted file mode 100644 index 3f77c09a9598b431d747f1b824615e49d939098e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/request_handler.h" - -namespace paddle { -namespace operators { -namespace detail { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {} - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const std::string& out_var_name = "") override; - void ResetSparseVarRecorder(); - - private: - std::mutex mutex_sparse_vars_; - std::vector sparse_vars_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {} - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const std::string& out_var_name = "") override; -}; - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const std::string& out_var_name = "") override; -}; - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_client.cc b/paddle/fluid/operators/detail/rpc_client.cc deleted file mode 100644 index 9a791403e3d6b99c5d4de5183e83e1af655d7d4c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/rpc_client.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
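rpc_client.h below exposes one process-wide client through a templated GetInstance(), lazily constructed under std::call_once so the first requested transport (gRPC or BRPC) wins and later calls reuse it. A minimal standalone sketch of that singleton pattern, assuming made-up Client and GrpcLikeClient types; only the call_once and unique_ptr mechanics mirror the deleted code.

#include <cassert>
#include <iostream>
#include <memory>
#include <mutex>

class Client {
 public:
  virtual ~Client() = default;
  virtual void InitImpl() {}  // subclasses set up channels etc. here

  template <typename T>
  static Client* GetInstance() {
    std::call_once(init_flag_, &Client::Init<T>);  // runs at most once
    return instance_.get();
  }

 private:
  template <typename T>
  static void Init() {
    if (instance_ == nullptr) {
      instance_.reset(new T());
      instance_->InitImpl();
    }
  }
  static std::once_flag init_flag_;
  static std::unique_ptr<Client> instance_;
};

std::once_flag Client::init_flag_;          // mirrors RPCClient::init_flag_
std::unique_ptr<Client> Client::instance_;  // mirrors RPCClient::rpc_client_

class GrpcLikeClient : public Client {
 public:
  void InitImpl() override { std::cout << "client initialized once\n"; }
};

int main() {
  Client* a = Client::GetInstance<GrpcLikeClient>();
  Client* b = Client::GetInstance<GrpcLikeClient>();
  assert(a == b);  // every caller sees the same instance
  return 0;
}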
- -#include "paddle/fluid/operators/detail/rpc_client.h" - -namespace paddle { -namespace operators { -namespace detail { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr RPCClient::rpc_client_(nullptr); - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h deleted file mode 100644 index 47c6ffb4fd7a002fc0bd8053fb3314a2fbf18fd3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/rpc_client.h +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace operators { -namespace detail { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual bool AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = rpc_time_out) = 0; - - virtual bool AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = rpc_time_out) = 0; - - virtual bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = rpc_time_out) = 0; - - virtual void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = rpc_time_out) = 0; - - virtual void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = rpc_time_out) = 0; - - // SendComplete tells all the server that current trainer have no more data - // to train, so that the pserver can reduce it's barrier count, and continue - // to train with other trainers. - virtual void SendComplete() = 0; - - virtual void Wait() = 0; - - static constexpr int64_t rpc_time_out = 120 * 1000; - - template - static RPCClient* GetInstance() { - std::call_once(init_flag_, &RPCClient::Init); - return rpc_client_.get(); - } - - // Init is called by GetInstance. - template - static void Init() { - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - protected: - virtual void InitImpl() {} - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.cc b/paddle/fluid/operators/detail/rpc_server.cc deleted file mode 100644 index cd0fe96e2301ee3304fe9a2967df58b9f7072d8d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/rpc_server.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "paddle/fluid/operators/detail/rpc_server.h" - -namespace paddle { -namespace operators { -namespace detail { - -void RPCServer::ShutDown() { - LOG(INFO) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(4) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load()); - }); - - VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - if (b >= client_num_) { - lock.unlock(); - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::DecreaseClientNum() { - { - std::unique_lock lock(mutex_); - client_num_--; - } - barrier_cond_.notify_all(); -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler - << ", cond:" << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server.h b/paddle/fluid/operators/detail/rpc_server.h deleted file mode 100644 index 2e3342428cb56c34abaca655d5906668cda8f140..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/rpc_server.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include "paddle/fluid/operators/detail/request_handler.h" - -namespace paddle { -namespace operators { -namespace detail { - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 5); - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. - void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - void DecreaseClientNum(); - void ResetBarrierCounter(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; -}; - -}; // namespace detail -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/detail/rpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc deleted file mode 100644 index 463a7b80cfac280de5afe91ee85caaaf074cef32..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/rpc_server_test.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" -#include "paddle/fluid/operators/detail/rpc_client.h" -#include "paddle/fluid/operators/detail/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace detail = paddle::operators::detail; - -USE_OP(lookup_table); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); - framework::VariableNameMap output({{"Output", {"out"}}}); - auto op = block->AppendOp(); - op->SetType("lookup_table"); - op->SetInput("W", {"w"}); - op->SetInput("Ids", {"ids"}); - op->SetOutput("Out", {"out"}); - - auto& out = *root_block->Var("out"); - out.SetType(framework::proto::VarType::SELECTED_ROWS); - out.SetShape({10, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - auto rows = ids_var->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); - ids_var->mutable_value()->Resize({rows_numel, 1}); - ids_var->mutable_value()->mutable_data(*place); -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto rows = w->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer() { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto* block = AppendPrefetchBlcok(&program); - std::string in_var_name("ids"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - prefetch_var_name_to_prepared; - prefetch_var_name_to_prepared[in_var_name] = prepared[0]; - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(detail::kRequestPrefetch, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(PREFETCH, CPU) { - g_req_handler.reset(new detail::RequestPrefetchHandler(true)); - 
g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - detail::RPCClient* client = detail::RPCClient::GetInstance(); - - std::thread server_thread(StartServer); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - { - // create var on local scope - int64_t rows_numel = 5; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("ids"); - std::string out_var_name("out"); - - client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable()->value(); - auto ptr = value.mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); - } - } - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto deleted file mode 100644 index 54cb93e04d18b3784be187c9c8885bbccc55488b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/send_recv.proto +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -// option cc_generic_services = true; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} -} - -// VariableMessage is serialized paddle variable message. -// It can be: -// LoDTensor -// SelectedRows -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. 
original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc deleted file mode 100644 index 507b465435609a91ebca97dd70b176c3b79bee02..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" - -#ifdef PADDLE_WITH_CUDA -#include -#endif -#include -#include // NOLINT - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" -#include "paddle/fluid/operators/detail/proto_encoder_helper.h" -#include "paddle/fluid/operators/detail/variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace detail { - -using VarMsg = sendrecv::VariableMessage; - -void GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - void** payload, size_t* payload_size) { - auto tensor = var->Get(); - // FIXME(wuyi): data types in send_recv.proto is copied from - // framework.proto - request->set_data_type( - static_cast(framework::ToDataType(tensor.type()))); - for (auto& dim : framework::vectorize(tensor.dims())) { - request->add_dims(dim); - } - const framework::LoD lod = tensor.lod(); - if (lod.size() > 0) { - request->set_lod_level(lod.size()); - for (auto& each : lod) { - VarMsg::LodData* lod_inner = request->add_lod(); - for (auto& d : each) { - lod_inner->add_lod_data(d); - } - } - } - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); - platform::CUDAPinnedPlace cuda_pinned; - auto& gpu_dev_ctx = static_cast(ctx); - auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - *payload = memory::Alloc(cuda_pinned, copy_size); - - memory::Copy(cuda_pinned, *payload, - boost::get(tensor.place()), - reinterpret_cast(tensor.data()), copy_size, - gpu_dev_ctx.stream()); - ctx.Wait(); -#endif - } else { - *payload = tensor.data(); - } - *payload_size = tensor.numel() * framework::SizeOfType(tensor.type()); -} - -void GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - void** payload, size_t* payload_size) { - auto* slr = var->GetMutable(); - request->set_data_type( - static_cast(framework::ToDataType(slr->value().type()))); - 
request->set_lod_level(0); - request->set_slr_height(slr->height()); - - for (auto& dim : framework::vectorize(slr->value().dims())) { - request->add_dims(dim); - } - - auto* tensor = slr->mutable_value(); - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPinnedPlace cuda_pinned; - auto& gpu_dev_ctx = static_cast(ctx); - auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type()); - *payload = memory::Alloc(cuda_pinned, copy_size); - memory::Copy(cuda_pinned, *payload, - boost::get(tensor->place()), - reinterpret_cast(tensor->data()), copy_size, - gpu_dev_ctx.stream()); - ctx.Wait(); -#endif - } else { - *payload = slr->mutable_value()->data(); - } - *payload_size = tensor->numel() * framework::SizeOfType(tensor->type()); -} - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_name) { - // Default DestroyCallback does nothing, When using GPU - // the CPU buffer need to be freed. - DestroyCallback destroy_callback = [](void* backing) {}; - VarMsg request; - void* payload = nullptr; - size_t payload_size; - - request.set_varname(name); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - GetTensorPayload(var, ctx, &request, &payload, &payload_size); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); -#ifdef PADDLE_WITH_CUDA - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW("Serialize does not support type: %s", - typeid(var->Type()).name()); - } - - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - // GPU data is copied to CPU buffer when sending, - // free the buffer when possible. - destroy_callback = [](void* backing) { - platform::CUDAPinnedPlace cuda_pinned; - memory::Free(cuda_pinned, backing); - }; -#endif - } - - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. 
-#ifdef PADDLE_WITH_CUDA - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, - static_cast(payload)), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - - ::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var) { - operators::detail::VariableResponse resp(scope, &ctx); - PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); - *var = resp.GetVar(); -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h deleted file mode 100644 index bd16bf1dab8d933ffd18b6d6d9e3ce1c7d73029b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace detail { - -typedef void (*DestroyCallback)(void*); - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string()); - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var); - -inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { - switch (type) { - case sendrecv::VariableMessage::FP32: - return typeid(float); // NOLINT - case sendrecv::VariableMessage::FP64: - return typeid(double); // NOLINT - case sendrecv::VariableMessage::INT32: - return typeid(int); // NOLINT - case sendrecv::VariableMessage::INT64: - return typeid(int64_t); // NOLINT - case sendrecv::VariableMessage::BOOL: - return typeid(bool); // NOLINT - default: - PADDLE_THROW("Not support type %d", type); - } -} - -} // namespace detail -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc deleted file mode 100644 index 24cb91a3bb820a0e5d51aaa49154434919080f69..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/variable_response.cc +++ /dev/null @@ -1,485 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
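variable_response.cc below parses the VariableMessage wire format by hand, splitting each varint key into a field number (key >> 3) and a wire type (key & 0x7) before consuming the payload. A dependency-free sketch that decodes one length-delimited string field the same way; Reader and the sample bytes are illustrative, nothing here links against protobuf.

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

struct Reader {
  const std::vector<uint8_t>& buf;
  size_t pos;

  uint64_t ReadVarint() {
    uint64_t v = 0;
    int shift = 0;
    while (true) {
      uint8_t b = buf[pos++];
      v |= static_cast<uint64_t>(b & 0x7f) << shift;
      if ((b & 0x80) == 0) return v;
      shift += 7;
    }
  }
};

int main() {
  // Field number 1, wire type 2 (length-delimited): key = (1 << 3) | 2 = 0x0A,
  // followed by the length (7) and the raw bytes, which is the layout the
  // kVarnameFieldNumber branch of Parse handles.
  std::vector<uint8_t> msg = {0x0A, 0x07, 'w', '@', 'g', 'r', 'a', 'd', '0'};
  Reader r{msg, 0};

  uint64_t key = r.ReadVarint();
  int field_number = static_cast<int>(key >> 3);  // GetTagFieldNumber
  int wire_type = static_cast<int>(key & 0x7);    // GetTagWireType
  assert(field_number == 1 && wire_type == 2);

  uint64_t length = r.ReadVarint();
  std::string varname(msg.begin() + r.pos, msg.begin() + r.pos + length);
  assert(varname == "w@grad0");
  return 0;
}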
- -#include "paddle/fluid/operators/detail/variable_response.h" - -#include -#include -#include -#ifdef PADDLE_WITH_CUDA -#include -#endif -#include "paddle/fluid/platform/profiler.h" - -#include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace detail { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -bool ReadRaw(::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& dev_ctx, platform::Place place, - void* dest, int size) { - const void* data = NULL; - int size_to_write = 0; - int length = size; - int total_written = 0; - - if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA - auto& gpu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - - char* p = reinterpret_cast(dest); - while (total_written < length) { - if (!input->GetDirectBufferPointer(&data, &size_to_write)) { - return false; - } - // NOTE: if raw buffer is large and have two neighbor fields of raw - // buffers GetDirectBufferPointer can get all of them, use length to - // truncate it. - if (total_written + size_to_write > length) { - size_to_write = length - total_written; - } - memory::Copy(boost::get(place), - reinterpret_cast(p), cpu, data, size_to_write, - gpu_dev_ctx.stream()); - p += size_to_write; - total_written += size_to_write; - - input->Skip(size_to_write); - } - gpu_dev_ctx.Wait(); -#else - PADDLE_THROW("Unexpected branch"); -#endif - return true; - } - - char* p = reinterpret_cast(dest); - while (total_written < length) { - if (!input->GetDirectBufferPointer(&data, &size_to_write)) { - return false; - } - // NOTE: if raw buffer is large and have two neighbor fields of raw buffers - // GetDirectBufferPointer can get all of them, use length to truncate it. - if (total_written + size_to_write > length) { - size_to_write = length - total_written; - } - // TODO(gongwb): can we avoid copy? 
- platform::CPUPlace cpu; - memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); - - p += size_to_write; - total_written += size_to_write; - - input->Skip(size_to_write); - } - - return true; -} - -bool VariableResponse::CopyLodTensorData( - ::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& ctx, const framework::DDim& dims, - int length) { - auto* tensor = GetVar()->GetMutable(); - tensor->Resize(dims); - - framework::LoD lod; - for (int i = 0; i < meta_.lod_level(); ++i) { - framework::Vector v; - for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) { - v.push_back(meta_.lod(i).lod_data(j)); - } - lod.push_back(v); - } - tensor->set_lod(lod); - - void* tensor_data = - tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); - - if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { - return false; - } - - return true; -} - -inline framework::DDim GetDims( - const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) { - std::vector vecdims; - for (auto& d : dims) { - vecdims.push_back(d); - } - return framework::make_ddim(vecdims); -} - -bool VariableResponse::CopySelectRowsTensorData( - ::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& ctx, const framework::DDim& dims, - int length) { - auto* slr = GetVar()->GetMutable(); - slr->set_height(meta_.slr_height()); - auto* tensor = slr->mutable_value(); - tensor->Resize(dims); - PADDLE_ENFORCE_EQ( - static_cast(tensor->numel()), - length / framework::SizeOfType( - paddle::operators::detail::ToTypeIndex(meta_.data_type()))); - void* tensor_data = tensor->mutable_data( - ctx.GetPlace(), - paddle::operators::detail::ToTypeIndex(meta_.data_type())); - - if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { - return false; - } - - return true; -} - -bool VariableResponse::CopySelectRowsData( - ::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& ctx, int length) { - auto* slr = GetVar()->GetMutable(); - slr->mutable_rows()->resize(length / - framework::SizeOfType(typeid(int64_t))); // int64 - int64_t* rows_data = slr->mutable_rows()->data(); - - // copy rows CPU data, GPU data will be copied lazily. 
- platform::CPUPlace cpu; - if (!ReadRaw(input, ctx, cpu, rows_data, length)) { - return false; - } - - return true; -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -int VariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - 
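Editor's note: ParseLodData and the kDimsFieldNumber branch above both accept a repeated varint field in either of protobuf's two encodings: one varint per key (wire type 0) or a packed, length-delimited run of varints (wire type 2). A sketch of the packed decoding loop in isolation, mirroring the CurrentPosition()-based loop used above:

#include <cstdint>
#include <vector>
#include "google/protobuf/io/coded_stream.h"

// Reads a packed run of varints spanning `num_bytes` bytes of the stream.
bool ReadPackedVarints(::google::protobuf::io::CodedInputStream* input,
                       int num_bytes, std::vector<uint64_t>* out) {
  int start_pos = input->CurrentPosition();
  while (input->CurrentPosition() - start_pos < num_bytes) {
    uint64_t v;
    if (!input->ReadVarint64(&v)) return false;
    out->push_back(v);
  }
  return true;
}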
if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return false; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR || - meta_.type() == sendrecv::NCCL_ID) && - meta_.varname() != "", - "meta info should be got first!"); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (meta_.type() == sendrecv::NCCL_ID) { -#ifdef PADDLE_WITH_CUDA - auto* var = scope_->FindVar(meta_.varname()); - if (var != nullptr) { - ncclUniqueId* id = var->GetMutable(); - if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal, - num_bytes)) { - return tag; - } - } - break; -#else - PADDLE_THROW("Not compiled with CUDA!"); -#endif - } - - framework::DDim dims = GetDims(meta_.dims()); - if (meta_.type() == sendrecv::LOD_TENSOR) { - PADDLE_ENFORCE(meta_.lod_size() >= 0, - "lod info should be got first!"); - if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) { - return tag; - } - break; - } - - if (meta_.type() == sendrecv::SELECTED_ROWS) { - if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) { - return tag; - } - break; - } - - return tag; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - "meta info should be got first!"); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { - platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - // TODO(panyx0718): Should we allow to customize file dir. - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("/tmp/profile_ps_%lld", listener_id)); - } - break; - } - default: { - // Unknown tag, return unknown error. 
- return -1; - } - } - } - - return 0; -} - -}; // namespace detail -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h deleted file mode 100644 index 69cfd784f8dd4f129f50c6882061e53e8535b949..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detail/variable_response.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/detail/send_recv.pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/detail/bytebuffer_stream.h" - -namespace paddle { -namespace operators { -namespace detail { - -class VariableResponse { - public: - VariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { - if (create_scope) { - local_scope_ = &scope->NewScope(); - } - } - - virtual ~VariableResponse() { - if (create_scope_) { - scope_->DeleteScope(local_scope_); - } - } - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. - int Parse(Source* source); - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. - int Parse(const ::grpc::ByteBuffer& byte_buffer); - - const framework::Scope& GetLocalScope() const { return *local_scope_; } - - framework::Scope* GetMutableLocalScope() const { return local_scope_; } - - inline std::string Varname() const { return meta_.varname(); } - inline std::string OutVarname() const { return meta_.out_varname(); } - - // should call parse first. 
- framework::Variable* GetVar() { - if (create_scope_) { - return local_scope_->Var(meta_.varname()); - } - return scope_->FindVar(meta_.varname()); - } - - private: - bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& ctx, - const framework::DDim& dims, int length); - - bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& ctx, int length); - - bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input, - const platform::DeviceContext& ctx, - const framework::DDim& dims, int length); - - private: - const framework::Scope* scope_; - const platform::DeviceContext* dev_ctx_; - bool create_scope_ = false; - framework::Scope* local_scope_ = nullptr; - // only Skeleton - sendrecv::VariableMessage meta_; -}; - -}; // namespace detail -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 20d960f9fee1eae42b2241fb96c163e15db5e24d..6d296ff7bf14de9175dc589dfa8b46c534127ca1 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,6 +22,8 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +detection_library(anchor_generator_op SRCS anchor_generator_op.cc +anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc target_assign_op.cu) detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0c0155a0a977846b1300d93b4c3fef0e71fc1d26 --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
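Editor's note: before moving on to the new detection operators, the VariableResponse class whose declaration ends above exposes a small three-step API: construct against a scope and device context, Parse the incoming bytes, then fetch the materialized variable. A hedged usage sketch, assuming the caller already owns both the scope and the device context (the server code that normally supplies them is elided):

// Sketch only; error handling and ownership details are simplified.
void HandleIncoming(const ::grpc::ByteBuffer& msg,
                    const framework::Scope* scope,
                    const platform::DeviceContext* dev_ctx) {
  detail::VariableResponse resp(scope, dev_ctx, /*create_scope=*/true);
  // Parse returns 0 on success, -1 on an unknown error, or the number of the
  // field that failed to decode.
  if (resp.Parse(msg) != 0) {
    LOG(ERROR) << "failed to deserialize variable " << resp.Varname();
    return;
  }
  framework::Variable* var = resp.GetVar();  // lives in resp's temporary scope
  // ... hand `var` to the request handler before `resp` (and its scope) die ...
  (void)var;
}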
*/ + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace operators { + +class AnchorGeneratorOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of AnchorGeneratorOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Anchors"), + "Output(Anchors) of AnchorGeneratorOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Variances"), + "Output(Variances) of AnchorGeneratorOp should not be null."); + + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + auto anchor_sizes = ctx->Attrs().Get>("anchor_sizes"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + auto stride = ctx->Attrs().Get>("stride"); + auto variances = ctx->Attrs().Get>("variances"); + + size_t num_anchors = aspect_ratios.size() * anchor_sizes.size(); + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_anchors; + dim_vec[3] = 4; + ctx->SetOutputDim("Anchors", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature is a tensor with a rank of 4. " + "The layout is NCHW."); + AddOutput("Anchors", + "(Tensor, default Tensor), the output is a " + "tensor with a rank of 4. The layout is [H, W, num_anchors, 4]. " + "H is the height of input, W is the width of input, num_anchors " + "is the box count of each position. " + "Each anchor is in (xmin, ymin, xmax, ymax) format"); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances for " + "normalizing bbox regression targets. The layout is [H, W, " + "num_anchors, 4]. " + "H is the height of input, W is the width of input, num_anchors " + "is the box count of each position. " + "Each variance is in (xcenter, ycenter, w, h) format"); + + AddAttr>( + "anchor_sizes", + "(vector) List of Region Proposal Network(RPN) anchor sizes " + " given in absolute pixels e.g. (64, 128, 256, 512)." + " For instance, the anchor size of 64 means the area of this anchor " + "equals to 64**2.") + .AddCustomChecker([](const std::vector& anchor_sizes) { + PADDLE_ENFORCE_GT(anchor_sizes.size(), 0, + "Size of anchor_sizes must be at least 1."); + for (size_t i = 0; i < anchor_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0, + "anchor_sizes[%d] must be positive.", i); + } + }); + AddAttr>( + "aspect_ratios", + "(vector) List of Region Proposal Network(RPN) anchor aspect " + "ratios, e.g. (0.5, 1, 2)." 
+ "For instacne, the aspect ratio of 0.5 means the height / width of " + "this anchor equals 0.5."); + + AddAttr>("variances", + "(vector) List of variances to be used " + "in box regression deltas") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + + AddAttr>("stride", + "Anchors stride across width and height, " + "with a default of (16, 16)") + .SetDefault(std::vector(2, 16.0)) + .AddCustomChecker([](const std::vector& stride) { + PADDLE_ENFORCE_EQ( + stride.size(), 2, + "Must and only provide 2 stride for width and height."); + for (size_t i = 0; i < stride.size(); ++i) { + PADDLE_ENFORCE_GT(stride[i], 0.0, + "stride[%d] should be larger than 0.", i); + } + }); + + AddAttr("offset", + "(float) " + "Anchor center offset, with a default of 0.5") + .SetDefault(0.5); + AddComment(R"DOC( +AnchorGenerator operator +Generates anchors for Faster RCNN, FPN etc. algorithm. +Each position of the input produce N anchors, N = + size(anchor_sizes) * size(aspect_ratios). + +Please get more information from the following papers: +https://arxiv.org/abs/1506.01497. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(anchor_generator, ops::AnchorGeneratorOp, + ops::AnchorGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(anchor_generator, ops::AnchorGeneratorOpKernel, + ops::AnchorGeneratorOpKernel); diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3cc9bbeee1eeed17142a6b1bd23b45aff9cf745f --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, + const T* anchor_sizes, const int as_num, + const T* stride, const int sd_num, const int height, + const int width, const T offset) { + int num_anchors = as_num * ar_num; + int box_num = height * width * num_anchors; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num; + i += blockDim.x * gridDim.x) { + int h_idx = i / (num_anchors * width); + int w_idx = (i / num_anchors) % width; + T stride_width = stride[0]; + T stride_height = stride[1]; + T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1); + T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1); + T area, area_ratios; + T base_w, base_h; + T scale_w, scale_h; + T anchor_width, anchor_height; + int anch_idx = i % num_anchors; + int ar_idx = anch_idx / as_num; + int as_idx = anch_idx % as_num; + T aspect_ratio = aspect_ratios[ar_idx]; + T anchor_size = anchor_sizes[as_idx]; + area = stride_width * stride_height; + area_ratios = area / aspect_ratio; + base_w = round(sqrt(area_ratios)); + base_h = round(base_w * aspect_ratio); + scale_w = anchor_size / stride_width; + scale_h = anchor_size / stride_height; + anchor_width = scale_w * base_w; + anchor_height = scale_h * base_h; + + T xmin = (x_ctr - 0.5 * (anchor_width - 1)); + T ymin = (y_ctr - 0.5 * (anchor_height - 1)); + T xmax = (x_ctr + 0.5 * (anchor_width - 1)); + T ymax = (y_ctr + 0.5 * (anchor_height - 1)); + out[i * 4] = xmin; + out[i * 4 + 1] = ymin; + out[i * 4 + 2] = xmax; + out[i * 4 + 3] = ymax; + } +} + +template +__global__ void SetVariance(T* out, const T* var, const int vnum, + const int num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + out[i] = var[i % vnum]; + } +} + +template +class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* anchors = ctx.Output("Anchors"); + auto* vars = ctx.Output("Variances"); + + auto anchor_sizes = ctx.Attr>("anchor_sizes"); + auto aspect_ratios = ctx.Attr>("aspect_ratios"); + auto stride = ctx.Attr>("stride"); + auto variances = ctx.Attr>("variances"); + + T offset = static_cast(ctx.Attr("offset")); + + auto width = input->dims()[3]; + auto height = input->dims()[2]; + + int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + + int box_num = width * height * num_anchors; + + int block = 512; + int grid = (box_num + block - 1) / block; + + auto stream = + ctx.template device_context().stream(); + + anchors->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor ar; + framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar); + + framework::Tensor as; + framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as); + + framework::Tensor sd; + framework::TensorFromVector(stride, ctx.device_context(), &sd); + + GenAnchors<<>>( + anchors->data(), ar.data(), aspect_ratios.size(), as.data(), + anchor_sizes.size(), sd.data(), stride.size(), height, width, + offset); + + framework::Tensor v; + framework::TensorFromVector(variances, ctx.device_context(), &v); + grid = (box_num * 4 + block - 1) / block; + SetVariance<<>>(vars->data(), v.data(), + variances.size(), box_num * 4); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + 
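Editor's note: both GenAnchors and SetVariance above use the grid-stride loop idiom, so the launch configuration (block = 512, grid = ceil(box_num / block)) does not have to cover every element exactly; any leftover threads simply fall out of the loop. A minimal self-contained sketch of the same pattern:

// Grid-stride loop: each thread handles indices i, i + stride, i + 2*stride, ...
__global__ void FillConstant(float* out, int n, float value) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = value;
  }
}

// Launch, mirroring the block/grid arithmetic used for GenAnchors:
//   int block = 512;
//   int grid = (n + block - 1) / block;
//   FillConstant<<<grid, block, 0, stream>>>(dev_ptr, n, 0.f);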
+namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(anchor_generator, + ops::AnchorGeneratorOpCUDAKernel, + ops::AnchorGeneratorOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e0e499d76a19ba5f6b91ba4c8797684fb53c7caa --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +class AnchorGeneratorOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* anchors = ctx.Output("Anchors"); + auto* vars = ctx.Output("Variances"); + + auto anchor_sizes = ctx.Attr>("anchor_sizes"); + auto aspect_ratios = ctx.Attr>("aspect_ratios"); + auto stride = ctx.Attr>("stride"); + auto variances = ctx.Attr>("variances"); + + T offset = static_cast(ctx.Attr("offset")); + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T stride_width, stride_height; + stride_width = stride[0]; + stride_height = stride[1]; + + int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + + anchors->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + auto e_anchors = framework::EigenTensor::From(*anchors); + for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + for (int w_idx = 0; w_idx < feature_width; ++w_idx) { + T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1); + T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1); + T area, area_ratios; + T base_w, base_h; + T scale_w, scale_h; + T anchor_width, anchor_height; + int idx = 0; + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + auto ar = aspect_ratios[r]; + for (size_t s = 0; s < anchor_sizes.size(); ++s) { + auto anchor_size = anchor_sizes[s]; + area = stride_width * stride_height; + area_ratios = area / ar; + base_w = round(sqrt(area_ratios)); + base_h = round(base_w * ar); + scale_w = anchor_size / stride_width; + scale_h = anchor_size / stride_height; + anchor_width = scale_w * base_w; + anchor_height = scale_h * base_h; + e_anchors(h_idx, w_idx, idx, 0) = + (x_ctr - 0.5 * (anchor_width - 1)); + e_anchors(h_idx, w_idx, idx, 1) = + (y_ctr - 0.5 * (anchor_height - 1)); + e_anchors(h_idx, w_idx, idx, 2) = + (x_ctr + 0.5 * (anchor_width - 1)); + e_anchors(h_idx, w_idx, idx, 3) = + (y_ctr + 0.5 * (anchor_height - 1)); + idx++; + } + } + } + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + for (size_t i = 0; i < 
variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int anchor_num = feature_height * feature_width * num_anchors; + auto var_dim = vars->dims(); + vars->Resize({anchor_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + e_vars = var_et.broadcast(Eigen::DSizes(anchor_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index d437ad5c19828331c749244404ba80d0f3acda2a..c23b65fe4dead3ca01a447d03877e3359b19e656 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -51,6 +51,12 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { } }; +template +bool DistPairDescend(std::tuple pair1, + std::tuple pair2) { + return std::get<2>(pair1) > std::get<2>(pair2); +} + template class BipartiteMatchKernel : public framework::OpKernel { public: @@ -58,46 +64,76 @@ class BipartiteMatchKernel : public framework::OpKernel { // The match_dist must be initialized to 0 at first. void BipartiteMatch(const Tensor& dist, int* match_indices, T* match_dist) const { - constexpr T kEPS = static_cast(1e-6); PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); int64_t row = dist.dims()[0]; int64_t col = dist.dims()[1]; auto* dist_data = dist.data(); - std::vector row_pool; - for (int i = 0; i < row; ++i) { - row_pool.push_back(i); - } - while (row_pool.size() > 0) { - int max_idx = -1; - int max_row_idx = -1; - T max_dist = -1; - for (int64_t j = 0; j < col; ++j) { - if (match_indices[j] != -1) { - continue; + // Test result: When row==130 the speed of these two methods almost the same + if (row >= 130) { + std::vector> match_pair; + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j])); } - for (size_t k = 0; k < row_pool.size(); ++k) { - int m = row_pool[k]; - // distance is 0 between m-th row and j-th column - if (dist_data[m * col + j] < kEPS) { + } + std::sort(match_pair.begin(), match_pair.end(), DistPairDescend); + std::vector row_indices(row, -1); + + int64_t idx = 0; + for (int64_t k = 0; k < row * col; ++k) { + int64_t i = std::get<0>(match_pair[k]); + int64_t j = std::get<1>(match_pair[k]); + T dist = std::get<2>(match_pair[k]); + + if (idx >= row) { + break; + } + if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) { + match_indices[j] = i; + row_indices[i] = j; + match_dist[j] = dist; + idx += 1; + } + } + } else { + constexpr T kEPS = static_cast(1e-6); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { continue; } - if (dist_data[m * col + j] > max_dist) { - max_idx = j; - max_row_idx = m; - max_dist = dist_data[m * col + j]; + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } } } - } - if (max_idx == -1) { - // Cannot find good match. 
- break; - } else { - PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); - match_indices[max_idx] = max_row_idx; - match_dist[max_idx] = max_dist; - // Erase the row index. - row_pool.erase( - std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } } } } diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 8c4b4321b7582a5cfad89f23e3d298ed16162d99..d0f95f727fdbc82777147e3e8ada6ad4f7a35e60 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { "and M represents the number of deocded boxes."); AddComment(R"DOC( -Bounding Box Coder Operator. + +Bounding Box Coder. + Encode/Decode the target bounding box with the priorbox information. + The Encoding schema described below: -ox = (tx - px) / pw / pxv -oy = (ty - py) / ph / pyv -ow = log(abs(tw / pw)) / pwv -oh = log(abs(th / ph)) / phv + + ox = (tx - px) / pw / pxv + + oy = (ty - py) / ph / pyv + + ow = log(abs(tw / pw)) / pwv + + oh = log(abs(th / ph)) / phv + The Decoding schema described below: -ox = (pw * pxv * tx * + px) - tw / 2 -oy = (ph * pyv * ty * + py) - th / 2 -ow = exp(pwv * tw) * pw + tw / 2 -oh = exp(phv * th) * ph + th / 2 -where tx, ty, tw, th denote the target box's center coordinates, width and -height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor) -center coordinates, width and height. pxv, pyv, pwv, phv denote the variance -of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates, -width and height. + + ox = (pw * pxv * tx * + px) - tw / 2 + + oy = (ph * pyv * ty * + py) - th / 2 + + ow = exp(pwv * tw) * pw + tw / 2 + + oh = exp(phv * th) * ph + th / 2 + +where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the +encoded/decoded coordinates, width and height. )DOC"); } }; diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index 8e58605fcea04f9ffa97ce8cca53c073e7068aaf..9c89b7ca9af1b235659554afc805600d31ef8ea6 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -68,15 +68,16 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { "representing pairwise iou scores."); AddComment(R"DOC( -IOU Similarity Operator. +**IOU Similarity Operator** + Computes intersection-over-union (IOU) between two box lists. - Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, - boxes in 'Y' are shared by all instance of the batched inputs of X. - Given two boxes A and B, the calculation of IOU is as follows: +Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, +boxes in 'Y' are shared by all instance of the batched inputs of X. 
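Editor's note: the encode/decode schema in the rewritten box coder DOC above is plain arithmetic on box centers and sizes. A standalone sketch of the encoding step, with t* the target box center/size, p* the prior (anchor) center/size, and p*v the prior variances:

#include <cmath>

// Encodes a target box against a prior box, following the DOC formulas above.
void EncodeBox(float tx, float ty, float tw, float th,
               float px, float py, float pw, float ph,
               float pxv, float pyv, float pwv, float phv,
               float out[4]) {
  out[0] = (tx - px) / pw / pxv;                // ox
  out[1] = (ty - py) / ph / pyv;                // oy
  out[2] = std::log(std::fabs(tw / pw)) / pwv;  // ow
  out[3] = std::log(std::fabs(th / ph)) / phv;  // oh
}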
+Given two boxes A and B, the calculation of IOU is as follows: $$ IOU(A, B) = -\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)} +\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} $$ )DOC"); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index 335e8dd470f851d8c5f6bdbc94cfc343da269034..568d50d457d838d5f11605710c0d3b987af01d10 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -83,11 +83,13 @@ class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( PolygonBoxTransform Operator. + +PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate. + The input is the final geometry output in detection network. We use 2*n numbers to denote the coordinate shift from n corner vertices of the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi), the geometry output contains 2*n channels. -PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate. )DOC"); } }; diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 716c8625d35308f98582e6802e90d99d643e188b..d7f49a9590e4ef4ca4d2ad5a92572c70e6bfb6ac 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -175,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Detection mAP evaluate operator. The general steps are as follows. First, calculate the true positive and - false positive according to the input of detection and labels, then - calculate the mAP evaluate value. - Supporting '11 point' and 'integral' mAP algorithm. Please get more information - from the following articles: - https://sanchom.wordpress.com/tag/average-precision/ - https://arxiv.org/abs/1512.02325 +false positive according to the input of detection and labels, then +calculate the mAP evaluate value. +Supporting '11 point' and 'integral' mAP algorithm. 
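Editor's note: the IOU formula in the iou_similarity DOC above reduces to an intersection-area computation followed by the inclusion-exclusion denominator. A standalone sketch for two axis-aligned boxes given as (xmin, ymin, xmax, ymax):

#include <algorithm>

float IoU(const float a[4], const float b[4]) {
  float iw = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
  float ih = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
  float inter = iw * ih;
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  float uni = area_a + area_b - inter;
  return uni > 0.f ? inter / uni : 0.f;
}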
Please get more information +from the following articles: +https://sanchom.wordpress.com/tag/average-precision/ +https://arxiv.org/abs/1512.02325 )DOC"); } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..675ca36774beb72cc1e9b136ad0b18ce061689ac --- /dev/null +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -0,0 +1,33 @@ +if(WITH_GRPC) + grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor + selected_rows memory) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc + grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + proto_desc lookup_table_op SERIAL) + return() +endif() + + +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) + +find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) + + +find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) + +cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc + brpc protobuf leveldb gflags glog + protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..b394c678fb6503eb73a1e11e6feb814251e9e940 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace operators { +namespace distributed { + +DEFINE_int32(brpc_channel_num, 24, + "Number of channels to send requests connected to one server"); +DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); + +BRPCClient::~BRPCClient() { Wait(); } + +void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; +} + +bool BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + google::protobuf::Closure* done = + brpc::NewCallback(&HandleSendResponse, cntl, response); + + sendrecv::VariableMessage request; + ch_ctx->stub->SendVariable(cntl, &request, response, done); + }); + req_count_++; + + return true; +} + +void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. 
+ std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; + + // framework::Variable* outvar = nullptr; + // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); +} + +bool BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + + req_count_++; + + return true; +} + +bool BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] {}); + + req_count_++; + return true; +} + +void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + } + + ChannelQueuePtr q(new framework::BlockingQueue()); + + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "pooled"; + options.connect_timeout_ms = 100; + options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; + options.max_retry = FLAGS_max_retry; + for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + std::shared_ptr c(new ChannelContext()); + if (c->channel.Init(ep.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return nullptr; + } + + c->stub.reset(new sendrecv::SendRecvService_Stub( + static_cast(&c->channel))); + q->Push(c); + } + + { + std::lock_guard guard(chan_mutex_); + channels_[ep] = q; + } + + return q; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..8ff1f0a6076b3574c42065edcbac50eb75b3b483 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace distributed { + +struct ChannelContext { + brpc::Channel channel; + std::shared_ptr stub; +}; + +typedef std::shared_ptr ChannelContextPtr; +typedef std::shared_ptr> + ChannelQueuePtr; + +class BRPCClient : public RPCClient { + public: + BRPCClient() {} + virtual ~BRPCClient(); + + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void Wait() override; + + private: + void Proceed(); + ChannelQueuePtr GetChannel(const std::string& ep); + + private: + std::unordered_map channels_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(BRPCClient); +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..862167f02084cfe81db1c0936bbfb0415fa85721 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
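Editor's note: the BRPC client declared above follows a fire-and-forget pattern: each Async* call increments req_count_ and schedules the actual RPC on framework::AsyncIO, and Wait() blocks until the counter drains back to zero. A hedged usage sketch, assuming the caller already has an endpoint, a device context, and a scope containing the variable (the variable name below is hypothetical):

// Sketch only; client construction and error handling are elided.
void PushAndFence(distributed::BRPCClient* client, const std::string& ep,
                  const platform::DeviceContext& ctx,
                  const framework::Scope& scope) {
  client->AsyncSendVar(ep, ctx, scope, "w@GRAD");  // hypothetical variable name
  client->AsyncSendBatchBarrier(ep);               // signal the end of a batch
  client->Wait();                                  // block until req_count_ == 0
}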
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +namespace sendrecv { + +typedef std::unordered_map + HandlerMap; + +class BRPCServiceImpl : public SendRecvService { + public: + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) + : request_send_h_(nullptr), + request_get_h_(nullptr), + request_prefetch_h_(nullptr) { + auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + if (it != rpc_call_map.end()) { + request_send_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + if (it != rpc_call_map.end()) { + request_get_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); + if (it != rpc_call_map.end()) { + request_prefetch_h_ = it->second; + } + } + + virtual ~BRPCServiceImpl() {} + + void SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_send_h_ != nullptr, + "RequestSend handler should be registed first!"); + brpc::ClosureGuard done_guard(done); + + paddle::framework::Scope* local_scope = request_send_h_->scope(); + paddle::framework::Variable* outvar = nullptr; + paddle::framework::Variable* invar = nullptr; + + std::string varname = request->varname(); + + if (!request_send_h_->sync_mode()) { + local_scope = &request_send_h_->scope()->NewScope(); + invar = local_scope->Var(varname); + } else { + invar = local_scope->FindVar(varname); + } + + request_send_h_->Handle(varname, local_scope, invar, &outvar); + + if (!request_send_h_->sync_mode()) { + request_send_h_->scope()->DeleteScope(local_scope); + } + } + + void GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_get_h_ != nullptr, + "RequestGet handler should be registed first!"); + } + + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_prefetch_h_ != nullptr, + "kRequestPrefetch handler should be registed first!"); + } + + private: + paddle::operators::distributed::RequestHandler* request_send_h_; + paddle::operators::distributed::RequestHandler* request_get_h_; + paddle::operators::distributed::RequestHandler* request_prefetch_h_; +}; +} // namespace sendrecv + +namespace paddle { +namespace operators { +namespace distributed { + +void AsyncBRPCServer::StartServer() { + // Instance of your service. + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + + // Add the service into server. Notice the second parameter, because the + // service is put on stack, we don't want server to delete it, otherwise + // use brpc::SERVER_OWNS_SERVICE. 
+ if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(FATAL) << "Fail to add service"; + return; + } + + brpc::ServerOptions options; + options.idle_timeout_sec = idle_timeout_s_; + options.max_concurrency = max_concurrency_; + if (server_.Start(bind_address_.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to start EchoServer" << bind_address_; + return; + } + + butil::EndPoint ep = server_.listen_address(); + selected_port_ = ep.port; + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + server_.Join(); +} + +void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } + +void AsyncBRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..85a7ad0dfe843dad483d43631b69a79d75211ce9 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_server.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include // NOLINT +#include + +#include "brpc/server.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class AsyncBRPCServer final : public RPCServer { + public: + explicit AsyncBRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncBRPCServer() {} + void StartServer() override; + void WaitServerReady() override; + + private: + void ShutDownImpl() override; + + brpc::Server server_; + + static constexpr int idle_timeout_s_ = -1; + static constexpr int max_concurrency_ = 0; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/bytebuffer_stream.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e91b447db838c9095432eda22e9e1171e938d31 --- /dev/null +++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
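Editor's note: StartServer and WaitServerReady above coordinate through a one-shot flag guarded by mutex_ready_ and condition_ready_: the serving thread sets ready_ = 1 once brpc reports the bound port, and any waiter blocks until then. A standalone sketch of that handshake:

#include <condition_variable>
#include <mutex>
#include <thread>

struct ReadyFlag {
  std::mutex mu;
  std::condition_variable cv;
  int ready = 0;

  void Signal() {
    { std::lock_guard<std::mutex> lock(mu); ready = 1; }
    cv.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lock(mu);
    cv.wait(lock, [this] { return ready == 1; });
  }
};

int main() {
  ReadyFlag flag;
  std::thread server([&] { /* ... bind and start serving ... */ flag.Signal(); });
  flag.Wait();  // corresponds to AsyncBRPCServer::WaitServerReady()
  server.join();
  return 0;
}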
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace distributed { + +GrpcByteBufferSource::GrpcByteBufferSource() {} + +bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { + cur_ = -1; + left_ = 0; + ptr_ = nullptr; + byte_count_ = 0; + bool ok = src.Dump(&slices_).ok(); + if (!ok) { + slices_.clear(); + } + return ok; +} + +bool GrpcByteBufferSource::Next(const void** data, int* size) { + // Use loop instead of if in case buffer contained empty slices. + while (left_ == 0) { + // Advance to next slice. + cur_++; + if (cur_ >= slices_.size()) { + return false; + } + const ::grpc::Slice& s = slices_[cur_]; + left_ = s.size(); + ptr_ = reinterpret_cast(s.begin()); + } + + *data = ptr_; + *size = left_; + byte_count_ += left_; + ptr_ += left_; + left_ = 0; + return true; +} + +void GrpcByteBufferSource::BackUp(int count) { + ptr_ -= count; + left_ += count; + byte_count_ -= count; +} + +bool GrpcByteBufferSource::Skip(int count) { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; +} + +google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { + return byte_count_; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.h b/paddle/fluid/operators/distributed/bytebuffer_stream.h new file mode 100644 index 0000000000000000000000000000000000000000..e7de172c79c30761483b5d96f5bad19860208832 --- /dev/null +++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h @@ -0,0 +1,188 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. 
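Editor's note: GrpcByteBufferSource above implements protobuf's ZeroCopyInputStream over the slices of a grpc::ByteBuffer, which is what lets VariableResponse::Parse wrap the incoming message in a CodedInputStream without flattening it first. A hedged sketch of that wiring, mirroring the Parse(const ::grpc::ByteBuffer&) overload shown earlier in this diff:

// Sketch only: assumes `msg` already holds a serialized VariableMessage.
int FirstFieldNumber(const ::grpc::ByteBuffer& msg) {
  distributed::GrpcByteBufferSource source;
  if (!source.Init(msg)) return -1;  // dump slices, reset the read cursor
  distributed::GrpcByteBufferSourceWrapper wrapper(&source);
  ::google::protobuf::io::CodedInputStream input(wrapper.contents());
  uint32_t key = 0;
  if (!input.ReadVarint32(&key)) return -1;
  return static_cast<int>(key >> 3);  // e.g. the varname field of the message
}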
+ +#pragma once + +#include + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "grpc++/grpc++.h" + +namespace grpc { +// A ZeroCopyInputStream that reads from grpc_byte_buffer +class GrpcBufferReader final + : public ::google::protobuf::io::ZeroCopyInputStream { + typedef void (CoreCodegenInterface::*OldReaderInitAPI)( + grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); + typedef int (CoreCodegenInterface::*NewReaderInitAPI)( + grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); + void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, + grpc_byte_buffer* buffer) { + (g_core_codegen_interface->*ptr)(reader, buffer); + } + void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, + grpc_byte_buffer* buffer) { + int result = (g_core_codegen_interface->*ptr)(reader, buffer); + (void)result; + } + + public: + explicit GrpcBufferReader(grpc_byte_buffer* buffer) + : byte_count_(0), backup_count_(0) { + ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, + buffer); + } + ~GrpcBufferReader() override { + g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); + } + + bool Next(const void** data, int* size) override { + if (backup_count_ > 0) { + *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - + backup_count_; + GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); + *size = static_cast(backup_count_); + backup_count_ = 0; + return true; + } + if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, + &slice_)) { + return false; + } + g_core_codegen_interface->grpc_slice_unref(slice_); + *data = GRPC_SLICE_START_PTR(slice_); + // On win x64, int is only 32bit + GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); + byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); + return true; + } + + void BackUp(int count) override { backup_count_ = count; } + + bool Skip(int count) override { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; + } + + ::google::protobuf::int64 ByteCount() const override { + return byte_count_ - backup_count_; + } + + private: + int64_t byte_count_; + int64_t backup_count_; + grpc_byte_buffer_reader reader_; + grpc_slice slice_; +}; + +}; // namespace grpc + +namespace paddle { +namespace operators { +namespace distributed { +// Source provides a way for a particular RPC implementation to provide +// received data to ParseFrom. +class Source { + public: + virtual ~Source() {} + + // Return the stream that contains the data to be parsed. + // Note that this method might be invoked more than once if + // ParseFrom needs to fall back to a more expensive parsing method. + // Every call must return a stream pointing at the beginning of + // the serialized RecvTensorResponse. + // + // Note that a subsequent call to contents() invalidates previous + // results of contents(). + // + // Ownership of the returned stream is retained by the Source and + // should not be deleted by the caller. + virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; +}; + +// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. +class GrpcByteBufferSource + : public ::google::protobuf::io::ZeroCopyInputStream { + public: + GrpcByteBufferSource(); + bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. 
+ bool Next(const void** data, int* size) override; + void BackUp(int count) override; + bool Skip(int count) override; + ::google::protobuf::int64 ByteCount() const override; + + private: + std::vector<::grpc::Slice> slices_; + size_t cur_; // Current slice index. + int left_; // Number of bytes in slices_[cur_] left to yield. + const char* ptr_; // Address of next byte in slices_[cur_] to yield. + ::google::protobuf::int64 byte_count_; +}; + +class GrpcByteBufferSourceWrapper : public Source { + public: + explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) + : source_(source) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return source_; + } + + private: + GrpcByteBufferSource* source_; +}; + +class GrpcByteSource : public Source { + public: + explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} + ~GrpcByteSource() override { DeleteStream(); } + + typedef ::grpc::GrpcBufferReader Reader; + + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + DeleteStream(); + stream_ = new (&space_) Reader(buffer_); + return stream_; + } + + private: + void DeleteStream() { + if (stream_) { + stream_->~Reader(); + } + } + + grpc_byte_buffer* buffer_; // Not owned + Reader* stream_ = nullptr; // Points into space_ if non-nullptr + char space_[sizeof(Reader)]; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a09f3870d64d8e14b2db41ff3ea7c2f9e67b558 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -0,0 +1,333 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
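GrpcByteSource avoids a heap allocation per request by constructing its GrpcBufferReader with placement new inside a char array sized for it and tearing it down with an explicit destructor call; each contents() call rebuilds the reader so parsing can restart at the beginning of the buffer. The pattern in isolation, as a hedged sketch with a stand-in Reader type (not part of this patch):

// Minimal illustration of the placement-new / explicit-destructor idiom
// used by GrpcByteSource; Reader stands in for GrpcBufferReader.
#include <new>

struct Reader {
  explicit Reader(int* src) : src_(src) {}
  int* src_;
};

class InlineHolder {
 public:
  ~InlineHolder() { Destroy(); }

  Reader* Reset(int* src) {
    Destroy();                            // drop the previous instance, if any
    stream_ = new (&space_) Reader(src);  // construct in preallocated storage
    return stream_;
  }

 private:
  void Destroy() {
    if (stream_ != nullptr) {
      stream_->~Reader();  // explicit destructor call, no delete
      stream_ = nullptr;
    }
  }

  Reader* stream_ = nullptr;
  alignas(Reader) char space_[sizeof(Reader)];
};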
*/ + +#include "paddle/fluid/operators/distributed/grpc_client.h" + +#include + +#include + +#include "glog/logging.h" // For VLOG +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void GRPCClient::InitImpl() { InitEventLoop(); } + +void GRPCClient::InitEventLoop() { + // start the client process thread + // TODO(wuyi): can make this in a threadpool + client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); +} + +void GRPCClient::SendBeginPass() { + for (auto& it : channels_) { + VLOG(3) << "send begin pass to: " << it.first; + this->AsyncSendBeginPass(it.first); + } + this->Wait(); +} + +void GRPCClient::SendEndPass() { + for (auto& it : channels_) { + VLOG(3) << "send end pass to " << it.first; + this->AsyncSendEndPass(it.first); + } + this->Wait(); +} + +GRPCClient::~GRPCClient() { + Wait(); + cq_.Shutdown(); + { + std::lock_guard guard(chan_mutex_); + for (auto& it : channels_) { + it.second.reset(); + } + } + client_thread_->join(); +} + +bool GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, + this] { + auto* var = p_scope->FindVar(var_name_val); + + ::grpc::ByteBuffer req; + SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + var_h.method = "Send"; + + VLOG(3) << var_h.String() << " begin"; + + // stub context + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = nullptr; + + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + }); + req_count_++; + + return true; +} + +void ProcGetResponse(const VarHandle& var_h, + const ::grpc::ByteBuffer& ret_msg) { + framework::Variable* outvar = nullptr; + DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); +} + +template +void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { + ::grpc::Slice slice(proto.ByteSizeLong()); + proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); + ::grpc::ByteBuffer tmp(&slice, 1); + result->Swap(&tmp); +} + +bool GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, + this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); + + // var handle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + 
var_h.method = "Get"; + + VLOG(3) << var_h.String() << " begin"; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + }); + + req_count_++; + + return true; +} + +bool GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { + auto* var = p_scope->FindVar(in_var_name_val); + + ::grpc::ByteBuffer req; + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); + + // var handle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = out_var_name_val; + var_h.ctx = p_ctx; + var_h.method = "Prefetch"; + + VLOG(3) << var_h.String() << " begin"; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, static_cast(s)); + }); + + req_count_++; + return true; +} + +void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(BATCH_BARRIER_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + +void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + const auto ch = GetChannel(ep); + FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + +void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(BEGIN_PASS_MESSAGE); + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + +void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) { + const auto ch = GetChannel(ep); + + FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(END_PASS_MESSAGE); + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + +void 
GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + const auto ch = GetChannel(ep); + + CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); + s->Prepare(time_out); + + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; +} + +void GRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +void GRPCClient::Proceed() { + void* tag = nullptr; + bool ok = false; + + while (cq_.Next(&tag, &ok)) { + BaseProcessor* c = static_cast(tag); + GPR_ASSERT(ok); + PADDLE_ENFORCE(c); + if (c->status_.ok()) { + VLOG(3) << c->var_h_.String() << " process"; + c->Process(); + } else { + LOG(FATAL) << c->var_h_.String() + << " meets grpc error:" << c->status_.error_message(); + } + delete c; + { + std::lock_guard lk(sync_mutex_); + req_count_--; + } + sync_cond_.notify_all(); + } +} + +std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + // Channel configurations: + grpc::ChannelArguments args; + args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); + args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); + args.SetMaxSendMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + + auto ch = + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); + channels_[ep] = ch; + return ch; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..5dae20155edcf9edd746a5d9a9bbe0ccd789f431 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -0,0 +1,258 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include // NOLINT +#include // NOLINT +#include +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "grpc++/channel.h" +#include "grpc++/generic/generic_stub.h" +#include "grpc++/grpc++.h" +#include "grpc++/support/byte_buffer.h" +#include "grpc++/support/slice.h" +#include "grpc/support/log.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace distributed { + +struct VarHandle { + // RPC endpoint. + std::string ep; + const platform::DeviceContext* ctx; + const framework::Scope* scope; + // Variable name. + std::string name; + // RPC method name. + std::string method; + + std::string String() const { + std::ostringstream s; + s << method << " name:[" << name << "], ep:[" << ep << "]"; + return s.str(); + } +}; + +void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); + +class BaseProcessor { + public: + explicit BaseProcessor(std::shared_ptr ch) { + context_ = nullptr; + } + + virtual ~BaseProcessor() {} + + virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + context_.reset(new grpc::ClientContext()); + var_h_ = var_info; + context_->set_wait_for_ready(true); + if (time_out) { + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + + std::chrono::milliseconds(time_out); + context_->set_deadline(deadline); + } + } + + virtual void Prepare(int64_t time_out) { + context_.reset(new grpc::ClientContext()); + context_->set_wait_for_ready(true); + + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); + + context_->set_deadline(deadline); + } + + virtual void Process() = 0; + + std::unique_ptr context_; + grpc::Status status_; + VarHandle var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public BaseProcessor { + public: + explicit SendProcessor(std::shared_ptr ch) + : BaseProcessor(ch), stub_g_(ch) {} + + virtual ~SendProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + ::grpc::GenericStub stub_g_; + ::grpc::ByteBuffer reply_; + RequestSendCallBack response_call_back_ = nullptr; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public BaseProcessor { + public: + explicit GetProcessor(std::shared_ptr ch) + : BaseProcessor(ch), stub_g_(ch) {} + + virtual ~GetProcessor() {} + + virtual void Process() { + if (response_call_back_) { + response_call_back_(var_h_, reply_); + } + } + + ::grpc::ByteBuffer reply_; + ::grpc::GenericStub stub_g_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; + +class BatchBarrierProcessor : public BaseProcessor { + public: + explicit BatchBarrierProcessor(std::shared_ptr ch) + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~BatchBarrierProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; + std::unique_ptr stub_; +}; + +class FetchBarrierProcessor : public BaseProcessor { + public: + explicit 
FetchBarrierProcessor(std::shared_ptr ch) + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~FetchBarrierProcessor() {} + + virtual void Process() {} + sendrecv::VariableMessage reply_; + std::unique_ptr stub_; +}; + +class CheckpointNotifyProcessor : public BaseProcessor { + public: + explicit CheckpointNotifyProcessor(std::shared_ptr ch) + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~CheckpointNotifyProcessor() {} + + virtual void Process() {} + sendrecv::VoidMessage reply_; + std::unique_ptr stub_; +}; + +class GRPCClient : public RPCClient { + public: + GRPCClient() {} + virtual ~GRPCClient(); + + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendBeginPass(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void AsyncSendEndPass(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) override; + + void Wait() override; + + void SendBeginPass() override; + + void SendEndPass() override; + + protected: + void InitImpl() override; + + private: + // InitEventLoop should only be called by Init() + void InitEventLoop(); + + void Proceed(); + + std::shared_ptr GetChannel(const std::string& ep); + + private: + grpc::CompletionQueue cq_; + std::unordered_map> channels_; + std::unique_ptr client_thread_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(GRPCClient); +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d107b533bcb7bfef3f9b13ec99afbd579a62e52 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
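Every Async* call on GRPCClient only enqueues work: variable payloads are serialized on the AsyncIO thread pool, req_count_ is bumped, and Proceed() later drains the completion queue, so callers must pair a burst of async calls with Wait(). A sketch of the intended calling pattern; RPCClient::GetInstance is assumed from rpc_client.h (added elsewhere in this PR), and the endpoints and variable names are purely illustrative:

#include <string>
#include <vector>

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/distributed/grpc_client.h"
#include "paddle/fluid/platform/device_context.h"

namespace dist = paddle::operators::distributed;

// Hedged sketch of the per-batch send / barrier / fetch cycle of a trainer.
void SendAndFetch(const std::vector<std::string>& eps,
                  const paddle::platform::DeviceContext& ctx,
                  const paddle::framework::Scope& scope) {
  dist::RPCClient* client = dist::RPCClient::GetInstance<dist::GRPCClient>();
  for (const auto& ep : eps) {
    client->AsyncSendVar(ep, ctx, scope, "w@GRAD");  // returns immediately
  }
  for (const auto& ep : eps) {
    client->AsyncSendBatchBarrier(ep);  // tells each pserver the batch is done
  }
  client->Wait();  // blocks until req_count_ drops back to zero
  for (const auto& ep : eps) {
    client->AsyncGetVar(ep, ctx, scope, "w");
  }
  client->Wait();
}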
*/ + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + int tensor_numel = 564 * 128; + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + ::grpc::ByteBuffer msg; + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), static_cast(0)); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + + // deserialize bytebuffer + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 1); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + const int64_t* rows_data = + reinterpret_cast(varmsg.rows().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 32.7); + } + for (int i = 0; i < 564; ++i) { + EXPECT_EQ(rows_data[i], i); + } + + // deserialize zero-copy + // framework::Variable var2; + // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::VariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); +} + +void RunTestLodTensor(platform::Place place, int from_type = 0) { + // serialize var to ByteBuffer + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + int tensor_numel = 512 * 8 * 4 * 2; + platform::DeviceContextPool& pool = 
platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + ::grpc::ByteBuffer msg; + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), static_cast(0)); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 0); + EXPECT_EQ(varmsg.dims()[0], 512); + EXPECT_EQ(varmsg.dims()[1], 8); + EXPECT_EQ(varmsg.dims()[2], 4); + EXPECT_EQ(varmsg.dims()[3], 2); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 31.9); + } + + // message binary + std::string str; + varmsg.SerializeToString(&str); + + // message bytebuffer + ::grpc::Slice slices_2[1]; + int num_slices = 1; + slices_2[0] = ::grpc::Slice(str.length()); + memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); + ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); + + // deserialize zero-copy + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::VariableResponse resp(&scope, &ctx); + if (from_type == 0) { + EXPECT_EQ(resp.Parse(msg), 0); + } else { + EXPECT_EQ(resp.Parse(bytebuffer2), 0); + } + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); + RunTestLodTensor(place, 1); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); + RunTestLodTensor(gpu, 1); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -0,0 +1,414 @@ +/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/distributed/grpc_server.h" + +using ::grpc::ServerAsyncResponseWriter; + +namespace paddle { +namespace operators { +namespace distributed { +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : service_(service), + cq_(cq), + status_(PROCESS), + request_handler_(request_handler), + req_id_(req_id) { + PADDLE_ENFORCE(cq_); + } + virtual ~RequestBase() {} + virtual void Process() = 0; + + std::string Status2String(const std::string& method) { + std::string status = "Process"; + if (status_ == FINISH) { + status = "Finish"; + } + + std::ostringstream s; + s << method << " name:[" << GetReqName() << "]" + << ", ep:[" << ctx_.peer() << "]" + << " " << status << " using req_id:" << req_id_; + return s.str(); + } + + CallStatus Status() const { + std::lock_guard l(status_mu_); + return status_; + } + + template + void Finish(const T& reply, ServerAsyncResponseWriter* responder) { + std::lock_guard l(status_mu_); + status_ = FINISH; + responder->Finish(reply, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); + } + virtual std::string GetReqName() = 0; + + protected: + mutable std::mutex status_mu_; + ::grpc::ServerContext ctx_; + GrpcService::AsyncService* service_; + ::grpc::ServerCompletionQueue* cq_; + CallStatus status_; + RequestHandler* request_handler_; + int req_id_; +}; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx(), + !request_handler->sync_mode())); + int method_id = static_cast(distributed::GrpcMethod::kSendVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + virtual ~RequestSend() {} + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + std::string varname = GetReqName(); + VLOG(4) << "RequestSend var_name:" << varname; + + auto scope = request_->GetMutableLocalScope(); + auto invar = request_->GetVar(); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar); + Finish(reply_, &responder_); + } + + protected: + sendrecv::VoidMessage reply_; + std::shared_ptr request_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGet() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. 
+ std::string varname = request_.varname(); + VLOG(4) << "RequestGet " << varname; + + auto scope = request_handler_->scope(); + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar); + + if (outvar) { + SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestPrefetch final : public RequestBase { + public: + explicit RequestPrefetch(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + local_scope_(nullptr) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); + int method_id = + static_cast(distributed::GrpcMethod::kPrefetchVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestPrefetch() {} + + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + // prefetch process... + std::string in_var_name = request_->Varname(); + std::string out_var_name = request_->OutVarname(); + VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; + + auto scope = request_->GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + // out var must be created in local scope! + framework::Variable* outvar = scope->Var(out_var_name); + + request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name); + + SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), + &reply_); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + framework::Scope* local_scope_; +}; + +class RequestCheckpointNotify final : public RequestBase { + public: + explicit RequestCheckpointNotify(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new VariableResponse(request_handler->scope(), + request_handler->dev_ctx())); + int method_id = + static_cast(distributed::GrpcMethod::kCheckpointNotify); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestCheckpointNotify() {} + + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + auto scope = request_->GetMutableLocalScope(); + + std::string checkpoint_notify = request_->Varname(); + std::string checkpoint_dir = request_->OutVarname(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir; + + request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, + checkpoint_dir); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; +}; + +void AsyncGRPCServer::WaitServerReady() { + VLOG(4) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + 
condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(4) << "AsyncGRPCServer WaitSeverReady"; +} + +void AsyncGRPCServer::StartServer() { + ::grpc::ServerBuilder builder; + builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), + &selected_port_); + + builder.SetMaxSendMessageSize(std::numeric_limits::max()); + builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + builder.RegisterService(&service_); + + for (auto t : rpc_call_map_) { + rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); + } + + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << bind_address_ + << " selected port: " << selected_port_; + + std::function f = + std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, + std::placeholders::_1, std::placeholders::_2); + + for (auto& t : rpc_call_map_) { + auto& rpc_name = t.first; + auto& cq = rpc_cq_[rpc_name]; + auto threadnum = rpc_thread_num_[rpc_name]; + auto& reqs = rpc_reqs_[rpc_name]; + + reqs.reserve(kRequestBufSize); + + for (int i = 0; i < kRequestBufSize; i++) { + VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i; + TryToRegisterNewOne(rpc_name, i); + } + + for (int i = 0; i < threadnum; i++) { + rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( + &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); + VLOG(4) << t.first << " creates threads!"; + } + } + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + // wait server + server_->Wait(); + + for (auto& t : rpc_threads_) { + auto& threads = t.second; + for (size_t i = 0; i < threads.size(); ++i) { + threads[i]->join(); + VLOG(4) << t.first << " threads ends!"; + } + } +} + +void AsyncGRPCServer::ShutdownQueue() { + for (auto& t : rpc_cq_) { + t.second->Shutdown(); + VLOG(4) << t.first << " queue shutdown!"; + } +} + +void AsyncGRPCServer::ShutDownImpl() { + std::unique_lock lock(cq_mutex_); + is_shut_down_ = true; + ShutdownQueue(); + + VLOG(4) << "server_ shutdown!"; + server_->Shutdown(); +} + +void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, + int req_id) { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; + return; + } + + VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " REQ ID: " << req_id; + + auto& reqs = rpc_reqs_[rpc_name]; + auto& handler = rpc_call_map_[rpc_name]; + auto& cq = rpc_cq_[rpc_name]; + + RequestBase* b = nullptr; + if (rpc_name == kRequestSend) { + b = new RequestSend(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGet) { + b = new RequestGet(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestPrefetch) { + b = new RequestPrefetch(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestCheckpoint) { + b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id); + } else { + PADDLE_ENFORCE(false, "not supported rpc"); + } + + reqs[req_id] = b; + + VLOG(4) << "Create RequestSend status:" << b->Status(); +} + +void AsyncGRPCServer::HandleRequest( + ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, + std::function TryToRegisterNewOne) { + void* tag = NULL; + bool ok = false; + + while (true) { + VLOG(4) << "HandleRequest " << rpc_name << " wait next"; + if (!cq->Next(&tag, &ok)) { + VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; + break; + } + + int req_id = static_cast(reinterpret_cast(tag)); + VLOG(4) << "HandleRequest " 
<< rpc_name << ", req_id:" << req_id + << " get next"; + + auto& reqs = rpc_reqs_[rpc_name]; + RequestBase* base = nullptr; + { + PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize); + std::unique_lock lock(cq_mutex_); + base = reqs[req_id]; + } + + VLOG(3) << base->Status2String(rpc_name); + + // reference: + // https://github.com/tensorflow/tensorflow/issues/5596 + // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM + // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I + if (!ok) { + LOG(WARNING) << "completion queue:" << rpc_name + << " recv no regular event" + << " context:" << base->Status2String(rpc_name); + TryToRegisterNewOne(rpc_name, req_id); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + base->Process(); + break; + } + case FINISH: { + TryToRegisterNewOne(rpc_name, req_id); + delete base; + break; + } + default: { assert(false); } + } + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_server.h b/paddle/fluid/operators/distributed/grpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..d2524f5e65db6dedab78f45e17380359b58a3d11 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_server.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "grpc++/grpc++.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/grpc_service.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class RequestBase; + +class AsyncGRPCServer final : public RPCServer { + public: + explicit AsyncGRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncGRPCServer() {} + void WaitServerReady() override; + void StartServer() override; + + private: + // HandleRequest needs to be thread-safe. 
+ void HandleRequest( + ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, + std::function TryToRegisterNewOne); + + void TryToRegisterNewOne(const std::string& rpc_name, int req_id); + void ShutdownQueue(); + void ShutDownImpl() override; + + private: + static const int kRequestBufSize = 100; + + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + + GrpcService::AsyncService service_; + std::unique_ptr<::grpc::Server> server_; + + // condition of the sub program + std::condition_variable barrier_condition_; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + + int ready_; + + std::map> rpc_cq_; + std::map>> rpc_threads_; + std::map> rpc_reqs_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h new file mode 100644 index 0000000000000000000000000000000000000000..cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/operators/distributed/variable_response.h" + +#include "paddle/fluid/platform/profiler.h" + +// NOTE: This method was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// method and did some modifications so that we can parse gRPC +// requests without too much copying of the tensor data. + +namespace grpc { +class CompletionQueue; +class Channel; +class RpcService; +class ServerCompletionQueue; +class ServerContext; + +// Support parsing/unparsing of tensorflow::VariableResponse. +// Wire-format is identical to RecvVariableResponse. 
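StartServer() builds one completion queue and one request buffer per entry in rpc_call_map_, so every RPC kind has to be registered with its handler before the server starts, and StartServer() itself blocks until shutdown. A wiring sketch in the style of a listen_and_serv op; RegisterRPC() and ShutDown() are assumed from rpc_server.h (not shown in this hunk), and the handlers would still need scope, device context, and executor set before real traffic:

#include <memory>
#include <thread>  // NOLINT

#include "paddle/fluid/operators/distributed/grpc_server.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"

namespace dist = paddle::operators::distributed;

// Hedged sketch: register handlers, start the server on its own thread, and
// rely on the ready_/condition_ready_ handshake before serving clients.
void ServeSketch() {
  dist::RequestSendHandler send_handler(true /*sync_mode*/);
  dist::RequestGetHandler get_handler(true /*sync_mode*/);

  std::unique_ptr<dist::RPCServer> server(
      new dist::AsyncGRPCServer("127.0.0.1:0", /*client_num=*/1));
  server->RegisterRPC(dist::kRequestSend, &send_handler);  // assumed API
  server->RegisterRPC(dist::kRequestGet, &get_handler);    // assumed API
  send_handler.SetRPCServer(server.get());
  get_handler.SetRPCServer(server.get());

  std::thread server_thread([&server] { server->StartServer(); });
  server->WaitServerReady();  // returns once ready_ == 1
  // ... serve until the op decides to stop ...
  server->ShutDown();  // assumed to forward to ShutDownImpl()
  server_thread.join();
}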
+template <> +class SerializationTraits { + public: + static Status Serialize( + const paddle::operators::distributed::VariableResponse& msg, + grpc_byte_buffer** bp, bool* own_buffer) { + PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); + return Status(); + } + static Status Deserialize( + grpc_byte_buffer* buffer, + paddle::operators::distributed::VariableResponse* msg, + int max_message_size = INT_MAX) { + if (buffer == nullptr) { + return Status(StatusCode::INTERNAL, "No payload"); + } + + Status result = g_core_codegen_interface->ok(); + if (result.ok()) { + paddle::operators::distributed::GrpcByteSource source(buffer); + int ret = msg->Parse(&source); + if (ret != 0) { + result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); + } + } + g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); + return result; + } +}; +} // namespace grpc + +namespace paddle { +namespace operators { +namespace distributed { + +enum class GrpcMethod { + kSendVariable, + kGetVariable, + kPrefetchVariable, + kCheckpointNotify, +}; + +static const int kGrpcNumMethods = + static_cast(GrpcMethod::kCheckpointNotify) + 1; + +inline const char* GrpcMethodName(GrpcMethod id) { + switch (id) { + case GrpcMethod::kSendVariable: + return "/sendrecv.SendRecvService/SendVariable"; + case GrpcMethod::kGetVariable: + return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kPrefetchVariable: + return "/sendrecv.SendRecvService/PrefetchVariable"; + case GrpcMethod::kCheckpointNotify: + return "/sendrecv.SendRecvService/CheckpointNotify"; + } + + // Shouldn't be reached. + PADDLE_ENFORCE(false, "Invalid id: not found valid method name"); + return nullptr; +} + +class GrpcService final { + public: + class AsyncService : public ::grpc::Service { + public: + AsyncService() { + for (int i = 0; i < kGrpcNumMethods; ++i) { + AddMethod(new ::grpc::internal::RpcServiceMethod( + GrpcMethodName(static_cast(i)), + ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); + ::grpc::Service::MarkMethodAsync(i); + } + } + virtual ~AsyncService() {} + + // Make RequestAsyncUnary public for grpc_call.h + using ::grpc::Service::RequestAsyncUnary; + }; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c --- /dev/null +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. 
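The proto_encoder_helper.h that follows hand-rolls the protobuf wire format: a field key is (tag << 3) | wire_type, and varints emit seven payload bits per byte with the high bit as a continuation flag, which is what the EncodeVarint32/EncodeVarint64 and VarintLength helpers below implement. A small self-check sketch against known encodings (the function names are taken from that header; the harness itself is illustrative):

#include <cassert>
#include <cstdint>

#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"

void VarintSelfCheck() {
  char buf[8];
  // 300 = 0b1'0010'1100, which encodes to 0xAC 0x02 on the wire.
  char* end = paddle::operators::distributed::EncodeVarint32(buf, 300);
  assert(end - buf == 2);
  assert(static_cast<unsigned char>(buf[0]) == 0xAC);
  assert(static_cast<unsigned char>(buf[1]) == 0x02);
  assert(paddle::operators::distributed::VarintLength(300) == 2);
  // Field key for tag 3 with a length-delimited payload: (3 << 3) | 2 == 26.
  assert(((3u << 3) | 2u) == 26u);
}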
+ +#pragma once + +#include + +#include "grpc++/grpc++.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +class ProtoEncodeHelper { + public: + ProtoEncodeHelper(char* buf, int max_size) + : base_(buf), p_(buf), limit_(base_ + max_size) {} + + ~ProtoEncodeHelper() { + // Make sure callers didn't do operations that went over max_size promised + PADDLE_ENFORCE_LE(p_, limit_); + } + + const char* data() const { return base_; } + size_t size() const { return p_ - base_; } + + void WriteUint64(int tag, uint64_t v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + Encode64(v); + } + void WriteBool(int tag, bool v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + EncodeBool(v); + } + void WriteString(int tag, const std::string& v) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(v.size()); + EncodeBytes(v.data(), v.size()); + } + void WriteVarlengthBeginning(int tag, uint32_t len) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(len); + } + void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } + + private: + // Note: this module's behavior must match the protocol buffer wire encoding + // format. + enum { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, + }; + static uint32_t combine(uint32_t tag, uint32_t type) { + return ((tag << 3) | type); + } + inline void Encode32(uint32_t v) { + if (v < 128) { + // Fast path for single-byte values. Many of the calls will use a + // constant value for v, so the comparison will get optimized away + // when Encode32 is inlined into the caller. + *p_ = v; + p_++; + } else { + p_ = EncodeVarint32(p_, v); + } + } + void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } + void EncodeBool(bool v) { + *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 + p_++; + } + void EncodeBytes(const char* bytes, int N) { + memcpy(p_, bytes, N); + p_ += N; + } + + char* base_; + char* p_; + char* limit_; // Just for CHECKs +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h new file mode 100644 index 0000000000000000000000000000000000000000..271306d5d20f1b849a81a9bfa6436f2faf261204 --- /dev/null +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -0,0 +1,143 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace operators { +namespace distributed { + +constexpr char kRequestSend[] = "RequestSend"; +constexpr char kRequestGet[] = "RequestGet"; +constexpr char kRequestPrefetch[] = "RequestPrefetch"; +constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; +constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" +#define COMPLETE_MESSAGE "COMPLETE@RECV" +#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV" +#define END_PASS_MESSAGE "END_PASS@RECV" + +#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" +#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" + +class RPCServer; + +class RequestHandler { + public: + explicit RequestHandler(bool sync_mode) + : sync_mode_(sync_mode), + dev_ctx_(nullptr), + executor_(nullptr), + scope_(nullptr), + program_(nullptr), + rpc_server_(nullptr) {} + + virtual ~RequestHandler() {} + + // Set attributes. + void SetScope(framework::Scope* scope) { scope_ = scope; } + void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc* program) { program_ = program; } + void SetExecutor(framework::Executor* executor) { executor_ = executor; } + + // Used for dist lookup table prefetch + void SetPrefetchPreparedCtx( + std::unordered_map< + std::string, std::shared_ptr>* g) { + prefetch_var_name_to_prepared_ctx_ = g; + } + + void SetCheckpointNotifyPreparedCtx( + std::shared_ptr g) { + checkpoint_prepared_ctx_ = g; + } + + // Used for async. + void SetGradToPreparedCtx( + std::unordered_map< + std::string, std::shared_ptr>* g) { + grad_to_prepared_ctx_ = g; + } + + void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } + + // Get attributes. + bool sync_mode() { return sync_mode_; } + framework::Scope* scope() { return scope_; } + const platform::DeviceContext* dev_ctx() { return dev_ctx_; } + framework::ProgramDesc* program() { return program_; } + framework::Executor* executor() { return executor_; } + + // This function processes user's rpc request. + // The implemention is in request_handler_impl. 
+ // example: + // std::string varname = request_.varname(); + // + // auto scope = request_handler_->scope(); + // auto invar = scope->FindVar(varname); + // framework::Variable* outvar = nullptr; + // + // request_handler_->Handle(varname, scope, invar, &outvar); + // if (outvar) { + // SerializeToByteBuffer(varname, outvar, + // *request_handler_->dev_ctx(), &reply_); + // } + virtual bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") = 0; + + protected: + const bool sync_mode_; + + const platform::DeviceContext* dev_ctx_; + framework::Executor* executor_; + framework::Scope* scope_; + framework::ProgramDesc* program_; + + // used for distribute lookup table prefetch + std::unordered_map>* + prefetch_var_name_to_prepared_ctx_; + // used for checkpoint notify + std::shared_ptr checkpoint_prepared_ctx_; + + // Used for async. + std::unordered_map>* + grad_to_prepared_ctx_; + + RPCServer* rpc_server_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6 --- /dev/null +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { +namespace distributed { + +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. 
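The Handle() contract above is all an RPC kind needs to implement: given the request's variable name and the scope chosen by the transport, consume invar and/or point *outvar at whatever should be serialized back, then return success or failure. A minimal illustrative handler (not part of this patch) that just echoes a variable already present in the server-side scope:

#include <string>

#include "paddle/fluid/operators/distributed/request_handler.h"

namespace dist = paddle::operators::distributed;

// Hedged sketch of a custom handler obeying the Handle() contract; the gRPC
// layer serializes *outvar into the reply if it is left non-null.
class RequestEchoHandler final : public dist::RequestHandler {
 public:
  explicit RequestEchoHandler(bool sync_mode) : RequestHandler(sync_mode) {}

  bool Handle(const std::string& varname, paddle::framework::Scope* scope,
              paddle::framework::Variable* invar,
              paddle::framework::Variable** outvar,
              const std::string& out_var_name = "") override {
    *outvar = scope->FindVar(varname);  // look up in the server-side scope
    return *outvar != nullptr;
  }
};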
+constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; + +bool RequestSendHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const std::string& out_var_name) { + VLOG(4) << "RequestSendHandler:" << varname; + + // Async + if (!sync_mode_) { + try { + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + } catch (std::exception& e) { + LOG(ERROR) << "async: run sub program error " << e.what(); + return false; + } + return true; + } + + // Sync + if (varname == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "sync: recv batch barrier message"; + rpc_server_->IncreaseBatchBarrier(kRequestSend); + } else if (varname == BEGIN_PASS_MESSAGE) { + VLOG(3) << "sync: recv begin pass message"; + rpc_server_->WaitCond(kRequestSend); + rpc_server_->BeginPass(); + } else { + VLOG(3) << "sync: received var_name: " << varname; + rpc_server_->WaitCond(kRequestSend); + VLOG(3) << "sync: processing received var: " << varname; + + if (invar == nullptr) { + LOG(ERROR) << "sync: Can not find server side var: " << varname; + PADDLE_THROW("sync: Can not find server side var"); + return false; + } + if (invar->IsType()) { + std::unique_lock lock(mutex_sparse_vars_); + sparse_vars_.push_back(invar); + } + } + return true; +} + +void RequestSendHandler::ResetSparseVarRecorder() { + std::unique_lock lock(mutex_sparse_vars_); + for (auto* var : sparse_vars_) { + var->GetMutable()->mutable_rows()->clear(); + } + sparse_vars_.clear(); +} + +bool RequestGetHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const std::string& out_var_name) { + VLOG(4) << "RequestGetHandler:" << varname; + if (sync_mode_) { + if (varname == FETCH_BARRIER_MESSAGE) { + VLOG(3) << "sync: recv fetch barrier message"; + rpc_server_->IncreaseBatchBarrier(kRequestGet); + } else if (varname == END_PASS_MESSAGE) { + rpc_server_->EndPass(); + } else { + rpc_server_->WaitCond(kRequestGet); + *outvar = scope_->FindVar(varname); + } + } else { + if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) { + *outvar = scope_->FindVar(varname); + } + } + return true; +} + +bool RequestPrefetchHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const std::string& out_var_name) { + VLOG(4) << "RequestPrefetchHandler " << varname; + + auto var_desc = program_->Block(0).FindVar(out_var_name); + InitializeVariable(*outvar, var_desc->GetType()); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + + return true; +} + +bool RequestCheckpointHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const std::string& out_var_name) { + PADDLE_ENFORCE( + checkpoint_notify_id != -1, + "when checkpoint_notify_id = -1, there should be no RPC invoke."); + + auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + lt_var->clear(); + lt_var->append(out_var_name); + VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " + << out_var_name; + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope); + return true; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h new file mode 
100644 index 0000000000000000000000000000000000000000..87185500f2ffc3a8578eea339cc7a1e2b0e46631 --- /dev/null +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class RequestSendHandler final : public RequestHandler { + public: + explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestSendHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; + void ResetSparseVarRecorder(); + + private: + std::mutex mutex_sparse_vars_; + std::vector sparse_vars_; +}; + +class RequestGetHandler final : public RequestHandler { + public: + explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestGetHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; +}; + +class RequestPrefetchHandler final : public RequestHandler { + public: + explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestPrefetchHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; +}; + +class RequestCheckpointHandler final : public RequestHandler { + public: + explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id) + : RequestHandler(sync_mode) { + this->checkpoint_notify_id = checkpoint_notify_id; + } + virtual ~RequestCheckpointHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const std::string& out_var_name = "") override; + + private: + int checkpoint_notify_id; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5ec9fe5367beb97b3cc7298102deff1e8ca4ec9 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
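Reading the four handlers together with the barrier messages defined in request_handler.h, one synchronous training step follows a fixed message sequence. The sketch below shows it from a trainer's point of view; it is illustrative only, uses the client API declared further down in rpc_client.h, and assumes ep, ctx and scope are prepared elsewhere.

#include <string>
#include <vector>

#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/platform/device_context.h"

namespace distributed = paddle::operators::distributed;
namespace framework = paddle::framework;
namespace platform = paddle::platform;

// Sketch of one synchronous update step as seen by a trainer.
void SyncStep(distributed::RPCClient* client,
              const platform::DeviceContext& ctx,
              const framework::Scope& scope, const std::string& ep,
              const std::vector<std::string>& grads,
              const std::vector<std::string>& params) {
  // 1. Push gradients; RequestSendHandler records sparse ones per batch.
  for (const auto& g : grads) client->AsyncSendVar(ep, ctx, scope, g);
  // 2. BATCH_BARRIER_MESSAGE: once every trainer has sent it,
  //    IncreaseBatchBarrier(kRequestSend) releases the server-side
  //    WaitBarrier and the optimize block can run.
  client->AsyncSendBatchBarrier(ep);
  // 3. Pull updated parameters; RequestGetHandler answers once the serving
  //    op signals SetCond(kRequestGet) after optimization (assumed here).
  for (const auto& p : params) client->AsyncGetVar(ep, ctx, scope, p);
  // 4. FETCH_BARRIER_MESSAGE tells the server all trainers finished
  //    fetching, so counters can be reset for the next batch.
  client->AsyncSendFetchBarrier(ep);
  client->Wait();
}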
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "gflags/gflags.h" + +// default to 3min to avoid temprary network failures. +DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag RPCClient::init_flag_; +std::unique_ptr RPCClient::rpc_client_(nullptr); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..6479d3a97bafba37b74a1d1c04852a6e60e01be8 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -0,0 +1,101 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
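Because the deadline is a regular gflag, it can be tuned without touching this code; the value is visible through DECLARE_int32(rpc_deadline) in rpc_client.h. A minimal sketch, assuming gflags is initialized as usual in the binary:

#include "gflags/gflags.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"  // DECLARE_int32(rpc_deadline)

// Either pass --rpc_deadline=300000 on the command line, or adjust it
// programmatically before issuing RPCs (e.g. to fail fast in tests).
void ShortenRpcDeadlineForTests() {
  FLAGS_rpc_deadline = 5000;  // 5 seconds instead of the 3 minute default
}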
+ +#pragma once + +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +class RPCClient { + public: + RPCClient() {} + virtual ~RPCClient() {} + virtual bool AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual bool AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncSendBeginPass(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual void AsyncSendEndPass(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + // BeginePass/EndPass tells all the pserver that start/end a pass, so that + // the pserver can increase/reduce it's barrier count, and continue to train + // with other trainers. + virtual void SendBeginPass() = 0; + virtual void SendEndPass() = 0; + + virtual void Wait() = 0; + + template + static RPCClient* GetInstance() { + std::call_once(init_flag_, &RPCClient::Init); + return rpc_client_.get(); + } + + // Init is called by GetInstance. + template + static void Init() { + if (rpc_client_.get() == nullptr) { + rpc_client_.reset(new T()); + rpc_client_->InitImpl(); + } + } + + protected: + virtual void InitImpl() {} + + private: + static std::once_flag init_flag_; + static std::unique_ptr rpc_client_; +}; +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..d49ee34eeaf4e80f6fd4f8cdc548cc2b938d0f2a --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
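GetInstance pairs std::call_once with the static unique_ptr defined in rpc_client.cc above, so the first concrete transport requested wins and every later caller shares that one instance. Usage looks roughly like the sketch below; RPCCLIENT_T is assumed to be a build-selected client type (mirroring the RPCSERVER_T macro used in rpc_server_test.cc), so treat that name as an assumption.

#include "paddle/fluid/operators/detail/macros.h"           // RPCCLIENT_T (assumed)
#include "paddle/fluid/operators/distributed/rpc_client.h"

namespace distributed = paddle::operators::distributed;

void ClientUsageSketch() {
  // The first call constructs the client exactly once via Init<T>().
  distributed::RPCClient* client =
      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
  // Any later call, with any template argument, returns the same object.
  client->AsyncSendBatchBarrier("127.0.0.1:6174");
  client->Wait();
}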
+ +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void RPCServer::ShutDown() { + LOG(INFO) << "RPCServer ShutDown "; + ShutDownImpl(); + + exit_flag_ = true; + barrier_cond_.notify_all(); + rpc_cond_.notify_all(); +} + +void RPCServer::SavePort() const { + auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); + std::ofstream port_file; + port_file.open(file_path); + port_file << selected_port_; + port_file.close(); + VLOG(4) << "selected port written to " << file_path; +} + +void RPCServer::WaitBarrier(const std::string& rpc_name) { + std::unique_lock lock(this->mutex_); + barrier_cond_.wait(lock, [this, &rpc_name] { + return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || + exit_flag_.load()); + }); + + VLOG(3) << "batch_barrier_: " << rpc_name << " " + << barrier_counter_[rpc_name]; +} + +void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { + VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + int b = 0; + std::unique_lock lock(mutex_); + b = ++barrier_counter_[rpc_name]; + if (b >= client_num_) { + lock.unlock(); + barrier_cond_.notify_all(); + lock.lock(); + } +} + +void RPCServer::BeginPass() { + VLOG(4) << "RPCServer begin increase pass barrier"; + { + std::unique_lock lock(mutex_); + client_num_++; + VLOG(4) << "increase client_num to: " << client_num_; + } + barrier_cond_.notify_all(); +} + +void RPCServer::EndPass() { + VLOG(4) << "RPCServer begin increase pass barrier"; + { + std::unique_lock lock(mutex_); + client_num_--; + VLOG(4) << "decrease client_num to: " << client_num_; + if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { + barrier_counter_[kRequestGet]--; + } + } + barrier_cond_.notify_all(); +} + +void RPCServer::ResetBarrierCounter() { + VLOG(3) << "RPCServer ResetBarrierCounter "; + std::unique_lock lock(mutex_); + for (auto& t : barrier_counter_) { + t.second = 0; + } +} + +void RPCServer::RegisterRPC(const std::string& rpc_name, + RequestHandler* handler, int thread_num) { + rpc_call_map_[rpc_name] = handler; + rpc_thread_num_[rpc_name] = thread_num; + + static int cond = -1; + rpc_cond_map_[rpc_name] = ++cond; + VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler + << ", cond:" << rpc_cond_map_[rpc_name]; +} + +void RPCServer::SetCond(const std::string& rpc_name) { + VLOG(3) << "RPCServer SetCond " << rpc_name; + { + std::unique_lock lock(mutex_); + cur_cond_ = rpc_cond_map_[rpc_name]; + } + + rpc_cond_.notify_all(); +} + +void RPCServer::WaitCond(const std::string& rpc_name) { + VLOG(4) << "RPCServer WaitCond " << rpc_name; + int cond = 0; + { + std::unique_lock lock(mutex_); + cond = rpc_cond_map_[rpc_name]; + } + + std::unique_lock lock(mutex_); + rpc_cond_.wait( + lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..833991c8aa6e7cfd10f2aa52f9218be7ff8ccebf --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -0,0 +1,97 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
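The barrier bookkeeping implemented above is easiest to see in isolation. The sketch below is illustrative only: it uses a do-nothing subclass (the networking entry points are pure virtual) to show that WaitBarrier(kRequestSend) unblocks only after client_num clients have called IncreaseBatchBarrier.

#include <thread>  // NOLINT

#include "paddle/fluid/operators/distributed/rpc_server.h"

namespace distributed = paddle::operators::distributed;

// Do-nothing server: only the barrier logic above is exercised.
class NoopRPCServer final : public distributed::RPCServer {
 public:
  using distributed::RPCServer::RPCServer;
  void StartServer() override {}
  void WaitServerReady() override {}

 protected:
  void ShutDownImpl() override {}
};

void BarrierSketch() {
  NoopRPCServer server("127.0.0.1:0", 2 /* client_num */);
  server.RegisterRPC(distributed::kRequestSend, nullptr);

  // Two "clients" reach the batch barrier at different times.
  std::thread c1([&] { server.IncreaseBatchBarrier(distributed::kRequestSend); });
  std::thread c2([&] { server.IncreaseBatchBarrier(distributed::kRequestSend); });

  // Returns only once the counter for kRequestSend reaches client_num.
  server.WaitBarrier(distributed::kRequestSend);
  server.ResetBarrierCounter();

  c1.join();
  c2.join();
}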
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/operators/distributed/request_handler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class RPCServer { + public: + explicit RPCServer(const std::string& address, int client_num) + : cur_cond_(0), + bind_address_(address), + exit_flag_(false), + selected_port_(0), + client_num_(client_num) {} + + virtual ~RPCServer() {} + virtual void StartServer() = 0; + virtual void WaitServerReady() = 0; + + void ShutDown(); + + bool IsExit() { return exit_flag_.load(); } + + int GetSelectedPort() const { return selected_port_; } + + int GetClientNum() const; + + void SavePort() const; + + // RegisterRPC, register the rpc method name to a handler + // class, and auto generate a condition id for this call + // to be used for the barrier. + void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, + int thread_num = 5); + + // Wait util all the clients have reached the barrier for one + // rpc method. This function should be called in the + // RequestHandler if you want to run the server/client in a + // synchronous mode. + void WaitBarrier(const std::string& rpc_name); + + void SetCond(const std::string& rpc_name); + void WaitCond(const std::string& rpc_name); + void IncreaseBatchBarrier(const std::string rpc_name); + + void BeginPass(); + void EndPass(); + + void ResetBarrierCounter(); + + protected: + virtual void ShutDownImpl() = 0; + + private: + std::mutex mutex_; + std::unordered_map barrier_counter_; + std::condition_variable barrier_cond_; + + std::unordered_map rpc_cond_map_; + std::atomic cur_cond_; + std::condition_variable rpc_cond_; + + protected: + std::string bind_address_; + std::atomic exit_flag_; + int selected_port_; + int client_num_; + + std::unordered_map rpc_call_map_; + std::unordered_map rpc_thread_num_; + friend class RequestHandler; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0693cffabcc561b0adfafc2c49027a890dd5efc --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::operators::distributed; + +USE_OP(lookup_table); + +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; + +framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { + auto root_block = program->MutableBlock(0); + auto* block = program->AppendBlock(*root_block); + + framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); + framework::VariableNameMap output({{"Output", {"out"}}}); + auto op = block->AppendOp(); + op->SetType("lookup_table"); + op->SetInput("W", {"w"}); + op->SetInput("Ids", {"ids"}); + op->SetOutput("Out", {"out"}); + + auto& out = *root_block->Var("out"); + out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetShape({10, 10}); + + return block; +} + +void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { + auto w_var = scope->Var("w"); + w_var->GetMutable(); + + auto out_var = scope->Var("out"); + out_var->GetMutable(); + + auto ids_var = scope->Var("ids"); + ids_var->GetMutable(); +} + +void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope, place); + auto ids_var = scope->Var("ids")->GetMutable(); + auto rows = ids_var->mutable_rows(); + for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); + ids_var->mutable_value()->Resize({rows_numel, 1}); + ids_var->mutable_value()->mutable_data(*place); +} + +void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope, place); + auto w = scope->Var("w")->GetMutable(); + auto rows = w->mutable_rows(); + for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i); + auto w_value = w->mutable_value(); + w_value->Resize({rows_numel, 10}); + + auto ptr = w_value->mutable_data(*place); + + for (int64_t i = 0; i < w_value->numel(); ++i) { + ptr[i] = static_cast(i / 10); + } +} + +void StartServer() { + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + framework::Executor exe(place); + platform::CPUDeviceContext ctx(place); + auto* block = AppendPrefetchBlcok(&program); + std::string in_var_name("ids"); + std::vector prefetch_block_ids{block->ID()}; + auto prepared = exe.Prepare(program, prefetch_block_ids); + InitTensorsOnServer(&scope, &place, 10); + + std::unordered_map> + prefetch_var_name_to_prepared; + prefetch_var_name_to_prepared[in_var_name] = prepared[0]; + g_req_handler->SetProgram(&program); + g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); + g_req_handler->SetDevCtx(&ctx); + g_req_handler->SetScope(&scope); + g_req_handler->SetExecutor(&exe); + + g_rpc_service->RegisterRPC(distributed::kRequestPrefetch, + g_req_handler.get()); + g_req_handler->SetRPCServer(g_rpc_service.get()); + + std::thread server_thread( + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); + + server_thread.join(); +} + +TEST(PREFETCH, CPU) { + g_req_handler.reset(new 
distributed::RequestPrefetchHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); + + std::thread server_thread(StartServer); + g_rpc_service->WaitServerReady(); + + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + { + // create var on local scope + int64_t rows_numel = 5; + InitTensorsOnClient(&scope, &place, rows_numel); + std::string in_var_name("ids"); + std::string out_var_name("out"); + + client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); + client->Wait(); + auto var = scope.Var(out_var_name); + auto value = var->GetMutable()->value(); + auto ptr = value.mutable_data(place); + + for (int64_t i = 0; i < rows_numel; ++i) { + EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); + } + } + + g_rpc_service->ShutDown(); + server_thread.join(); + LOG(INFO) << "begin reset"; + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); +} diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto new file mode 100644 index 0000000000000000000000000000000000000000..e0902320cff003797b12ed0204f7f99c44554b62 --- /dev/null +++ b/paddle/fluid/operators/distributed/send_recv.proto @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under +the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto3"; +package sendrecv; + +// option cc_generic_services = true; + +service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. + rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // pre-fetch variable by given variable name and Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} + + rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} +} + +// VariableMessage is serialized paddle variable message. +// It can be: +// LoDTensor +// SelectedRows +enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + NCCL_ID = 2; +} + +// NOTICE(gongwb):don't modify this proto if you are not +// not familar with how we serialize in sendrecvop_utils.h +// and deserilize it in variable_response.h. +message VariableMessage { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + } + + message LodData { repeated int64 lod_data = 1; } + string varname = 1; + // TODO(Yancey1989): reference framework::proto::VarDesc::VarType + VarType type = 2; + // bool persistable is not needed for sending. 
+ // tensor info: + Type data_type = 3; + repeated int64 dims = 4; + + // lod details: + int64 lod_level = 5; + repeated LodData lod = 6; + // selected_rows height, aka. original dim0 + int64 slr_height = 7; + // tensor data + bytes serialized = 8; + // selected_rows data + bytes rows = 9; + // Look up table block execution output variable name. + string out_varname = 10; + // If 1, the ps server will start profiling, the ps + // server stops profiling and generates a profile to /tmp/profile_ps_* + // when profile switches from 1 to 2. + int64 profile = 11; +} + +message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..98129d9f1014c39347e3409533f2bc10092611d2 --- /dev/null +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -0,0 +1,232 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using VarMsg = sendrecv::VariableMessage; + +void GetTensorPayload(framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + void** payload, size_t* payload_size) { + auto tensor = var->Get(); + // FIXME(wuyi): data types in send_recv.proto is copied from + // framework.proto + request->set_data_type( + static_cast(framework::ToDataType(tensor.type()))); + for (auto& dim : framework::vectorize(tensor.dims())) { + request->add_dims(dim); + } + const framework::LoD lod = tensor.lod(); + if (lod.size() > 0) { + request->set_lod_level(lod.size()); + for (auto& each : lod) { + VarMsg::LodData* lod_inner = request->add_lod(); + for (auto& d : each) { + lod_inner->add_lod_data(d); + } + } + } + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); + platform::CUDAPinnedPlace cuda_pinned; + auto& gpu_dev_ctx = static_cast(ctx); + auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); + *payload = memory::Alloc(cuda_pinned, copy_size); + + memory::Copy(cuda_pinned, *payload, + boost::get(tensor.place()), + reinterpret_cast(tensor.data()), copy_size, + gpu_dev_ctx.stream()); + ctx.Wait(); +#endif + } else { + *payload = tensor.data(); + } + *payload_size = tensor.numel() * framework::SizeOfType(tensor.type()); +} + +void GetSelectedRowsPayload(framework::Variable* var, + const 
platform::DeviceContext& ctx, VarMsg* request, + void** payload, size_t* payload_size) { + auto* slr = var->GetMutable(); + request->set_data_type( + static_cast(framework::ToDataType(slr->value().type()))); + request->set_lod_level(0); + request->set_slr_height(slr->height()); + + for (auto& dim : framework::vectorize(slr->value().dims())) { + request->add_dims(dim); + } + + auto* tensor = slr->mutable_value(); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CUDAPinnedPlace cuda_pinned; + auto& gpu_dev_ctx = static_cast(ctx); + auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type()); + *payload = memory::Alloc(cuda_pinned, copy_size); + memory::Copy(cuda_pinned, *payload, + boost::get(tensor->place()), + reinterpret_cast(tensor->data()), copy_size, + gpu_dev_ctx.stream()); + ctx.Wait(); +#endif + } else { + *payload = slr->mutable_value()->data(); + } + *payload_size = tensor->numel() * framework::SizeOfType(tensor->type()); +} + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_name) { + // Default DestroyCallback does nothing, When using GPU + // the CPU buffer need to be freed. + DestroyCallback destroy_callback = [](void* backing) {}; + VarMsg request; + void* payload = nullptr; + size_t payload_size; + + request.set_varname(name); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request.set_profile(platform::kEnableProfiler); + } else { + request.set_profile(platform::kDisableProfiler); + } + } + if (!out_name.empty()) { + request.set_out_varname(out_name); + } + if (var->IsType()) { + request.set_type(::sendrecv::LOD_TENSOR); + GetTensorPayload(var, ctx, &request, &payload, &payload_size); + } else if (var->IsType()) { + request.set_type(::sendrecv::SELECTED_ROWS); + GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request.set_type(::sendrecv::NCCL_ID); +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + // GPU data is copied to CPU buffer when sending, + // free the buffer when possible. + destroy_callback = [](void* backing) { + platform::CUDAPinnedPlace cuda_pinned; + memory::Free(cuda_pinned, backing); + }; +#endif + } + + std::string header; + request.AppendToString(&header); + auto buffer = std::unique_ptr(new char[1024]); + void* buf = buffer.get(); + ProtoEncodeHelper e(static_cast(buf), 1024); + e.WriteRawBytes(std::string(header.data(), header.size())); +// NCCLID is copied directly to the message, return bytebuffer +// with only one slice if serializing NCCLID. 
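// The ::grpc::ByteBuffer assembled below is a small set of slices rather
// than one flat copy:
//   slice 0 - the VariableMessage header fields written into `e` above,
//             followed by the varint length prefix of the `serialized`
//             field;
//   slice 1 - the raw tensor payload, referenced in place via
//             grpc_slice_new_with_user_data (STEAL_REF) so large tensors
//             are not copied again;
//   slices 2-3 (SelectedRows only) - a second length prefix for the `rows`
//             field and the rows buffer, again referenced in place.
// NCCL unique ids are only a few bytes, so they take the single-slice
// shortcut right below.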
+#ifdef PADDLE_WITH_CUDA + if (var->IsType()) { + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, + NCCL_UNIQUE_ID_BYTES); + const ncclUniqueId& uid = var->Get(); + e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); + + // for serialize NCCL_ID + ::grpc::Slice slices(e.size()); + memcpy(const_cast(slices.begin()), e.data(), e.size()); + ::grpc::ByteBuffer tmp(&slices, 1); + msg->Swap(&tmp); + return; + } +#endif + + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, + static_cast(payload)), + ::grpc::Slice::STEAL_REF); + + if (var->IsType()) { + auto* slr = var->GetMutable(); + ProtoEncodeHelper e2(static_cast(buf), 128); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, [](void* backing) {}, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var) { + operators::distributed::VariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); + *var = resp.GetVar(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fe25e73fa608727ba0bb912a82776b330ec8d83a --- /dev/null +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +typedef void (*DestroyCallback)(void*); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_varname = std::string()); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var); + +inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { + switch (type) { + case sendrecv::VariableMessage::FP32: + return typeid(float); // NOLINT + case sendrecv::VariableMessage::FP64: + return typeid(double); // NOLINT + case sendrecv::VariableMessage::INT32: + return typeid(int); // NOLINT + case sendrecv::VariableMessage::INT64: + return typeid(int64_t); // NOLINT + case sendrecv::VariableMessage::BOOL: + return typeid(bool); // NOLINT + default: + PADDLE_THROW("Not support type %d", type); + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc new file mode 100644 index 0000000000000000000000000000000000000000..45832c60bf9172497afabac927ba39a7cbfb9a52 --- /dev/null +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -0,0 +1,489 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
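Together, the two helpers declared above give a complete CPU round trip; the sketch below packs a LoDTensor into a ByteBuffer and parses it back into a second scope. This is a sketch, not a test from this PR, and it assumes a CPU-only path.

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/device_context.h"

namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::operators::distributed;

void RoundTripSketch() {
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);

  // Sender side: a small float tensor in a scope.
  framework::Scope send_scope;
  auto* tensor = send_scope.Var("x")->GetMutable<framework::LoDTensor>();
  tensor->Resize({4, 8});
  auto* data = tensor->mutable_data<float>(place);
  for (int64_t i = 0; i < tensor->numel(); ++i) data[i] = 1.0f;

  ::grpc::ByteBuffer msg;
  distributed::SerializeToByteBuffer("x", send_scope.FindVar("x"), ctx, &msg);

  // Receiver side: the variable must already exist in the target scope,
  // since DeserializeFromByteBuffer resolves it with FindVar.
  framework::Scope recv_scope;
  recv_scope.Var("x");
  framework::Variable* out = nullptr;
  distributed::DeserializeFromByteBuffer(msg, ctx, &recv_scope, &out);
}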
+ +#include "paddle/fluid/operators/distributed/variable_response.h" + +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include "paddle/fluid/platform/profiler.h" + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +enum WireType { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, +}; + +inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } + +inline WireType GetTagWireType(uint32_t tag) { + return static_cast(tag & 0x7); +} + +bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, + int* result) { + uint64_t v; + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + *result = static_cast(v); + return true; + } else { + return false; + } +} + +bool ReadRaw(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& dev_ctx, platform::Place place, + void* dest, int size) { + const void* data = NULL; + int size_to_write = 0; + int length = size; + int total_written = 0; + + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + + char* p = reinterpret_cast(dest); + while (total_written < length) { + if (!input->GetDirectBufferPointer(&data, &size_to_write)) { + return false; + } + // NOTE: if raw buffer is large and have two neighbor fields of raw + // buffers GetDirectBufferPointer can get all of them, use length to + // truncate it. + if (total_written + size_to_write > length) { + size_to_write = length - total_written; + } + // This log is useful to see how long a internal block size is of rpc. + VLOG(7) << "copy " << size_to_write << " data to CUDAPlace"; + memory::Copy(boost::get(place), + reinterpret_cast(p), cpu, data, size_to_write, + gpu_dev_ctx.stream()); + p += size_to_write; + total_written += size_to_write; + + input->Skip(size_to_write); + } + gpu_dev_ctx.Wait(); +#else + PADDLE_THROW("Unexpected branch"); +#endif + return true; + } + + char* p = reinterpret_cast(dest); + while (total_written < length) { + if (!input->GetDirectBufferPointer(&data, &size_to_write)) { + return false; + } + // NOTE: if raw buffer is large and have two neighbor fields of raw buffers + // GetDirectBufferPointer can get all of them, use length to truncate it. + if (total_written + size_to_write > length) { + size_to_write = length - total_written; + } + // TODO(gongwb): can we avoid copy? + platform::CPUPlace cpu; + // This log is useful to see how long a internal block size is of rpc. 
+ VLOG(7) << "copy " << size_to_write << " data to CPUPlace"; + memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); + + p += size_to_write; + total_written += size_to_write; + + input->Skip(size_to_write); + } + + return true; +} + +bool VariableResponse::CopyLodTensorData( + ::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, const framework::DDim& dims, + int length) { + auto* tensor = GetVar()->GetMutable(); + tensor->Resize(dims); + + framework::LoD lod; + for (int i = 0; i < meta_.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) { + v.push_back(meta_.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + + void* tensor_data = + tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); + + if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { + return false; + } + + return true; +} + +inline framework::DDim GetDims( + const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) { + std::vector vecdims; + for (auto& d : dims) { + vecdims.push_back(d); + } + return framework::make_ddim(vecdims); +} + +bool VariableResponse::CopySelectRowsTensorData( + ::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, const framework::DDim& dims, + int length) { + auto* slr = GetVar()->GetMutable(); + slr->set_height(meta_.slr_height()); + auto* tensor = slr->mutable_value(); + tensor->Resize(dims); + PADDLE_ENFORCE_EQ(static_cast(tensor->numel()), + length / framework::SizeOfType( + paddle::operators::distributed::ToTypeIndex( + meta_.data_type()))); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::distributed::ToTypeIndex(meta_.data_type())); + + if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { + return false; + } + + return true; +} + +bool VariableResponse::CopySelectRowsData( + ::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, int length) { + auto* slr = GetVar()->GetMutable(); + slr->mutable_rows()->resize(length / + framework::SizeOfType(typeid(int64_t))); // int64 + int64_t* rows_data = slr->mutable_rows()->data(); + + // copy rows CPU data, GPU data will be copied lazily. 
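// The row indices of a SelectedRows live in a framework::Vector<int64_t>,
// which keeps a host copy and mirrors it to the device only when a CUDA
// kernel asks for it, so filling the CPU view here is sufficient. The value
// tensor itself was already placed on ctx.GetPlace() when
// CopySelectRowsTensorData handled the `serialized` field.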
+ platform::CPUPlace cpu; + if (!ReadRaw(input, ctx, cpu, rows_data, length)) { + return false; + } + + return true; +} + +bool ParseLodData(::google::protobuf::io::CodedInputStream* input, + std::vector* lod) { + while (true) { + auto p = input->ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + + if (!p.second) { + return (tag == 0); + } + + switch (tag) { + case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { + uint64_t v; + if (wt == WIRETYPE_VARINT) { + if (!input->ReadVarint64(&v)) { + return false; + } + lod->push_back(v); + break; + } + + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input->ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input->CurrentPosition(); + while (input->CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input->ReadVarint64(&v)) { + return tag; + } + lod->push_back(v); + } + break; + } + + return false; + } + default: { return false; } + } + } + + return true; +} + +int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { + GrpcByteBufferSource source; + source.Init(byte_buffer); + GrpcByteBufferSourceWrapper r(&source); + + return Parse(&r); +} + +int VariableResponse::Parse(Source* source) { + ::google::protobuf::io::ZeroCopyInputStream* input_stream = + source->contents(); + ::google::protobuf::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (true) { + auto p = input.ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + if (tag != 0) { + return -1; + } + return 0; + } + + switch (tag) { + case sendrecv::VariableMessage::kVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_varname(temp); + break; + } + case sendrecv::VariableMessage::kTypeFieldNumber: { + uint32_t v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_type(static_cast<::sendrecv::VarType>(v)); + break; + } + case sendrecv::VariableMessage::kDataTypeFieldNumber: { + uint32_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); + break; + } + case sendrecv::VariableMessage::kDimsFieldNumber: { + // not packed + if (wt == WIRETYPE_VARINT) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + break; + } + + // packed + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input.ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input.CurrentPosition(); + while (input.CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + } + break; + } + return tag; + } + case sendrecv::VariableMessage::kLodLevelFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_lod_level(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kLodFieldNumber: { + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = + input.IncrementRecursionDepthAndPushLimit(length); + + std::vector lod_data; + 
if (p.second < 0 || !ParseLodData(&input, &lod_data)) { + return tag; + } + + if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { + return false; + } + + if (lod_data.size() == 0) { + break; + } + + auto lod = meta_.add_lod(); + for (uint32_t i = 0; i < lod_data.size(); i++) { + lod->add_lod_data(lod_data[i]); + } + break; + } + case sendrecv::VariableMessage::kSlrHeightFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_slr_height(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kSerializedFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR || + meta_.type() == sendrecv::NCCL_ID) && + meta_.varname() != "", + "meta info should be got first!"); + + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (meta_.type() == sendrecv::NCCL_ID) { +#ifdef PADDLE_WITH_CUDA + auto* var = scope_->FindVar(meta_.varname()); + if (var != nullptr) { + ncclUniqueId* id = var->GetMutable(); + if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal, + num_bytes)) { + return tag; + } + } + break; +#else + PADDLE_THROW("Not compiled with CUDA!"); +#endif + } + + framework::DDim dims = GetDims(meta_.dims()); + if (meta_.type() == sendrecv::LOD_TENSOR) { + PADDLE_ENFORCE(meta_.lod_size() >= 0, + "lod info should be got first!"); + if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) { + return tag; + } + break; + } + + if (meta_.type() == sendrecv::SELECTED_ROWS) { + if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) { + return tag; + } + break; + } + + return tag; + } + case sendrecv::VariableMessage::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return tag; + } + break; + } + case sendrecv::VariableMessage::kOutVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_out_varname(temp); + break; + } + case sendrecv::VariableMessage::kProfileFieldNumber: { + uint64_t profiling = 0; + if (!input.ReadVarint64(&profiling)) { + return tag; + } + meta_.set_profile(profiling); + int64_t listener_id = platform::ListenerId(); + if (listener_id <= 0) { + break; + } + if (profiling == platform::kEnableProfiler && + !platform::IsProfileEnabled()) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else if (profiling == platform::kDisableProfiler && + platform::IsProfileEnabled()) { + // TODO(panyx0718): Should we allow to customize file dir. + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("/tmp/profile_ps_%lld", listener_id)); + } + break; + } + default: { + // Unknown tag, return unknown error. 
+ return -1; + } + } + } + + return 0; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h new file mode 100644 index 0000000000000000000000000000000000000000..1db4a0a522654ff2497b8bd9ee1381b5ab64067a --- /dev/null +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class VariableResponse { + public: + VariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { + if (create_scope) { + local_scope_ = &scope->NewScope(); + } + } + + virtual ~VariableResponse() { + if (create_scope_) { + scope_->DeleteScope(local_scope_); + } + } + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. + int Parse(Source* source); + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. + int Parse(const ::grpc::ByteBuffer& byte_buffer); + + const framework::Scope& GetLocalScope() const { return *local_scope_; } + + framework::Scope* GetMutableLocalScope() const { return local_scope_; } + + inline std::string Varname() const { return meta_.varname(); } + inline std::string OutVarname() const { return meta_.out_varname(); } + + // should call parse first. 
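  // GetVar() resolves the variable named in the parsed message. With
  // create_scope_ == true it is created on demand inside the private local
  // scope (presumably so concurrent requests do not race on one shared
  // scope); otherwise it must already exist in the scope passed to the
  // constructor, which is the mode DeserializeFromByteBuffer relies on.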
+ framework::Variable* GetVar() { + if (create_scope_) { + return local_scope_->Var(meta_.varname()); + } + return scope_->FindVar(meta_.varname()); + } + + private: + bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, + const framework::DDim& dims, int length); + + bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, int length); + + bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, + const framework::DDim& dims, int length); + + private: + const framework::Scope* scope_; + const platform::DeviceContext* dev_ctx_; + bool create_scope_ = false; + framework::Scope* local_scope_ = nullptr; + // only Skeleton + sendrecv::VariableMessage meta_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a5427b39241b666eeaf12b173ea00443bb5f6e4 --- /dev/null +++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise_op_function.h" + +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::reorder; +using mkldnn::primitive; +using mkldnn::stream; +using mkldnn::sum; + +template +class EltwiseAddMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + const T* x_data = x->data(); + const T* y_data = y->data(); + T* z_data = z->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + auto z_dims = z->dims(); + + // Execute default elementwise_add operator when + // broadcast operations need to performed. + if (x_dims != y_dims) { + auto sum_func = [](T a, T b) -> T { return a + b; }; + + TransformFunctor + functor( + x, y, z, + ctx.template device_context(), + sum_func); + + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + trim_trailing_singular_dims(&y_dims); + axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + + if (post == 1) { + functor.RunRowWise(n, pre); + } else { + functor.RunMidWise(n, pre, post); + } + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); + } else { + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for X tensor"); + PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN && + y->format() != memory::format::format_undef, + "Wrong layout/format set for Y tensor"); + + std::vector src_x_tz = framework::vectorize2int(x_dims); + std::vector src_y_tz = framework::vectorize2int(y_dims); + std::vector dst_tz = framework::vectorize2int(z_dims); + + std::vector srcs_pd; + std::vector srcs; + std::vector scales = {1.0f, 1.0f}; + + auto src_x_pd = memory::primitive_desc( + {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine); + auto src_y_pd = memory::primitive_desc( + {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine); + auto src_x_memory = + memory(src_x_pd, paddle::platform::to_void_cast(x_data)); + auto src_y_memory = + memory(src_y_pd, paddle::platform::to_void_cast(y_data)); + + srcs_pd.push_back(src_x_pd); + srcs_pd.push_back(src_y_pd); + srcs.push_back(src_x_memory); + srcs.push_back(src_y_memory); + + auto dst_md = + memory::desc({dst_tz}, memory::data_type::f32, memory::format::any); + + // create primitive descriptor for sum + auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); + + // create mkldnn memory for dst + memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); + + std::vector inputs; + inputs.push_back(srcs[0]); + inputs.push_back(srcs[1]); + + // create sum primitive + auto sum_prim = sum(sum_pd, inputs, dst_memory); + + std::vector pipeline; + pipeline.push_back(sum_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + z->set_layout(DataLayout::kMKLDNN); + z->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); + } + } +}; + +template +class EltwiseAddMKLDNNGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { + in->set_layout(DataLayout::kMKLDNN); + in->set_format(out->format()); + }; + + if (x->dims() == y->dims()) { + auto blas = math::GetBlas(ctx); + if (dx) { + blas.VCOPY(dout->numel(), dout->data(), + dx->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dx, dout); + } + + if (dy) { + blas.VCOPY(dout->numel(), dout->data(), + dy->mutable_data(ctx.GetPlace())); + set_mkldnn_format(dy, dout); + } + } else { + // Execute default kernel when broadcast is needed + ElemwiseGradCompute, IdentityGrad>( + ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), + IdentityGrad()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseAddMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseAddMKLDNNGradKernel) diff --git 
a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc index ba343909bb87b4f2efa56c0a4ff664b278e90c60..7cd67e74de6b9c4fbc718f60b4f671ccab2f9956 100644 --- a/paddle/fluid/operators/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise_mul_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise_op.h" namespace ops = paddle::operators; -REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y"); +REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y"); REGISTER_OP_CPU_KERNEL( elementwise_mul, ops::ElementwiseMulKernel, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index 12364fff96c03c5f9dff23c7c00ceedd043803a6..bb88970e42c194d9437609b62435f1a89e2b446b 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -14,8 +14,12 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ElementwiseOpInferVarType : public framework::VarTypeInference { @@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { "for broadcasting Y onto X.") .SetDefault(-1) .EqualGreaterThan(-1); + AddAttr("use_mkldnn", "(bool, default false). 
Used by MKLDNN.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( Limited Elementwise %s Operator @@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc index 847b7b0c12e1679501dbe83d578b23ca2aef3e9e..99fa659a351249a4a93f71700e1c646465861aba 100644 --- a/paddle/fluid/operators/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -115,6 +115,7 @@ class MKLDNNMemory { template class FCMKLDNNOpKernel : public paddle::framework::OpKernel { + public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 98b051afb551f373009d2bd3df1a8daa64b7e6c7..02beb80fc8a9f451393dcdd54492c4f88f908497 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -42,8 +42,8 @@ class FetchBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); rpc_client->Wait(); diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 8050f61d4546f3351645f23ddcc63b2c49f17929..4a974281481c8bc02589b428098475d73b8a0ba5 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { void Apply() override { AddAttr("mean", "(float, default 0.0) " - "mean of random tensor.") + "The mean (or center) of the gaussian distribution.") .SetDefault(.0f); AddAttr("std", "(float, default 1.0) " - "std of random tensor.") + "The standard deviation (std, or spread) of the " + "gaussian distribution.") .SetDefault(1.0f); AddAttr("seed", "(int, default 0) " @@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { .SetDefault(framework::proto::VarType::FP32); AddComment(R"DOC( -GaussianRandom Operator. Used to initialize tensors with gaussian random generator. +The defalut mean of the distribution is 0. and defalut standard +deviation (std) of the distribution is 1.. Uers can set mean and std +by input arguments. )DOC"); } }; diff --git a/paddle/fluid/operators/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/gaussian_random_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b00b396c1349eff5db1059268e7cf280a8fc64 --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_mkldnn_op.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/mean_op.h" + +namespace paddle { +namespace operators { + +using framework::DataLayout; +template +class GaussianMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + T* data = tensor->mutable_data(context.GetPlace()); + + unsigned int seed = static_cast(context.Attr("seed")); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::normal_distribution dist(mean, std); + int64_t size = tensor->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + + // The format of output is set as the mkldnn's format + // TODO(@mozga-intel) The format of matrix sets inside the another layers. + tensor->set_layout(DataLayout::kMKLDNN); + tensor->set_format(mkldnn::memory::format::oihw); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace, + ops::GaussianMKLDNNKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 815c1bb50988be49ca9996e368a59344c6583d58..1488aab1926b5b4ba7bceed582700f5a11fc6c93 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,6 +15,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -62,9 +66,20 @@ class GaussianRandomOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType( static_cast(ctx.Attr("dtype")), - ctx.device_context()); + ctx.device_context(), layout, library); } }; @@ -95,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { "(int, default 5(FP32)) " "Output data type.") .SetDefault(framework::proto::VarType::FP32); - + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( GaussianRandom Operator. 
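For reference, the new `gaussian_random_mkldnn_op.cc` kernel above seeds a `std::minstd_rand` engine from the `seed` attribute when it is non-zero, falls back to `std::random_device` otherwise, and then fills the output tensor with `std::normal_distribution` samples. The standalone sketch below is not part of this patch; `SampleGaussian` and the `main` driver are illustrative names only, and it shows that seeding/sampling scheme in isolation using plain C++:

```cpp
// Illustrative sketch of the seeding/sampling logic used by the MKLDNN
// gaussian_random kernel above; plain C++11, independent of Paddle.
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

std::vector<float> SampleGaussian(int64_t size, float mean, float std_dev,
                                  unsigned int seed) {
  if (seed == 0) {
    // Same fallback as the kernel: draw a fresh, non-deterministic seed.
    seed = std::random_device()();
  }
  std::minstd_rand engine;
  engine.seed(seed);
  std::normal_distribution<float> dist(mean, std_dev);

  std::vector<float> data(size);
  for (int64_t i = 0; i < size; ++i) {
    data[i] = dist(engine);
  }
  return data;
}

int main() {
  // A fixed non-zero seed makes the draw reproducible across runs.
  auto v = SampleGaussian(/*size=*/5, /*mean=*/0.0f, /*std_dev=*/1.0f,
                          /*seed=*/42);
  for (float x : v) std::cout << x << ' ';
  std::cout << '\n';
  return 0;
}
```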
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index f824eee4e7d1ef19c9a38fd5d3369265f9c549a0..697c239e59d158428ae9ba9f7feded19637dff28 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" namespace paddle { @@ -60,7 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { std::vector endpoint_list = Attr>("endpoint_list"); - detail::RPCClient* client = detail::RPCClient::GetInstance(); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); for (auto& ep : endpoint_list) { VLOG(3) << "sending nccl id to " << ep; @@ -80,11 +81,11 @@ class GenNCCLIdOp : public framework::OperatorBase { // NOTE: Can not use unique_ptr here because the default // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. - detail::RequestSendHandler rpc_h(true); - std::unique_ptr rpc_service( + distributed::RequestSendHandler rpc_h(true); + std::unique_ptr rpc_service( new RPCSERVER_T(endpoint, 1)); - rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); + rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h); rpc_h.SetRPCServer(rpc_service.get()); framework::ProgramDesc empty_program; @@ -95,11 +96,11 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::RPCServer::StartServer, rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); - rpc_service->SetCond(detail::kRequestSend); + rpc_service->SetCond(distributed::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - rpc_service->WaitBarrier(detail::kRequestSend); + rpc_service->WaitBarrier(distributed::kRequestSend); VLOG(3) << "got nccl id and stop server..."; rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index ab097d31e9ab5eafa788539170e7e405df697625..14ce1da2e97186a50ed8bd52223a500c4c57b328 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel { class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(LoDTensor) The input tensor."); + AddInput("X", "The input tensor."); AddInput("Scale", - "(Tensor, optional) Scale is a 1-dimensional tensor of size " + "(optional) Scale is a 1-dimensional tensor of size " "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." "It is applied to the output.") .AsDispensable(); AddInput("Bias", - "(Tensor, optional) Bias is a 1-dimensional tensor of size " + "(optional) Bias is a 1-dimensional tensor of size " "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])." 
"It is applied to the output.") .AsDispensable(); - AddOutput("Y", "(LoDTensor) Result after normalization."); - AddOutput("Mean", "(Tensor) Mean of the current mini batch.") - .AsIntermediate(); - AddOutput("Variance", "(Tensor) Variance of the current mini batch.") + AddOutput("Y", "Result after normalization."); + AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("Variance", "Variance of the current mini batch.") .AsIntermediate(); AddAttr("epsilon", - "(float, default 1e-5) Constant for " - "numerical stability") + "Constant for numerical stability [default 1e-5].") .SetDefault(1e-5) .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, "'epsilon' should be between 0.0 and 0.001."); }); AddAttr("begin_norm_axis", - "(int default:1), the " - "axis of `begin_norm_axis ... Rank(X) - 1` will be " + "the axis of `begin_norm_axis ... Rank(X) - 1` will be " "normalized. `begin_norm_axis` splits the tensor(`X`) to a " - "matrix [N,H].") + "matrix [N,H]. [default 1].") .SetDefault(1) .AddCustomChecker([](const int &begin_norm_axis) { PADDLE_ENFORCE_GT(begin_norm_axis, 0, @@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker { }); AddComment(R"DOC( -Layer Normalization. -Layer Norm has been implemented as discussed in the paper: -https://arxiv.org/abs/1607.06450 -... +Assume feature vectors exist on dimensions +:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics +along these dimensions for each feature vector :math:`a` with size +:math:`H`, then normalize each feature vector using the corresponding +statistics. After that, apply learnable gain and bias on the normalized +tensor to scale and shift if :attr:`scale` and :attr:`shift` are set. + +Refer to `Layer Normalization `_ )DOC"); } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index a711da362771353891f900f544d97e64510dc0ba..ea1ca7f59db22bee973a8827a88e2fb80265fa51 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -84,6 +84,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details. Equation: + 1. Denote Input(Emission) to this operator as $x$ here. 2. The first D values of Input(Transition) to this operator are for starting weights, denoted as $a$ here. @@ -106,6 +107,7 @@ Finally, the linear chain CRF operator outputs the logarithm of the conditional likelihood of each training sample in a mini-batch. NOTE: + 1. The feature function for a CRF is made up of the emission features and the transition features. The emission feature weights are NOT computed in this operator. They MUST be computed first before this operator is called. diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 4d12278799f66f2fb92b7580ba0c43e845aa4d3a..56e39649b409f7eed108027f6df58c19dd3c8ab8 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -21,14 +21,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -void RunServer(std::shared_ptr service) { +void RunServer(std::shared_ptr service) { service->StartServer(); VLOG(4) << "RunServer thread end"; } @@ -99,19 +99,19 @@ static int64_t GetTimestamp() { void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, - const std::vector &prefetch_block_id_list) const { + const std::vector &prefetch_block_id_list, + const int checkpoint_point_block_id) const { size_t num_blocks = program->Size(); + auto optimize_blocks = + Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, "server program should have at least 2 blocks"); - std::vector optimize_block_id_list; - for (int blkid = 1; blkid < num_blocks; ++blkid) { - if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(), - blkid) == prefetch_block_id_list.end()) { - optimize_block_id_list.push_back(blkid); - } + std::vector optimize_blocks_idx; + for (auto blk : optimize_blocks) { + optimize_blocks_idx.push_back(blk->ID()); } - auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list); + auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx); // Insert placeholder for block0 which holds current op itself. optimize_prepared.insert( optimize_prepared.begin(), @@ -121,12 +121,12 @@ void ListenAndServOp::RunSyncLoop( while (true) { // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. - rpc_service_->SetCond(detail::kRequestSend); - rpc_service_->WaitBarrier(detail::kRequestSend); + rpc_service_->SetCond(distributed::kRequestSend); + rpc_service_->WaitBarrier(distributed::kRequestSend); if (rpc_service_->IsExit()) { LOG(WARNING) << "get exit!rpc_processor break!"; - rpc_service_->SetCond(detail::kRequestGet); + rpc_service_->SetCond(distributed::kRequestGet); break; } @@ -134,14 +134,14 @@ void ListenAndServOp::RunSyncLoop( // and this will still work. // The optimize blocks which have the same parent ID would run parallel // TODO(Yancey1989): need to use ParallelExecutor for future - int32_t last_parent_blkid = program->Block(1).Parent(); + int32_t last_parent_blkid = optimize_blocks[0]->Parent(); std::vector parallel_blkids; - parallel_blkids.push_back(1); + parallel_blkids.push_back(optimize_blocks[0]->ID()); double ts = GetTimestamp(); - for (size_t i = 1; i < optimize_block_id_list.size(); ++i) { + for (size_t i = 1; i < optimize_blocks.size(); ++i) { // skip the first optimize block because it is already in the // parallel_blkids. 
- int blkid = optimize_block_id_list[i]; + int blkid = optimize_blocks[i]->ID(); if (program->Block(blkid).Parent() != last_parent_blkid) { ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); @@ -154,18 +154,18 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - rpc_service_->SetCond(detail::kRequestGet); - rpc_service_->WaitBarrier(detail::kRequestGet); + rpc_service_->SetCond(distributed::kRequestGet); + rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); // reset received sparse vars to avoid reuse it in the next mini-batch - dynamic_cast(request_send_handler_.get()) + dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); } // while(true) } void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, - framework::ProgramDesc *program) const { - VLOG(3) << "RunAsyncLoop in"; + framework::ProgramDesc *program, + framework::Scope *recv_scope) const { // grad name to block id std::unordered_map grad_to_block_id; std::unordered_map id_to_grad; @@ -192,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, block_list.push_back(blkid); } auto optimize_prepared = executor->Prepare(*program, block_list); + // execute global block if needed + if (block_list[0] == 1 && id_to_grad.count(1) == 0) { + executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope); + } std::unordered_map> grad_to_prepared_ctx; @@ -203,10 +207,9 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); - VLOG(3) << "RunAsyncLoop into while"; while (true) { if (rpc_service_->IsExit()) { - LOG(INFO) << "get exit!rpc_processor break!"; + VLOG(4) << "get exit!rpc_processor break!"; break; } @@ -215,19 +218,21 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, } static void FillRequestCtx( - detail::RequestHandler *h, framework::Scope *scope, + distributed::RequestHandler *h, framework::Scope *scope, platform::DeviceContext *dev_ctx, framework::Executor *executor, framework::ProgramDesc *program, std::unordered_map> *prefetch_ctx, - detail::RPCServer *rpc_server) { + std::shared_ptr checkpoint_ctx, + distributed::RPCServer *rpc_server) { h->SetScope(scope); h->SetDevCtx(dev_ctx); h->SetExecutor(executor); h->SetProgram(program); h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetRPCServer(rpc_server); + h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } void ListenAndServOp::RunImpl(const framework::Scope &scope, @@ -243,26 +248,44 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); + int checkpoint_block_id = Attr(kCheckpointBlockId); - LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in - << ", end_point:" << endpoint; + VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in + << ", end_point:" << endpoint + << ", checkpoint_block_id: " << checkpoint_block_id; rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); - request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); - request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); + request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode)); + request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode)); request_prefetch_handler_.reset( - new 
detail::RequestPrefetchHandler(sync_mode)); - - rpc_service_->RegisterRPC(detail::kRequestSend, request_send_handler_.get()); - rpc_service_->RegisterRPC(detail::kRequestGet, request_get_handler_.get()); - rpc_service_->RegisterRPC(detail::kRequestPrefetch, + new distributed::RequestPrefetchHandler(sync_mode)); + request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler( + sync_mode, checkpoint_block_id)); + + rpc_service_->RegisterRPC(distributed::kRequestSend, + request_send_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestGet, + request_get_handler_.get()); + rpc_service_->RegisterRPC(distributed::kRequestPrefetch, request_prefetch_handler_.get()); - - auto *optimize_block = Attr(kOptimizeBlock); - auto *program = optimize_block->Program(); + rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, + request_checkpoint_handler_.get()); + + auto optimize_blocks = + Attr>(kOptimizeBlocks); + PADDLE_ENFORCE(optimize_blocks.size() >= 1, + "optimize blocks should be 1 at least on the pserver side."); + auto *program = optimize_blocks[0]->Program(); framework::Executor executor(dev_place); + std::shared_ptr ckpt_pre_context = nullptr; + if (checkpoint_block_id != -1) { + auto ctx = executor.Prepare(*program, checkpoint_block_id); + // see: https://stackoverflow.com/a/14856553 + ckpt_pre_context = std::move(ctx); + } + // prepare for prefetch std::vector prefetch_block_id_list; std::unordered_map block_id_to_prefetch_var_name; @@ -293,13 +316,15 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; } - auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, - &dev_ctx, &executor, program, - &prefetch_var_name_to_prepared_ctx, rpc_service_.get()); + auto f = + std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, + &executor, program, &prefetch_var_name_to_prepared_ctx, + ckpt_pre_context, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); f(request_prefetch_handler_.get()); + f(request_checkpoint_handler_.get()); // start the server listening after all member initialized. server_thread_.reset(new std::thread(RunServer, rpc_service_)); @@ -313,9 +338,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, // Write to a file of server selected port for python use. 
SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list); + RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, + checkpoint_block_id); } else { - RunAsyncLoop(&executor, program); + RunAsyncLoop(&executor, program, &recv_scope); } } @@ -337,18 +363,23 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { "a map from grad name to it's optimize block id") .SetDefault({}); AddAttr("sync_mode", "if works at sync_mode or not").SetDefault(true); - AddAttr(kOptimizeBlock, - "BlockID to run on server side."); + AddAttr>( + kOptimizeBlocks, "Optimize blocks to run on server side.") + .SetDefault({}); AddAttr>(kPrefetchVarNameToBlockId, "prefetch blocks to run on server side.") .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); + AddAttr(kCheckpointBlockId, + "BolckID to run save checkpoint on pserer.") + .SetDefault(-1); } }; void SignalHandler::StopAndExit(int signal_num) { - VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit"; + // Do not use VLOG here for the device for printing maybe already released. + // exit will release interal allocated resoureces. exit(0); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 46c3a19e20b3f2dd970a672bb99f98e83d3e25bf..978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -24,16 +24,17 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" namespace paddle { namespace operators { -constexpr char kOptimizeBlock[] = "OptimizeBlock"; +constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; +constexpr char kCheckpointBlockId[] = "checkpint_block_id"; -void RunServer(std::shared_ptr service); +void RunServer(std::shared_ptr service); class ListenAndServOp : public framework::OperatorBase { public: @@ -47,10 +48,12 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, - const std::vector& prefetch_block_id_list) const; + const std::vector& prefetch_block_id_list, + const int checkpoint_point_block_id) const; void RunAsyncLoop(framework::Executor* executor, - framework::ProgramDesc* program) const; + framework::ProgramDesc* program, + framework::Scope* recv_scope) const; void SavePort() const; @@ -62,10 +65,13 @@ class ListenAndServOp : public framework::OperatorBase { const platform::Place& dev_place) const override; protected: - mutable std::shared_ptr rpc_service_; - mutable std::shared_ptr request_send_handler_; - mutable std::shared_ptr request_get_handler_; - mutable std::shared_ptr request_prefetch_handler_; + mutable std::shared_ptr rpc_service_; + mutable std::shared_ptr request_send_handler_; + mutable std::shared_ptr request_get_handler_; + mutable std::shared_ptr + request_prefetch_handler_; + mutable std::shared_ptr + request_checkpoint_handler_; mutable std::shared_ptr server_thread_; }; diff --git 
a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 8f4b5049271c9592d2db268ea7ff2f5c8abc28b6..ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -34,6 +34,8 @@ class LoadOp : public framework::OperatorBase { auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); platform::RecordEvent record_event(Type(), dev_ctx); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. auto filename = Attr("file_path"); std::ifstream fin(filename); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", @@ -44,9 +46,25 @@ class LoadOp : public framework::OperatorBase { PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", out_var_name); - auto *tensor = out_var->GetMutable(); + if (out_var->IsType()) { + LoadLodTensor(fin, place, out_var); + } else if (out_var->IsType()) { + LoadSelectedRows(fin, place, out_var); + } else { + PADDLE_ENFORCE( + false, + "Load only support LoDTensor and SelectedRows, %s has wrong type", + out_var_name); + } + } - DeserializeFromStream(fin, tensor, *dev_ctx); + void LoadLodTensor(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + auto *tensor = var->GetMutable(); + DeserializeFromStream(fin, tensor, dev_ctx); auto load_as_fp16 = Attr("load_as_fp16"); auto in_dtype = framework::ToDataType(tensor->type()); @@ -63,18 +81,27 @@ class LoadOp : public framework::OperatorBase { &fp16_tensor); // reset output tensor - out_var->Clear(); - tensor = out_var->GetMutable(); + var->Clear(); + tensor = var->GetMutable(); tensor->set_lod(fp16_tensor.lod()); tensor->ShareDataWith(fp16_tensor); } } + + void LoadSelectedRows(std::istream &fin, const platform::Place &place, + framework::Variable *var) const { + auto *selectedRows = var->GetMutable(); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + } }; class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "The tensor need to be loaded"); + AddOutput("Out", "The LoDTensor / SelectedRows need to be loaded"); AddAttr( "load_as_fp16", "If true, the tensor will be first loaded and then " @@ -85,7 +112,9 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment("Load operator will load a tensor variable from disk file."); + AddComment( + "Load operator will load a LoDTensor / SelectedRows variable from disk " + "file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc index db109f5cd053d84718ac85bd4693ecece12ce172..26970db8d2af62bb06fce4eb1a1f21fd41617bd1 100644 --- a/paddle/fluid/operators/logical_op.cc +++ b/paddle/fluid/operators/logical_op.cc @@ -146,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, - "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); + "$$Out = (X || Y) \\&\\& 
!(X \\&\\& Y)$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 4751e3e8025e51a687f8fcfd25e603b61e762f6d..3225bf9bb63d57969ce9ae0e4a74e8f466c8c2d0 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -184,34 +184,32 @@ Long-Short Term Memory (LSTM) Operator. The defalut implementation is diagonal/peephole connection (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: -$$ -i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\ +$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$ -f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\ +$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$ -\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\ +$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$ -o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\ +$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$ -c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ -h_t = o_t \odot act_h(c_t) -$$ +$$ h_t = o_t \\odot act_h(c_t) $$ -where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix -of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ -are diagonal weight matrices for peephole connections. In our implementation, -we use vectors to reprenset these diagonal weight matrices. The b terms -denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ -is the non-line activations, such as logistic sigmoid function, and -$i, f, o$ and $c$ are the input gate, forget gate, output gate, -and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector $h$. - -The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ -are the cell input and cell output activation functions and `tanh` is usually -used for them. $\tilde{c_t}$ is also called candidate hidden state, -which is computed based on the current input and the previous hidden state. +- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix + of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ + are diagonal weight matrices for peephole connections. In our implementation, + we use vectors to reprenset these diagonal weight matrices. +- The b terms denote bias vectors ($b_i$ is the input gate bias vector). +- $\sigma$ is the non-line activations, such as logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- $act_g$ and $act_h$ are the cell input and cell output activation functions + and `tanh` is usually used for them. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. Set `use_peepholes` False to disable peephole connection. 
The formula is omitted here, please refer to the paper diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index bc788ef5e9f84c00bb7abb65997ad68182efec62..d2b772d11379c218be77277b89f3ded7b59ab9f3 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -55,13 +55,13 @@ math_library(matrix_bit_code) math_library(unpooling) math_library(vol2col) -cc_test(math_function_test SRCS math_function_test.cc) +cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) if(WITH_GPU) - nv_test(math_function_gpu_test SRCS math_function_test.cu) - nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) + nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) + nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor math_function) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 6207d14ecdc922cbca2d05d20e4b8a9da9b9d627..9f6c1e5c35f02cd4bc729eea78b17fac017aa90e 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -18,49 +18,17 @@ #include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_MKLML -#include -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS #include -#ifdef LAPACK_FOUND -#include -#endif -#endif - -#ifndef LAPACK_FOUND -extern "C" { -#include // NOLINT -int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda, - int* ipiv); -int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda, - int* ipiv); -int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda, - const int* ipiv); -int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, - const int* ipiv); -} #endif namespace paddle { namespace operators { namespace math { -static void SetNumThreads(int num_threads) { -#ifdef PADDLE_USE_OPENBLAS - int real_num_threads = num_threads > 1 ? num_threads : 1; - openblas_set_num_threads(real_num_threads); -#elif defined(PADDLE_WITH_MKLML) - int real_num_threads = num_threads > 1 ? num_threads : 1; - mkl_set_num_threads(real_num_threads); -#else - PADDLE_ENFORCE(false, "To be implemented."); -#endif -} - /** * Matrix Descriptor of a memory buffer. * diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index ae20406bc21d5e08359be8295cd98495dda7813b..2ce94cfc93823aa891114ef8fd1e851727ebc623 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,61 +22,109 @@ namespace math { template struct CBlas; +#ifdef PADDLE_WITH_MKLML template <> struct CBlas { template static void GEMM(ARGS... args) { - cblas_sgemm(args...); + platform::dynload::cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_saxpy(args...); + platform::dynload::cblas_saxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_scopy(args...); + } + + template + static void GEMV(ARGS... 
args) { + platform::dynload::cblas_sgemv(args...); + } + + template + static void GEMM_BATCH(ARGS... args) { + platform::dynload::cblas_sgemm_batch(args...); } -#ifdef PADDLE_WITH_MKLML template static void VADD(ARGS... args) { - vsAdd(args...); + platform::dynload::vsAdd(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + platform::dynload::cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + platform::dynload::cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { - cblas_scopy(args...); + platform::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - cblas_sgemv(args...); + platform::dynload::cblas_dgemv(args...); } -#ifdef PADDLE_WITH_MKLML template static void GEMM_BATCH(ARGS... args) { - cblas_sgemm_batch(args...); + platform::dynload::cblas_dgemm_batch(args...); + } + + template + static void VADD(ARGS... args) { + platform::dynload::vdAdd(args...); } -#endif }; +#else + template <> -struct CBlas { +struct CBlas { template static void GEMM(ARGS... args) { - cblas_dgemm(args...); + cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_daxpy(args...); + cblas_saxpy(args...); } -#ifdef PADDLE_WITH_MKLML template - static void VADD(ARGS... args) { - vdAdd(args...); + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_sgemv(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { @@ -87,15 +135,8 @@ struct CBlas { static void GEMV(ARGS... args) { cblas_dgemv(args...); } - -#ifdef PADDLE_WITH_MKLML - template - static void GEMM_BATCH(ARGS... args) { - cblas_dgemm_batch(args...); - } -#endif }; - +#endif template <> struct CBlas { static void GEMM(...) 
{ PADDLE_THROW("float16 GEMM not supported on CPU"); } diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc index cc69212466b72f3fa82e8f5f58b4f3229dab28ec..55c8a472aca7fe700ef6a3f96bed1496d7b12b80 100644 --- a/paddle/fluid/operators/math/concat.cc +++ b/paddle/fluid/operators/math/concat.cc @@ -70,21 +70,23 @@ template class ConcatGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const int axis, - std::vector* outputs) { + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking - int num = outputs->size(); + size_t num = outputs->size(); int input_rows = 1; - auto dim_0 = outputs->at(0).dims(); + auto dim_0 = ref_inputs[0]->dims(); for (int i = 0; i < axis; ++i) { input_rows *= dim_0[i]; } + int input_cols = 0; std::vector output_cols(outputs->size()); - for (int i = 0; i < num; ++i) { - int t_cols = outputs->at(i).numel() / input_rows; + for (size_t i = 0; i < num; ++i) { + int t_cols = ref_inputs[i]->numel() / input_rows; input_cols += t_cols; output_cols[i] = t_cols; } @@ -94,11 +96,14 @@ class ConcatGradFunctor { for (int k = 0; k < input_rows; ++k) { const T* src_ptr = input.data() + k * input_cols; int col_idx = 0; - for (int j = 0; j < num; ++j) { + for (size_t j = 0; j < num; ++j) { int col_len = output_cols[j]; - T* dst_ptr = outputs->at(j).data() + k * col_len; - memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, - sizeof(T) * col_len); + auto* out_tensor = outputs->at(j); + if (out_tensor != nullptr) { + T* dst_ptr = out_tensor->data() + k * col_len; + memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, + sizeof(T) * col_len); + } col_idx += col_len; } } diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 4285d38dcd6a4124543cdd2246c82a8203f5a281..5863d74fca21de8b77bc208fb95d8fd52562f7a7 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -22,43 +22,24 @@ namespace paddle { namespace operators { namespace math { -template -__device__ T upper_bound(const T* first, T count, T val) { - const T* orig = first; - const T* it = nullptr; - T step = 0; - while (count > 0) { - it = first; - step = count / 2; - it += step; - if (!(val < *it)) { - first = ++it; - count -= step + 1; - } else { - count = step; - } - } - return first - orig; -} - template __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, const int output_rows, const int output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(input_cols, col_size, tid_x) - 1; - - int curr_offset = input_cols[segment]; - int curr_segment = segment; + int curr_segment = 0; + int curr_offset = input_cols[0]; for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { + int curr_col_offset = input_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { curr_offset = curr_col_offset; ++curr_segment; + curr_col_offset = input_cols[curr_segment + 1]; } int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; + T* input_ptr = inputs[curr_segment]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) @@ -89,23 +70,25 @@ __global__ void KernelConcatGrad(const T* input_data, const int 
in_row, const int in_col, const int* out_cols, int out_cols_size, T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(out_cols, out_cols_size, tid_x) - 1; - int curr_offset = out_cols[segment]; - int curr_segment = segment; + int curr_segment = 0; + int curr_offset = out_cols[0]; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { curr_offset = curr_col_offset; ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; } int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; T* output_ptr = outputs_data[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input_data[tid_y * in_col + tid_x]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } } } @@ -118,10 +101,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, int split = tid_x / fixed_out_col; int in_offset = tid_x - split * fixed_out_col; T* output_ptr = outputs_data[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * fixed_out_col + in_offset] = - input_data[tid_y * in_col + tid_x]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } } } @@ -203,17 +188,18 @@ template class ConcatGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const int axis, - std::vector* outputs) { + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking int o_num = outputs->size(); int out_row = 1; - auto dim_0 = outputs->at(0).dims(); + auto dim_0 = ref_inputs[0]->dims(); for (int i = 0; i < axis; ++i) { out_row *= dim_0[i]; } - int out_col = outputs->at(0).numel() / out_row; + int out0_col = ref_inputs[0]->numel() / out_row; int in_col = 0, in_row = out_row; bool sameShape = true; @@ -223,13 +209,17 @@ class ConcatGradFunctor { outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { - int t_col = outputs->at(i).numel() / out_row; + int t_col = ref_inputs.at(i)->numel() / out_row; if (sameShape) { - if (t_col != out_col) sameShape = false; + if (t_col != out0_col) sameShape = false; } in_col += t_col; outputs_cols[i + 1] = in_col; - outputs_ptr[i] = outputs->at(i).data(); + if (outputs->at(i) != nullptr) { + outputs_ptr[i] = outputs->at(i)->data(); + } else { + outputs_ptr[i] = nullptr; + } } T** dev_out_gpu_data = @@ -255,7 +245,7 @@ class ConcatGradFunctor { if (sameShape) { KernelConcatGrad<<>>( - input.data(), in_row, in_col, out_col, dev_out_gpu_data); + input.data(), in_row, in_col, out0_col, dev_out_gpu_data); } else { const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); KernelConcatGrad<<>>( diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index 
041ce8bf8a2e9528a004c076ead4471a3837c1a6..9e080f2e8be23768dcea47b577043beef37b2eaf 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -57,7 +57,8 @@ template class ConcatGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, - const int axis, std::vector* outputs); + const std::vector& ref_inputs, + const int axis, std::vector* outputs); }; } // namespace math diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc index b95109d3f73505fa6b5438326804a2b348fb3668..5641f914523771f47bd7f814bfd39964a53deefc 100644 --- a/paddle/fluid/operators/math/detail/avx_functions.cc +++ b/paddle/fluid/operators/math/detail/avx_functions.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/detail/activation_functions.h" // TODO(qingqing) refine this dependence -#include "paddle/cuda/src/avx_mathfun.h" +#include "paddle/legacy/cuda/src/avx_mathfun.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index d39154c6f88d6d17c1719eb9a5b048211f4bb52b..c3387be6daa3bd34a6e3410ced23fce5d65f2cf7 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -30,6 +30,7 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS #include -#ifdef LAPACK_FOUND -#include -#endif -#endif - -#ifndef LAPACK_FOUND -extern "C" { -#include // NOLINT -int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda, - int* ipiv); -int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda, - int* ipiv); -int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda, - const int* ipiv); -int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, - const int* ipiv); -} #endif #include diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 4881cff4a368ffae9b030f04b7fff01d6ee7d26e..9e0bebd17c02a3ce010b77142757b8789cfbcdd9 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel { class MeanOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "The input of mean op"); - AddOutput("Out", "The output of mean op").Reuse("X"); + AddInput("X", "(Tensor) The input of mean op"); + AddOutput("Out", "(Tensor) The output of mean op").Reuse("X"); AddComment(R"DOC( -Mean Operator. - -Out is a scalar which is the mean of all elements in X. +Mean Operator calculates the mean of all elements in X. 
)DOC"); } diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index a16861b3b77fc980ab932b9d88859b38ec36108b..2dc1467b0d4816d5cc0535eb62e936cf342a241c 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase { scope.FindVar(Output("Out"))->GetMutable(); auto level = static_cast(Attr("level")); - auto &mask_dim = mask.dims(); + PADDLE_ENFORCE(in_true.numel() || in_false.numel(), + "Input(InTrue) or Input(InFalse) should be initialized."); + auto &mask_dim = mask.dims(); std::unique_ptr cpu_mask{new framework::LoDTensor()}; if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); @@ -59,19 +61,27 @@ class MergeLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - int rank = in_true.dims().size(); - platform::Place place = in_true.place(); - std::type_index data_type = in_true.type(); - framework::DDim in_true_dims = - framework::slice_ddim(in_true.dims(), 1, rank); - + platform::Place place = dev_place; int64_t batch_size = in_true.dims()[0] + in_false.dims()[0]; - auto in_true_dim_vec = framework::vectorize(in_true_dims); - in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size); + std::type_index data_type = + in_true.IsInitialized() ? in_true.type() : in_false.type(); + int rank; + framework::DDim in_dims; + if (in_true.IsInitialized()) { + rank = in_true.dims().size(); + in_dims = framework::slice_ddim(in_true.dims(), 1, rank); + } else { + rank = in_false.dims().size(); + in_dims = framework::slice_ddim(in_false.dims(), 1, rank); + } + + auto in_dim_vec = framework::vectorize(in_dims); + in_dim_vec.insert(in_dim_vec.begin(), batch_size); - framework::DDim out_dims = framework::make_ddim(in_true_dim_vec); + framework::DDim out_dims = framework::make_ddim(in_dim_vec); out->Resize(out_dims); + out->mutable_data(place, data_type); auto *out_lod = out->mutable_lod(); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index a4363fd25d57edb5c2509904a1f55634832613be..18ad46cb5eeeab2169136e40cebdaa53c0bfd587 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel { class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Ids", "The index tensor of multiplex operator."); - AddInput("X", "The candidate tensors of multiplex operator.") + AddInput("Ids", + "Tensor, index variable which is a 2-D tensor with shape " + "[M, 1] where M is the batch size."); + AddInput("X", + "A list of variables to gather from. All variables have the same " + "shape and the rank is at least 2.") .AsDuplicable(); AddOutput("Out", "The output tensor of multiplex operator."); AddComment(R"DOC( -Multiplex Operator. - -Multiplex multiple tensors according to the index provided by the index tensor. - -Ids: the index tensor. -X[0 : N - 1]: the candidate tensors for output (N >= 2). -For each index i from 0 to batchSize - 1, the output is the i-th row of the +Referring to the given index variable, this layer selects rows from the +input variables to construct a multiplex variable. Assuming that there are +:math:`m` input variables and :math:`I_i` represents the i-th input +variable and :math:`i` is in [0, :math:`m`). 
All input variables are +tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. +Please note that rank of the input tensor should be at least 2. Each input +variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`] +where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2` +* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input +variable. The given index variable should be a 2-D tensor with shape +[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable. +Then the output variable will be a tensor with shape [:math:`d_0`, +:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D +matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th +row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`. + +* Ids: the index tensor. + +* X[0 : N - 1]: the candidate tensors for output (N >= 2). + +* For each index i from 0 to batchSize - 1, the output is the i-th row of the the (Ids[i])-th tensor. For i-th row of the output tensor: -$$y[i] = x_{k}[i]$$ +$$ +y[i] = x_{k}[i] +$$ -where `y` is the output tensor, `x_{k}` is the k-th input tensor, -and `k = Ids[i]`. +where $y$ is the output tensor, $x_{k}$ is the k-th input tensor, +and $k = Ids[i]$. )DOC"); } diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc index ef54d79fdf2becde98c68044d14bd4347773b975..d5fb7a12e5d9757f3e639f6de7f0129bd531e2a1 100644 --- a/paddle/fluid/operators/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl_op_test.cu.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" @@ -27,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" USE_NO_KERNEL_OP(ncclInit); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 06092e680a1efbef379ccf40fdf476769f820429..e471f04662a1fa3e8e77a2db37f0da4521682018 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -128,8 +128,10 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "user should avoid setting this attribute.") .SetDefault({}); AddComment(R"DOC( -Compute and return the noise-contrastive estimation training loss. -See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +Compute and return the noise-contrastive estimation training loss. See +`Noise-contrastive estimation: A new estimation principle for unnormalized +statistical models + `_. By default this operator uses a uniform distribution for sampling. 
)DOC"); } diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 1012640d5e2052e4f347ad458cea9072a004f334..c9744db3d0654ef63357963d9a9a3cb946f56e2d 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, - framework::AttributeMap{}); + framework::AttributeMap{{"use_mkldnn", {false}}}); VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); sum_op->Run(*sub_scopes[0], places[0]); WaitOnPlace(places[0]); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 6707cdded4020fe3e2b01ba399dfc279a9da677d..f8ad63690e84339da0390d4ddd2db45f25db385a 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -204,8 +204,6 @@ void Pool2dOpMaker::Make() { // TODO(dzhwinter): need to registered layout transform function AddComment(R"DOC( -Pool2d Operator. - The pooling2d operation calculates the output based on the input, pooling_type and ksize, strides, paddings parameters. Input(X) and output(Out) are in NCHW format, where N is batch size, C is the @@ -215,19 +213,28 @@ These two elements represent height and width, respectively. The input(X) size and output(Out) size may be different. Example: + Input: + X shape: $(N, C, H_{in}, W_{in})$ + Output: + Out shape: $(N, C, H_{out}, W_{out})$ + For ceil_mode = false: $$ - H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 + H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ For ceil_mode = true: $$ - H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\ - W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 + H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 + $$ + $$ + W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 $$ )DOC"); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index f20f33bbeb19766d6974ea17b155cac363c01fb2..db0a1002f47944c5d926fb5a51b84536dcf446b8 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/utils/Logging.h" +#include "paddle/legacy/utils/Logging.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index f71ba84b318c1f8b0604310f3db8a0826124e207..8734282fe496b8e90af19abd5549566d62316fc3 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -41,8 +41,8 @@ class PrefetchOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index db7634918a5179a61304315ecd08350d23fb4642..cceac402951ae6bf3fe0b4c96af5b7ce9ca1ba0e 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" namespace paddle { @@ -62,7 +63,7 @@ struct Formater { } } void PrintDtype() { - if (dtype.hash_code() != typeid(const char).hash_code()) { + if (!framework::IsType(dtype)) { CLOG << "\tdtype: " << dtype.name() << std::endl; } } @@ -83,15 +84,15 @@ struct Formater { void PrintData(size_t size) { PADDLE_ENFORCE_NOT_NULL(data); // print float - if (dtype.hash_code() == typeid(const float).hash_code()) { + if (framework::IsType(dtype)) { Display(size); - } else if (dtype.hash_code() == typeid(const double).hash_code()) { + } else if (framework::IsType(dtype)) { Display(size); - } else if (dtype.hash_code() == typeid(const int).hash_code()) { + } else if (framework::IsType(dtype)) { Display(size); - } else if (dtype.hash_code() == typeid(const int64_t).hash_code()) { + } else if (framework::IsType(dtype)) { Display(size); - } else if (dtype.hash_code() == typeid(const bool).hash_code()) { + } else if (framework::IsType(dtype)) { Display(size); } else { CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl; diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index 528a6e4a1b68fe611d104f21bafe970762611a03..123fa44fa3ddbc9343b9629be63fdefdf12b4646 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("SeedOut", "The random seed after random cropping.") .AsIntermediate(); AddAttr>("shape", "The shape of a cropped instance."); + AddAttr("startup_seed", + "If the input 'Seed' is not initialized, the 'startup_seed' " + "will be used to replace it. Even so, the seed after random " + "crop will also be outputed to the 'SeedOut'.") + .SetDefault(0); AddComment(R"DOC( This operator takes a batch of instance, and do random cropping on each instance. 
It means that cropping positions differs on each instance, which is determined @@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { class RandomCropOpInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext* ctx) const override { - auto seed_dim = ctx->GetInputDim("Seed"); - PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1); auto shape = ctx->Attrs().Get>("shape"); auto x_dim = ctx->GetInputDim("X"); PADDLE_ENFORCE_GT(x_dim.size(), static_cast(shape.size())); @@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase { out_dim[x_i] = shape[shape_i]; } ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); - ctx->SetOutputDim("SeedOut", framework::make_ddim({1})); } }; diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index f3261cbdc986b0cc724315c1eb92b8b84e18c742..d68ba9d661698bb0d33b139f5748daec2ead6595 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -142,16 +142,22 @@ template class RandomCropKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& seed_tensor = detail::Ref(ctx.Input("Seed")); int64_t seed = 0; - if (platform::is_cpu_place(seed_tensor.place())) { - seed = *seed_tensor.data(); + auto& seed_tensor = detail::Ref(ctx.Input("Seed")); + if (seed_tensor.IsInitialized()) { + if (platform::is_cpu_place(seed_tensor.place())) { + seed = *seed_tensor.data(); + } else { + LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify " + "your program"; + framework::LoDTensor cpu_seed; + framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); + seed = *cpu_seed.data(); + } } else { - LOG(WARNING) << "It is slow to place seed in GPU memory. 
Please verify " - "your program"; - framework::LoDTensor cpu_seed; - framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); - seed = *cpu_seed.data(); + VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " + "'startup_seed' instead."; + seed = ctx.Attr("startup_seed"); } auto shape = ctx.Attr>("shape"); auto& x = detail::Ref(ctx.Input("X")); @@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel { engine.discard(functor.prod_batchsize_dims_ * (functor.rank_ - functor.num_batchsize_dims_)); *ctx.Output("SeedOut")->mutable_data( - platform::CPUPlace()) = engine(); + framework::make_ddim({1}), platform::CPUPlace()) = engine(); } }; diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 72a27d43584d55cd0859c63577ae85ff0f5fdfa8..65fcce8bb019965a805ad09d50be0aba64e4f24e 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -66,9 +66,19 @@ class ReadOp : public framework::OperatorBase { std::vector out_arg_names = Outputs("Out"); std::vector ins; reader->ReadNext(&ins); - PADDLE_ENFORCE(!ins.empty(), "There is no next data."); + if (ins.empty()) { + if (Attr("throw_eof_exp")) { + PADDLE_THROW_EOF(); + } else { + ins.resize(out_arg_names.size()); + for (auto& tensor : ins) { + // data type is not important for subsequent DataBalanceOpHandle + tensor.mutable_data(framework::make_ddim({0}), dev_place); + } + } + } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); - for (size_t i = 0; i < ins.size(); ++i) { + for (size_t i = 0; i < out_arg_names.size(); ++i) { auto* out = scope.FindVar(out_arg_names[i])->GetMutable(); out->ShareDataWith(ins[i]); @@ -82,6 +92,14 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Reader", "(ReaderHolder) The executed reader."); AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable(); + AddAttr( + "throw_eof_exp", + "If set true, an exception will be thrown when the Reader " + "yields empty (which means there is no next data).\n" + "NOTES: This flag must be true always. 
It will be set to false" + " only when the data-balance is enabled in ParallelExecutor" + " and it is set by ParallelExecutor instance, not users.") + .SetDefault(true); AddComment(R"DOC( Read Operator diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 62532036f86bfb82465ccd9e0ec526299489932a..9dbcc35e6f5bb01c159980a49dd4b4c9d37d2aab 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -22,8 +22,8 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) -reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc) reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc) +reader_library(create_py_reader_op SRCS create_py_reader_op.cc) cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc) # Export local libraries to parent diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 71684b14176edc8f71efbefa9a7decffc8f3011e..db8cf3b605c9175eeda4548b1e7c8203f26c5d89 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -88,24 +88,29 @@ class BlockingQueue { receive_cv_.notify_all(); } - bool IsClosed() { + bool IsClosed() const { std::lock_guard lock(mutex_); return closed_; } - size_t Cap() { + size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; } + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + private: size_t capacity_; bool closed_; std::deque queue_; - std::mutex mutex_; - std::condition_variable receive_cv_; - std::condition_variable send_cv_; + mutable std::mutex mutex_; + mutable std::condition_variable receive_cv_; + mutable std::condition_variable send_cv_; }; } // namespace reader } // namespace operators diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index ecbae3894d551186f53625a6cc9cfdb36adc8d2d..1dbafd23e92732bdaf0d263a01e267227786d839 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -20,15 +20,19 @@ namespace reader { class BatchReader : public framework::DecoratedReader { public: - BatchReader(const std::shared_ptr& reader, int batch_size) - : DecoratedReader(reader), batch_size_(batch_size) { + BatchReader(const std::shared_ptr& reader, int batch_size, + bool discard_leftover) + : DecoratedReader(reader), + batch_size_(batch_size), + discard_leftover_(discard_leftover) { buffer_.reserve(batch_size_); } - void ReadNext(std::vector* out) override; + void ReadNextImpl(std::vector* out) override; private: int batch_size_; + bool discard_leftover_; std::vector> buffer_; }; @@ -46,8 +50,9 @@ class CreateBatchReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - out->Reset( - new BatchReader(underlying_reader.Get(), Attr("batch_size"))); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, Attr("batch_size"), + Attr("discard_leftover"))); } }; @@ -57,6 +62,10 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { 
AddAttr("batch_size", "How many instances the batch reader yields each time.") .GreaterThan(0); + AddAttr("discard_leftover", + "If true, the leftover instances that are not enough for a " + "new batch will be discarded.") + .SetDefault(true); AddComment(R"DOC( CreateBatchReader Operator @@ -66,7 +75,7 @@ class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { } }; -void BatchReader::ReadNext(std::vector* out) { +void BatchReader::ReadNextImpl(std::vector* out) { buffer_.clear(); buffer_.reserve(batch_size_); for (int i = 0; i < batch_size_; ++i) { @@ -77,6 +86,9 @@ void BatchReader::ReadNext(std::vector* out) { break; } } + if (discard_leftover_ && buffer_.size() < batch_size_) { + buffer_.clear(); + } // Concat instances out->clear(); if (buffer_.empty()) { diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 0a02fcdeaa5a6de97d59ddce4f58ad945aa2572a..85394b336fc967fc6973131fbedda4c796825185 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -33,12 +33,13 @@ class CustomReader : public framework::DecoratedReader { source_var_names_(source_var_names), sink_var_names_(sink_var_names) {} - void ReadNext(std::vector* out) override; + void ReadNextImpl(std::vector* out) override; private: const framework::ProgramDesc program_; int sub_block_id_; framework::Executor exe_; + framework::Scope scope_; std::vector source_var_names_; std::vector sink_var_names_; @@ -59,10 +60,10 @@ class CreateCustomReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - out->Reset( - new CustomReader(underlying_reader.Get(), *sub_block, - Attr>("source_var_names"), - Attr>("sink_var_names"))); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, *sub_block, + Attr>("source_var_names"), + Attr>("sink_var_names"))); } }; @@ -142,7 +143,7 @@ class CustomReaderInferVarType : public framework::VarTypeInference { } }; -void CustomReader::ReadNext(std::vector* out) { +void CustomReader::ReadNextImpl(std::vector* out) { out->clear(); std::vector underlying_outs; reader_->ReadNext(&underlying_outs); @@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector* out) { // The scope for CustomReader's sub-block should be independent and shouldn't // be any other computation scope's child. Otherwise, data preprocessing and // compution cannot be concurrent. - framework::Scope scope; + framework::Scope* exe_scope = &scope_.NewScope(); // 1. Copy LoDTensors from underlying reader's output to source variables. for (size_t i = 0; i < source_var_names_.size(); ++i) { - framework::Variable* var = scope.Var(source_var_names_[i]); + framework::Variable* var = exe_scope->Var(source_var_names_[i]); framework::LoDTensor* tensor = var->GetMutable(); tensor->ShareDataWith(underlying_outs[i]); tensor->set_lod(underlying_outs[i].lod()); } // 2. Run the sub-block. - exe_.Run(program_, &scope, sub_block_id_, false, true); + exe_.Run(program_, exe_scope, sub_block_id_, false, true); // 3. Copy LoDTensors from sink variables to out. 
out->resize(sink_var_names_.size()); for (size_t i = 0; i < sink_var_names_.size(); ++i) { - const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i])) + const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i])) .Get(); framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]); } + scope_.DeleteScope(exe_scope); } } // namespace reader diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 5f35b9b3eac1d9aab8662833c6e39d12f11a0087..7b14370f4fd64e8fd5b8d9038006494b88d671dc 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -50,12 +50,21 @@ class DoubleBufferReader : public framework::DecoratedReader { StartPrefetcher(); } - void ReadNext(std::vector* out) override; - void ReInit() override; + void ReadNextImpl(std::vector* out) override; ~DoubleBufferReader() { EndPrefetcher(); } private: + void ShutdownImpl() override { + EndPrefetcher(); + reader_->Shutdown(); + } + + void StartImpl() override { + reader_->Start(); + StartPrefetcher(); + } + void StartPrefetcher() { channel_ = new reader::BlockingQueue(kChannelSize); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); @@ -109,7 +118,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { place = platform::CUDAPlace(static_cast(num)); } - out->Reset(new DoubleBufferReader(underlying_reader.Get(), place)); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, place)); } }; @@ -136,7 +146,7 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; -void DoubleBufferReader::ReadNext(std::vector* out) { +void DoubleBufferReader::ReadNextImpl(std::vector* out) { size_t cached_tensor_id; if (channel_->Receive(&cached_tensor_id)) { if (platform::is_gpu_place(place_)) { @@ -150,12 +160,6 @@ void DoubleBufferReader::ReadNext(std::vector* out) { } } -void DoubleBufferReader::ReInit() { - reader_->ReInit(); - EndPrefetcher(); - StartPrefetcher(); -} - void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; size_t cached_tensor_id = 0; diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 19b54110b9aeece33b8d6c73612ae0e12dbfafbd..0a225597d34f43c7fb82aeae2552cdf16c8ba566 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -24,23 +24,22 @@ class MultiPassReader : public framework::DecoratedReader { MultiPassReader(const std::shared_ptr& reader, int pass_num) : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} - void ReadNext(std::vector* out) override { + void ReadNextImpl(std::vector* out) override { reader_->ReadNext(out); - if (out->empty()) { + if (out->empty() && pass_count_ < pass_num_ - 1) { + reader_->Shutdown(); + reader_->Start(); + reader_->ReadNext(out); ++pass_count_; - if (pass_count_ < pass_num_) { - reader_->ReInit(); - reader_->ReadNext(out); - } } } - void ReInit() override { + private: + void StartImpl() override { pass_count_ = 0; - reader_->ReInit(); + reader_->Start(); } - private: int pass_num_; mutable int pass_count_; }; @@ -60,7 +59,8 @@ class CreateMultiPassReaderOp : public framework::OperatorBase { const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); int pass_num = Attr("pass_num"); - 
out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num)); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, pass_num)); } }; diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d41124279930e92138e7e6a5ab045659a415eb6d --- /dev/null +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class PyReader : public framework::FileReader { + public: + explicit PyReader(const std::shared_ptr& queue) + : framework::FileReader() { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; + } + + void ReadNextImpl(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + private: + void ShutdownImpl() override { /* TODO */ + } + + void StartImpl() override { /* TODO */ + } + + std::shared_ptr queue_; +}; + +class CreatePyReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE_NOT_NULL( + queue_holder_var, + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name); + auto* queue_holder = + queue_holder_var->template GetMutable(); + + out->Reset(std::make_shared(queue_holder->GetQueue())); + } +}; + +class CreatePyReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddInput("blocking_queue", + "Name of the `LoDTensorBlockingQueueHolder` variable"); + + AddComment(R"DOC( + Create PyReader to support LoDTensor data feeding in Python side. 
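(Editorial note: to make the feeding flow concrete, here is a hedged, self-contained sketch of the producer/consumer pattern this reader relies on: one thread pushes batches into a queue, the reader pops them, and a pop that fails after Close() signals "no next data". SimpleBlockingQueue is a hypothetical stand-in, not the LoDTensorBlockingQueue added elsewhere in this patch.)

#include <condition_variable>
#include <deque>
#include <mutex>
#include <utility>

template <typename T>
class SimpleBlockingQueue {
 public:
  void Push(T item) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      buf_.push_back(std::move(item));
    }
    cv_.notify_one();
  }

  // Returns false only after the queue has been closed and fully drained.
  bool Pop(T* item) {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return closed_ || !buf_.empty(); });
    if (buf_.empty()) return false;
    *item = std::move(buf_.front());
    buf_.pop_front();
    return true;
  }

  void Close() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      closed_ = true;
    }
    cv_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<T> buf_;
  bool closed_ = false;
};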
+ )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp, + reader::CreatePyReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc index 5b7e8a063a034f0be056065826fca0fe807bc9a7..e5c116dfcd71ef40597ca19d1da0b51038baaad1 100644 --- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -19,11 +19,11 @@ namespace operators { namespace reader { template -class RandomDataGenerator : public framework::ReaderBase { +class RandomDataGenerator : public framework::FileReader { public: RandomDataGenerator(const std::vector& shapes, float low, float high) - : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) { + : framework::FileReader(), low_(low), high_(high), shapes_(shapes) { PADDLE_ENFORCE_LE(low, high, "'low' shouldn't be greater than 'high'.(%f vs %f)", low, high); @@ -32,7 +32,7 @@ class RandomDataGenerator : public framework::ReaderBase { dist_ = std::uniform_real_distribution(low_, high_); } - void ReadNext(std::vector* out) override { + void ReadNextImpl(std::vector* out) override { out->clear(); out->reserve(shapes_.size()); for (const framework::DDim& shape : shapes_) { @@ -51,8 +51,6 @@ class RandomDataGenerator : public framework::ReaderBase { } } - void ReInit() override { return; } - private: float low_; float high_; @@ -79,8 +77,8 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase { std::vector shapes = RestoreShapes(shape_concat, ranks); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new RandomDataGenerator(shapes, Attr("low"), - Attr("high"))); + out->Reset(std::make_shared>( + shapes, Attr("low"), Attr("high"))); } }; diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 282ec3f36b98e7aa62d71fb04f72721a5464e21c..b32f09b22524c8b67ce57cc6022ef46efc2e828d 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -21,10 +21,8 @@ namespace reader { template class RecordIOFileReader : public framework::FileReader { public: - explicit RecordIOFileReader(const std::string& filename, - const std::vector& dims) - : FileReader(dims), - scanner_(filename), + explicit RecordIOFileReader(const std::string& filename) + : scanner_(filename), dev_ctx_(*platform::DeviceContextPool::Instance().Get( platform::CPUPlace())) { if (ThreadSafe) { @@ -33,8 +31,6 @@ class RecordIOFileReader : public framework::FileReader { LOG(INFO) << "Creating file reader" << filename; } - void ReInit() override { scanner_.Reset(); } - protected: void ReadNextImpl(std::vector* out) override { if (ThreadSafe) { @@ -45,6 +41,8 @@ class RecordIOFileReader : public framework::FileReader { } } + void StartImpl() override { scanner_.Reset(); } + private: std::unique_ptr mutex_; recordio::Scanner scanner_; @@ -58,31 +56,26 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - const auto& shape_concat = Attr>("shape_concat"); - const auto& ranks = Attr>("ranks"); - PADDLE_ENFORCE(!shape_concat.empty() && 
!ranks.empty()); - PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - static_cast(shape_concat.size()), - "The accumulate of all ranks should be equal to the " - "shape concat's length."); std::string filename = Attr("filename"); - auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new RecordIOFileReader( - filename, RestoreShapes(shape_concat, ranks))); + out->Reset(std::make_shared>(filename)); } }; class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { protected: void Apply() override { - AddAttr("filename", "The filename of record io reader"); + AddAttr( + "filename", + "The filename of record file. This file will given to reader."); AddComment(R"DOC( - CreateRecordIOReader Operator +Open a recordio file and return the reader object. The returned reader object +is thread-safe. - Create a reader from a record io file +NOTE: This is a very low-level API. It is used for debugging data file or +training. Please use `open_files` instead of this API for production usage. )DOC"); } }; diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 57e8e21214b7c99e52550fe51a67c9b5201cb46f..4b308abc290c10a8a5846672e719b503dfc79b21 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -34,7 +34,7 @@ class ShuffleReader : public framework::DecoratedReader { ReloadBuffer(); } - void ReadNext(std::vector* out) override { + void ReadNextImpl(std::vector* out) override { out->clear(); if (iteration_pos_ >= buffer_.size()) { VLOG(10) << "Resetting shuffle buffer"; @@ -47,6 +47,17 @@ class ShuffleReader : public framework::DecoratedReader { } private: + void ShutdownImpl() override { + buffer_.clear(); + iteration_pos_ = 0; + reader_->Shutdown(); + } + + void StartImpl() override { + reader_->Start(); + ReloadBuffer(); + } + void ReloadBuffer() { buffer_.clear(); buffer_.reserve(buffer_size_); @@ -86,9 +97,8 @@ class CreateShuffleReaderOp : public framework::OperatorBase { } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - out->Reset( - new ShuffleReader(underlying_reader.Get(), - static_cast(Attr("buffer_size")))); + out->Reset(framework::MakeDecoratedReader( + underlying_reader, static_cast(Attr("buffer_size")))); } }; diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc deleted file mode 100644 index 3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -class ThreadedReader : public framework::DecoratedReader { - public: - explicit ThreadedReader(const std::shared_ptr& reader) - : DecoratedReader(reader) {} - - void ReadNext(std::vector* out) override { - std::lock_guard lock(mutex_); - reader_->ReadNext(out); - } - - void ReInit() override { reader_->ReInit(); } - - private: - std::mutex mutex_; -}; - -class CreateThreadedReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto* out = detail::Ref(scope.FindVar(Output("Out"))) - .GetMutable(); - if (out->Get() != nullptr) { - return; - } - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - out->Reset(new ThreadedReader(underlying_reader.Get())); - } -}; - -class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase { - protected: - void Apply() override { - AddComment(R"DOC( - CreateThreadedReader Operator - - This operator creates a threaded reader. A threaded reader's - 'ReadNext()' can be invoked by several threads at the same - time. - When the attribute 'safe_mode' is true, the threaded reader's - 'ReInit()' is disabled to avoid unexpected bugs in multi-thread - environment. - )DOC"); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace reader = paddle::operators::reader; -REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader, - reader::CreateThreadedReaderOp, - reader::CreateThreadedReaderOpMaker); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..30d962ba10a954a837f9771d21cedf0feb643439 --- /dev/null +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace reader { + +class LoDTensorBlockingQueueHolder; + +class LoDTensorBlockingQueue { + friend class LoDTensorBlockingQueueHolder; + + private: + LoDTensorBlockingQueue(size_t capacity, + const std::vector& dims) + : queue_(capacity), dims_(dims) {} + + public: + bool Push(const std::vector& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_.Send(lod_tensor_vec); + } + + bool Push(std::vector&& lod_tensor_vec) { + CheckDims(lod_tensor_vec); + return queue_.Send(std::move(lod_tensor_vec)); + } + + std::vector Pop(bool* ok = nullptr) { + std::vector lod_tensor_vec; + bool success = queue_.Receive(&lod_tensor_vec); + if (ok != nullptr) *ok = success; + return lod_tensor_vec; + } + + inline size_t Cap() const { return queue_.Cap(); } + + inline size_t Size() const { return queue_.Size(); } + + inline void Close() { return queue_.Close(); } + + inline bool IsClosed() const { return queue_.IsClosed(); } + + private: + void CheckDims(const std::vector& lod_tensor_vec) { + PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(), + "Expect input size is %d but found %s", dims_.size(), + lod_tensor_vec.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + const auto& in_dims = framework::slice_ddim( + lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size()); + const auto& expect_dims = + framework::slice_ddim(dims_[i], 1, dims_[i].size()); + PADDLE_ENFORCE(in_dims == expect_dims, + "Dims of the %d-th input tensor do not match", i); + } + } + + BlockingQueue> queue_; + std::vector dims_; +}; + +class LoDTensorBlockingQueueHolder { + public: + void InitOnce(size_t capacity, const std::vector& dims) { + PADDLE_ENFORCE( + queue_ == nullptr, + "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + } + + inline const std::shared_ptr& GetQueue() const { + return queue_; + } + + private: + std::shared_ptr queue_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 31e5d81e55ed9703eb3a9ef2595fa2a280f1a734..9a8d203672fa2d560440d063d93fa5f8523690ef 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -23,24 +23,26 @@ namespace reader { class MultiFileReader : public framework::ReaderBase { public: - MultiFileReader(const std::vector& file_names, - const std::vector& dims, size_t thread_num, + MultiFileReader(const std::vector& file_names, size_t thread_num, size_t buffer_size) : buffer_size_(buffer_size) { readers_.reserve(file_names.size()); for (const std::string& f_name : file_names) { - readers_.emplace_back(CreateReaderByFileName(f_name, dims)); + readers_.emplace_back(CreateReaderByFileName(f_name)); } prefetchers_.resize(thread_num); StartNewScheduler(); } - void ReadNext(std::vector* out) override; - void ReInit() override; + void ReadNextImpl(std::vector* out) override; ~MultiFileReader() { EndScheduler(); } private: + void ShutdownImpl() override { EndScheduler(); } + + void StartImpl() override { StartNewScheduler(); } + void StartNewScheduler(); void EndScheduler(); void ScheduleThreadFunc(); @@ -55,17 +57,12 @@ class 
MultiFileReader : public framework::ReaderBase { reader::BlockingQueue>* buffer_; }; -void MultiFileReader::ReadNext(std::vector* out) { +void MultiFileReader::ReadNextImpl(std::vector* out) { if (!buffer_->Receive(out)) { out->clear(); } } -void MultiFileReader::ReInit() { - EndScheduler(); - StartNewScheduler(); -} - void MultiFileReader::StartNewScheduler() { size_t thread_num = prefetchers_.size(); waiting_reader_idx_ = new reader::BlockingQueue(readers_.size()); @@ -120,7 +117,7 @@ void MultiFileReader::ScheduleThreadFunc() { } } } - // If users invoke ReInit() when scheduler is running, it will close the + // If users invoke Shutdown() when scheduler is running, it will close the // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler // to release their resource. So a check is needed before scheduler ends. for (auto& p : prefetchers_) { @@ -138,7 +135,8 @@ void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) { std::vector ins; reader->ReadNext(&ins); if (ins.empty()) { - reader->ReInit(); + reader->Shutdown(); + reader->Start(); break; } try { @@ -180,9 +178,8 @@ class OpenFilesOp : public framework::OperatorBase { auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new MultiFileReader(file_names, - RestoreShapes(shape_concat, ranks), - thread_num, buffer_size)); + out->Reset( + std::make_shared(file_names, thread_num, buffer_size)); } }; diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 612e1f5eca3a4836db1fd167fc6bb63400d20177..b82aab1214992be73d876a42424234e3cea46455 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -39,7 +39,7 @@ std::unordered_map& FileReaderRegistry() { } std::unique_ptr CreateReaderByFileName( - const std::string& file_name, const std::vector& dims) { + const std::string& file_name) { size_t separator_pos = file_name.find_last_of(kFileFormatSeparator); PADDLE_ENFORCE_NE(separator_pos, std::string::npos, "File name illegal! 
A legal file name should be like: " @@ -49,12 +49,12 @@ std::unique_ptr CreateReaderByFileName( auto itor = FileReaderRegistry().find(filetype); PADDLE_ENFORCE(itor != FileReaderRegistry().end(), "No file reader registered for '%s' format.", filetype); - framework::ReaderBase* reader = (itor->second)(file_name, dims); + framework::ReaderBase* reader = (itor->second)(file_name); return std::unique_ptr(reader); } void FileReaderMakerBase::Make() { - AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable(); + AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable(); AddAttr>("shape_concat", "The concat of all data's shapes."); AddAttr>( "ranks", diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 244bf15f068a47efc29ee54492cdbdeb10025020..25c3e7d77b788d38daf6dee1fc79e5c1c97e8842 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -25,22 +25,21 @@ namespace reader { static constexpr char kFileFormatSeparator[] = "."; -using FileReaderCreator = std::function&)>; +using FileReaderCreator = + std::function; std::unordered_map& FileReaderRegistry(); template int RegisterFileReader(const std::string& filetype) { - FileReaderRegistry()[filetype] = []( - const std::string& fn, const std::vector& dims) { - return new Reader(fn, dims); + FileReaderRegistry()[filetype] = [](const std::string& fn) { + return new Reader(fn); }; return 0; } std::unique_ptr CreateReaderByFileName( - const std::string& file_name, const std::vector& dims); + const std::string& file_name); extern std::vector RestoreShapes( const std::vector& shape_concat, const std::vector& ranks); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 9c1cee7022a9b9a98f026f7602f0f7badc44a49b..162bfcbb0844d29385d0f8ad5d25a3f8de6bd41b 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + {{"Out", {pg_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); sum_op->Run(cur_scope, place); cur_scope.Rename(new_inside_name, inside_grad_name); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 15dfb5469bf51330b98d6699fb3ce708222212ed..9854a31f5b10f5ecd940c0d41c2c3e468fc17bad 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -43,8 +43,8 @@ class RecvOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 7f743f577fbcdaf6f62e01031e25ef09a842c2e9..918f3be533d51367eade5f5108ad2eab954a9303 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -12,14 +12,108 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/reshape_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + if (ctx->HasInput("Shape") && ctx->IsRuntime()) { + // If true, set the shape of Output(Out) according to Input(Shape) in + // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. + ctx->ShareLoD("X", /*->*/ "Out"); + return; + } + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + + static framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) { + const int64_t in_size = framework::product(in_dims); + // only one dimension can be set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + if (in_size > 0) { + // in_size < 0 and is un-determinate in compile time, skip the check, + // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8], + // capacity = -24, in_size = -8, output_shape[0] = 0 + // the following check will fail. 
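      // Editorial worked example (not part of the patch): with in_dims = [6, 4]
      // (so in_size = 24) and Attr(shape) = [0, -1, 2], the loop above leaves
      // capacity = 6 * (-1) * 2 = -12 and output_shape = [6, -1, 2]. The line
      // below then resolves the unknown dimension to -24 / -12 = 2, the ENFORCE
      // verifies 2 * (-12) == -24, and the final shape is [6, 2, 2].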
+ output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + output_shape[unk_dim_idx] = -1; + } + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -107,19 +201,93 @@ class ReshapeGradOp : public framework::OperatorWithKernel { } }; +class ReshapeKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("X"); + + auto *shape_tensor = ctx.HasInput("Shape") + ? ctx.Input("Shape") + : nullptr; + + framework::DDim out_dims = out->dims(); + + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + framework::Tensor cpu_shape_tensor; + if (platform::is_gpu_place(ctx.GetPlace())) { + TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); + } + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ReshapeOp::ValidateShape(shape, in->dims()); + } + if (!in->lod().empty()) { + PADDLE_ENFORCE_EQ( + out_dims[0], in->dims()[0], + "Reshape operator cannot reshape an input sequence batch " + "into an output sequence batch that has a different " + "number of time steps. Please consider using " + "sequence_reshape op."); + } + + bool inplace = ctx.Attr("inplace"); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*in); + out->Resize(out_dims); + } + } +}; + +class ReshapeGradKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + bool inplace = ctx.Attr("inplace"); + + auto in_dims = d_x->dims(); + if (!inplace) { + framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + ctx.device_context().Wait(); + d_x->Resize(in_dims); + } else { + d_x->ShareDataWith(*d_out); + d_x->Resize(in_dims); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp); -REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel, - ops::ReshapeKernel, - ops::ReshapeKernel, - ops::ReshapeKernel); -REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel, - ops::ReshapeGradKernel, - ops::ReshapeGradKernel, - ops::ReshapeGradKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, 
ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel); +REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel); +#endif diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu deleted file mode 100644 index c628c634e2bc9ae260948a6e7ccf786cbd6c5c3c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reshape_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reshape_op.h" -using CUDA = paddle::platform::CUDADeviceContext; - -REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel, - paddle::operators::ReshapeKernel, - paddle::operators::ReshapeKernel, - paddle::operators::ReshapeKernel); -REGISTER_OP_CUDA_KERNEL(reshape_grad, - paddle::operators::ReshapeGradKernel, - paddle::operators::ReshapeGradKernel, - paddle::operators::ReshapeGradKernel, - paddle::operators::ReshapeGradKernel); diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h deleted file mode 100644 index 3dd8c7c11eca241e747bfa129962032d882ce44c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reshape_op.h +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(!shape.empty(), - "The shape information must be set by Attr(shape)."); - - if (ctx->HasInput("Shape") && ctx->IsRuntime()) { - // If true, set the shape of Output(Out) according to Input(Shape) in - // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. - ctx->ShareLoD("X", /*->*/ "Out"); - return; - } - - auto x_dims = ctx->GetInputDim("X"); - auto out_dims = ValidateShape(shape, x_dims); - ctx->SetOutputDim("Out", out_dims); - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - - static framework::DDim ValidateShape(const std::vector shape, - const framework::DDim &in_dims) { - const int64_t in_size = framework::product(in_dims); - // only one dimension can be set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; - - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE( - unk_dim_idx == -1, - "Only one input dimension of Attr(shape) can be unknown."); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE( - static_cast(i) < in_dims.size(), - "The index of dimension to copy from input shape must be less " - "than the size of input shape."); - } else { - PADDLE_ENFORCE( - shape[i] > 0, - "Each input dimension of Attr(shape) must not be negtive except " - "one unknown dimension."); - } - - capacity *= (shape[i] ? shape[i] : in_dims[i]); - output_shape[i] = - (shape[i] ? static_cast(shape[i]) : in_dims[i]); - } - - if (unk_dim_idx != -1) { - if (in_size > 0) { - // in_size < 0 and is un-determinate in compile time, skip the check, - // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8], - // capacity = -24, in_size = -8, output_shape[0] = 0 - // the following check will fail. 
- output_shape[unk_dim_idx] = -in_size / capacity; - PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, - "Invalid shape is given."); - } else { - output_shape[unk_dim_idx] = -1; - } - } else { - PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); - } - return framework::make_ddim(output_shape); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } -}; - -template -class ReshapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const { - auto *out = ctx.Output("Out"); - auto *in = ctx.Input("X"); - - auto *shape_tensor = ctx.HasInput("Shape") - ? ctx.Input("Shape") - : nullptr; - - framework::DDim out_dims = out->dims(); - - if (shape_tensor) { - auto *shape_data = shape_tensor->data(); - framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(ctx.GetPlace())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); - shape_data = cpu_shape_tensor.data(); - } - auto shape = - std::vector(shape_data, shape_data + shape_tensor->numel()); - out_dims = ReshapeOp::ValidateShape(shape, in->dims()); - } - if (!in->lod().empty()) { - PADDLE_ENFORCE_EQ( - out_dims[0], in->dims()[0], - "Reshape operator cannot reshape an input sequence batch " - "into an output sequence batch that has a different " - "number of time steps. Please consider using " - "sequence_reshape op."); - } - - bool inplace = ctx.Attr("inplace"); - out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } - } -}; - -template -class ReshapeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const { - auto *d_out = ctx.Input(framework::GradVarName("Out")); - auto *d_x = ctx.Output(framework::GradVarName("X")); - - d_x->mutable_data(ctx.GetPlace()); - bool inplace = ctx.Attr("inplace"); - - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 293abb0ea4f1ac03c3889ce2937ef8fa0845db73..d6d209d5de041500a9b4893d70800a58e8ee1e1d 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -139,7 +139,20 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "The pooled output width.") .SetDefault(1); AddComment(R"DOC( -ROIPool operator +**ROIPool Operator** + +Region of interest pooling (also known as RoI pooling) is to perform +is to perform max pooling on inputs of nonuniform sizes to obtain +fixed-size feature maps (e.g. 7*7). + +The operator has three steps: + +1. Dividing each region proposal into equal-sized sections with + the pooled_width and pooled_height + +2. Finding the largest value in each section + +3. Copying these max values to the output buffer ROI Pooling for Faster-RCNN. 
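(Editorial note: as a concrete illustration of the three steps above, here is a hedged single-channel sketch. It ignores batching, the spatial_scale applied to RoI coordinates, and the argmax bookkeeping needed for the backward pass; RoiMaxPoolSketch and its parameters are illustrative names, not the operator's API, and the RoI is assumed to lie inside the feature map.)

#include <algorithm>
#include <limits>
#include <vector>

// feat is an H x W single-channel map stored row-major; (x1, y1)-(x2, y2) is
// one RoI in feature-map coordinates (inclusive bounds).
std::vector<float> RoiMaxPoolSketch(const std::vector<float>& feat, int W,
                                    int x1, int y1, int x2, int y2,
                                    int pooled_h, int pooled_w) {
  int roi_h = std::max(y2 - y1 + 1, 1);
  int roi_w = std::max(x2 - x1 + 1, 1);
  std::vector<float> out(pooled_h * pooled_w,
                         -std::numeric_limits<float>::infinity());
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      // Step 1: the (roughly) equal-sized section of the RoI for this cell.
      int hstart = y1 + ph * roi_h / pooled_h;
      int hend = std::max(y1 + (ph + 1) * roi_h / pooled_h, hstart + 1);
      int wstart = x1 + pw * roi_w / pooled_w;
      int wend = std::max(x1 + (pw + 1) * roi_w / pooled_w, wstart + 1);
      // Steps 2 and 3: take the maximum of the section and store it.
      for (int h = hstart; h < hend; ++h) {
        for (int w = wstart; w < wend; ++w) {
          out[ph * pooled_w + pw] =
              std::max(out[ph * pooled_w + pw], feat[h * W + w]);
        }
      }
    }
  }
  return out;
}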
The link below is a further introduction: https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 20f140f962c3aac364a1239a663d5f340bbeb6b2..10b1b0c899d833d70fa6afe51998fe210899e3c3 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(LoDTensor), the input(X) is a LodTensor, which supports " + "the input(X) is a LodTensor, which supports " "variable time-length input sequences. The underlying tensor " "in this LoDTensor is a matrix with shape (T x N), where T " "is the total time steps in this mini-batch and N is the input " "data dimension."); AddInput("Filter", - "(Tensor), the input(Filter) is a learnable parameter. It " + "the input(Filter) is a learnable parameter. It " "is a 2-D tensor with shape (future_context x N), where, " "future_context is the future context length and N is the data " "dimension."); AddOutput("Out", - "(LoDTensor), the output(Out) is a LodTensor, which supports " + "the output(Out) is a LodTensor, which supports " "variable time-length input sequences. The underlying tensor " "in this LodTensor is a matrix with shape T x N, i.e., the " "same shape as X."); AddComment(R"DOC( -Row-convolution Operator. +:strong:`Row-convolution operator` The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: @@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$, the output sequence is convolved as: $$ -out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :} $$ +In the above equation: + +* $Out_{i}$: The i-th row of output variable with shape [1, D]. + +* $\\tau$: Future context size. + +* $X_{j}$: The j-th row of input variable with shape [1, D]. + +* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D]. + +More details about row_conv please refer to +the design document +https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 . + )DOC"); } }; diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc index c4fcc61af4b75e6dc7d5c31e20c5fff358637af5..ccaea0eef2906953d922e097348b6c0a86dad6f1 100644 --- a/paddle/fluid/operators/save_load_op_test.cc +++ b/paddle/fluid/operators/save_load_op_test.cc @@ -139,6 +139,7 @@ TEST(LoadFP16Op, CPU) { save_op->Run(scope, place); auto load_var = scope.Var("out_var"); + load_var->GetMutable(); auto load_op = paddle::framework::OpRegistry::CreateOp( "load", {}, {{"Out", {"out_var"}}}, attrs); load_op->Run(scope, place); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e6d27e2dedd7668b93bd8ddc330a897d1c6fa732..201a51130d6b6f94104e2dabf9e7facffa672ae0 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -22,11 +22,17 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; + // TODO(yuyang18): If the functions below are needed by other files, move them // to paddle::filesystem namespace. constexpr char kSEP = '/'; @@ -67,9 +73,27 @@ class SaveOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + auto iname = Input("X"); + auto *var = scope.FindVar(iname); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", + iname); + + if (var->IsType()) { + SaveLodTensor(place, var); + } else if (var->IsType()) { + SaveSelectedRows(scope, place, var); + } else { + PADDLE_ENFORCE( + false, + "SaveOp only support LoDTensor and SelectedRows, %s has wrong type", + iname); + } + } + + void SaveLodTensor(const platform::Place &place, + framework::Variable *var) const { auto filename = Attr("file_path"); auto overwrite = Attr("overwrite"); - auto save_as_fp16 = Attr("save_as_fp16"); if (FileExists(filename) && !overwrite) { PADDLE_THROW("%s is existed, cannot save to it when overwrite=false", @@ -78,26 +102,19 @@ class SaveOp : public framework::OperatorBase { MkDirRecursively(DirName(filename).c_str()); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ofstream fout(filename); - PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", - filename); - - auto iname = Input("X"); - auto *var = scope.FindVar(iname); - PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op", - iname); - - PADDLE_ENFORCE(var->IsType(), - "SaveOp only support LoDTensor, %s has wrong type", iname); - auto &tensor = var->Get(); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. + std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + + auto save_as_fp16 = Attr("save_as_fp16"); auto in_dtype = framework::ToDataType(tensor.type()); auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; @@ -112,17 +129,43 @@ class SaveOp : public framework::OperatorBase { } else { framework::SerializeToStream(fout, tensor, dev_ctx); } + fout.close(); + } + + void SaveSelectedRows(const framework::Scope &scope, + const platform::Place &place, + framework::Variable *var) const { + auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + PADDLE_ENFORCE( + lt_var != nullptr, + "Can not find variable kLookupTablePath for SaveSelectedRows"); + std::string filename = lt_var->data(); + VLOG(4) << "SaveSelectedRows get File name: " << filename; + + auto &selectedRows = var->Get(); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + // FIXME(yuyang18): We save variable to local file now, but we should change + // it to save an output stream. 
+ std::ofstream fout(filename); + PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", + filename); + framework::SerializeToStream(fout, selectedRows, dev_ctx); + fout.close(); } }; class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(Tensor ) Input tensor to be saved"); + AddInput("X", "(Tensor ) Input LoDTensor and SelectedRows to be saved"); AddComment(R"DOC( Save operator -This operator will serialize and write a tensor variable to file on disk. +This operator will serialize and write LoDTensor / SelectedRows variable to file on disk. )DOC"); AddAttr("overwrite", "(boolean, default true)" @@ -142,9 +185,26 @@ This operator will serialize and write a tensor variable to file on disk. } }; +class SaveOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front(); + auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SaveOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker); +REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker, + ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference, + ops::SaveOpShapeInference); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 4687e21e7155fc7309fb28c881c0d47152df9ad5..7f8822e40053b5bcd394f446138a2292d80b69bf 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -41,13 +41,13 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) Input tensor of scale operator."); AddOutput("Out", "(Tensor) Output tensor of scale operator."); AddComment(R"DOC( -Scale operator +**Scale operator** + +Multiply the input tensor with a float scalar to scale the input tensor. 
$$Out = scale*X$$ )DOC"); - AddAttr("scale", - "(float, default 1.0)" - "The scaling factor of the scale operator.") + AddAttr("scale", "The scaling factor of the scale operator.") .SetDefault(1.0); } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index c6c975a23ce846464388c72af5d8902144ceb16a..6b4572dcccc21e783f1df0b9bcde11d532ff4ba8 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -44,8 +44,8 @@ class SendBarrierOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 84ec36625314572d16e5c537884b6efec420cc60..0cac329aafa8c4c67cae48ba62a48575f5edba92 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -45,8 +45,8 @@ class SendOp : public framework::OperatorBase { // For profiling platform::RecordEvent record_event(Type(), &ctx); - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index e550552b195b768d68ec64e9c3b5889b56ca719f..aee6180add5708d31f7ce927b37c4524a291fe3c 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; const auto &root_block = program.Block(0); + std::vector optimize_blocks; auto *optimize_block = program.AppendBlock(root_block); + optimize_blocks.push_back(optimize_block); + auto *prefetch_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensors, must be of same shape. 
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block, @@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic *initialized) { attrs.insert({"Fanin", 1}); attrs.insert({"ParamList", std::vector({"Out"})}); attrs.insert({"GradList", std::vector({"x1"})}); - attrs.insert({"OptimizeBlock", optimize_block}); + attrs.insert({"optimize_blocks", optimize_blocks}); attrs.insert({"PrefetchBlock", prefetch_block}); attrs.insert({"grad_to_block_id", std::vector({""})}); attrs.insert({"sync_mode", true}); diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h index d62c387c3eebf9df0ab532f4e891da006f239468..39301e1ac0971dfe0ca7854257f10ddeb60f1000 100644 --- a/paddle/fluid/operators/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_expand_op.h @@ -151,9 +151,6 @@ struct SequenceExpandGradFunctor { const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ LoDTensor* dx) { - math::SetConstant set_zero; - set_zero(context, dx, static_cast(0)); - int dout_offset = 0; for (size_t i = 1; i < ref_lod.size(); ++i) { int repeat_num = ref_lod[i] - ref_lod[i - 1]; @@ -187,6 +184,10 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->mutable_data(context.GetPlace()); g_x->set_lod(x->lod()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, g_x, static_cast(0)); + auto& y_lod = y->lod(); if (ref_level == -1) ref_level = y_lod.size() - 1; // just copy the gradient diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index c75fce7959d1af51afd52af23fe657d10a2f3988..b44d5f898013a5d27467bd80118c29a886d5e8b3 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -36,10 +36,13 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Input", "(Tensor), The input tensor."); - AddOutput("Out", "(Tensor), The shape of input tensor."); + AddOutput("Out", + "(Tensor), The shape of input tensor, the data type of the shape" + " is int64_t, will be on the same device with the input Tensor."); AddComment(R"DOC( -Shape Operator. -Get the shape of input tensor. +Shape Operator + +Get the shape of input tensor. Only support CPU input Tensor now. )DOC"); } }; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 135e2a6f7f877c9ef159a4542b834d5627649e81..c3b0fe32098cb4b41ccc155db58809ef9f1bf46b 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -113,14 +113,14 @@ The logistic loss is given as follows: $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$ -We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get: +We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get: $$loss = X - X * Labels + \log(1 + \exp(-X))$$ For stability and to prevent overflow of $$\exp(-X)$$ when X < 0, we reformulate the loss as follows: - $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$ + $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$ Both the input `X` and `Labels` can carry the LoD (Level of Details) information. However the output only shares the LoD with input `X`. 
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 61bb445e8b4c6a71e9b1a6a0bcf02a31ab271d0a..4bd23d594134f227e86b01fd75b7e202dd76c11b 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -95,23 +95,26 @@ of that dimension. If the value passed to start or end is larger than the n (the number of elements in this dimension), it represents n. For slicing to the end of a dimension with unknown size, it is recommended to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1]. - - Example 1: - Given: - data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] - axes = [0, 1] - starts = [1, 0] - ends = [2, 3] - Then: - result = [ [5, 6, 7], ] - - Example 2: - Given: - data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] - starts = [0, 1] - ends = [-1, 1000] - Then: - result = [ [2, 3, 4], ] +Following examples will explain how slice works: + + .. code-block:: text + + Cast1: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [1, 0] + ends = [2, 3] + Then: + result = [ [5, 6, 7], ] + + Cast2: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + starts = [0, 1] + ends = [-1, 1000] + Then: + result = [ [2, 3, 4], ] )DOC"); } }; diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 14b57b11fefb2b726531cb164dbf479f8df26b24..6668e6b9e917eea7ba4a80ac78917b73eb827208 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -27,8 +27,81 @@ using paddle::platform::MKLDNNMemDesc; using mkldnn::memory; // Note: paddle has also "memory" namespace using mkldnn::primitive; using mkldnn::softmax_forward; +using mkldnn::softmax_backward; using mkldnn::prop_kind; using mkldnn::stream; +using platform::to_void_cast; + +class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { + public: + SoftmaxMKLDNNHandler( + std::shared_ptr softmax_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + softmax_pd_(softmax_pd) {} + + SoftmaxMKLDNNHandler( + std::shared_ptr softmax_pd, + std::shared_ptr softmax_bwd_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + softmax_pd_(softmax_pd), + softmax_bwd_pd_(softmax_bwd_pd) { + // If we are in Grad operatgor then update a key with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + std::shared_ptr AcquireSoftmax( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + /*Generate key*/ + auto prim_key = key_ + "@softmax_p"; + + auto softmax_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax primitive in device context"); + if (softmax_p == nullptr) { + softmax_p = std::make_shared( + *(softmax_pd_.get()), + *(static_cast(src_memory_p.get())), + *(static_cast(dst_memory_p.get()))); + dev_ctx_.SetBlob(prim_key, softmax_p); + } else { + is_reusing_ = true; + } + + return softmax_p; + } + + std::shared_ptr AcquireSoftmaxBackward( + std::shared_ptr dst_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_src_memory_p) { + auto prim_key = key_ + "@softmax_bwd_p"; + auto softmax_bwd_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false), + 
"Fail to find softmax backward primitive in device context"); + if (softmax_bwd_p == nullptr) { + softmax_bwd_p = std::make_shared( + *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()), + *(diff_src_memory_p.get())); + dev_ctx_.SetBlob(prim_key, softmax_bwd_p); + } else { + is_reusing_ = true; + } + + return softmax_bwd_p; + } + + private: + std::shared_ptr softmax_pd_; + std::shared_ptr softmax_bwd_pd_; +}; template class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { @@ -54,56 +127,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function - auto gethash = [](memory::dims& operand_dims) { - return std::string(std::to_string(operand_dims[0]) + "-" + - std::to_string(operand_dims[1])); - }; - const std::string key = gethash(softmax_tz); - const std::string key_softmax_p = key + "@softmax_p"; - const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p"; - const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p"; - - std::shared_ptr softmax_p = dev_ctx.GetBlob(key_softmax_p); - if (softmax_p == nullptr) { - // Currently only NC data format is supported - auto softmax_md = - MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc); - // Normalization is made after innermost dimension eg. C out of NC - auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, - softmax_md, 1 /*dim: C*/); - // create memory primitives - auto softmax_src_memory_p = std::make_shared( - memory::primitive_desc{softmax_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p); - auto softmax_dst_memory_p = std::make_shared( - memory::primitive_desc{softmax_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p); - - auto softmax_forward_pd = - std::make_shared(softmax_desc, - mkldnn_engine); - softmax_p = std::make_shared( - *(softmax_forward_pd.get()), - *(static_cast(softmax_src_memory_p.get())), - *(static_cast(softmax_dst_memory_p.get()))); - dev_ctx.SetBlob(key_softmax_p, softmax_p); - } else { - // Primitives already exist - auto src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_src_mem_p)); - PADDLE_ENFORCE(src_memory_p != nullptr, - "Fail to find softmax src mem_p in device context"); - auto dst_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_dst_mem_p)); - PADDLE_ENFORCE(dst_memory_p != nullptr, - "Fail to find softmax dst mem_p in device context"); - src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); - dst_memory_p->set_data_handle(output_data); - } + const std::string key = + platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out")); + const std::string key_softmax_pd = key + "@softmax_pd"; + + // Currently only NC data format is supported + auto softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + // Normalization is made after innermost dimension eg. 
C out of NC + auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, + softmax_md, 1 /*dim: C*/); + auto softmax_pd = std::make_shared( + softmax_desc, mkldnn_engine); + dev_ctx.SetBlob(key_softmax_pd, softmax_pd); + + SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key); + auto softmax_src_memory_p = + handler.AcquireSrcMemory(softmax_md, to_void_cast(input_data)); + auto softmax_dst_memory_p = + handler.AcquireDstMemory(softmax_md, to_void_cast(output_data)); + auto softmax_p = + handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); std::vector pipeline{ *(static_cast(softmax_p.get()))}; @@ -120,6 +164,77 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { } }; +template +class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + const Tensor* output = ctx.Input("Out"); + const T* dst_data = output->data(); + + auto* dout = ctx.template Input(framework::GradVarName("Out")); + const auto* diff_dst_ptr = dout->template data(); + + auto* dx = + ctx.template Output(framework::GradVarName("X")); + T* diff_src_ptr = dx->template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + std::vector src_tz(dst_tz); + PADDLE_ENFORCE(output->dims().size() == 2UL, + "The input of softmax op must be a 2D matrix."); + // MKL-DNN does support softmax over selected axis. Having 2D Tensor, + // we will make normalization after final eg. axis: 1 + PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), + "Softmax input and output dimensions should match"); + // Same memory descriptor to be used for input and output + memory::dims softmax_tz = {src_tz[0], src_tz[1]}; + // Currently only supports NC data format + // retrieve eltwise primitive desc from device context + const std::string key = + platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out")); + const std::string key_softmax_pd = key + "@softmax_pd"; + + auto softmax_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_softmax_pd)); + PADDLE_ENFORCE(softmax_pd != nullptr, + "Fail to find softmax_pd in device context"); + + // TODO(jczaja): Add layouts support when there is a need to do so + // Two dimensional softmax does support NC format + auto data_softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + auto diff_softmax_md = MKLDNNMemDesc( + {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); + // Normalization is made after innermost dimension eg. 
C out of NC + auto softmax_bwd_desc = + softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/); + auto softmax_bwd_pd = + std::make_shared( + softmax_bwd_desc, mkldnn_engine, *softmax_pd); + + SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx, + mkldnn_engine, key); + auto dst_memory_p = + handler.AcquireDstMemory(data_softmax_md, to_void_cast(dst_data)); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory( + diff_softmax_md, to_void_cast(diff_dst_ptr)); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory( + diff_softmax_md, to_void_cast(diff_src_ptr)); + + // Get primitve from device context + auto softmax_bwd_p = handler.AcquireSoftmaxBackward( + dst_memory_p, diff_dst_memory_p, diff_src_memory_p); + + std::vector pipeline{*softmax_bwd_p}; + stream(stream::kind::eager).submit(pipeline).wait(); + } +}; } // namespace operators } // namespace paddle @@ -127,3 +242,5 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace, ops::SoftmaxMKLDNNKernel); +REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::SoftmaxMKLDNNGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 847b3cbd1bd416ae1326211c98ba9d145c103298..31a7458f637921c290fc71ac748143867b4aae19 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -145,16 +145,30 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif - std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::StringToDataLayout(data_format), library_); +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "float16 can only be used on GPU place"); + } + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, + library_); } }; diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f78d977760f18c9eb1270e515e68acb208a7c9a4 --- /dev/null +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +/*Licensed under the Apache License, Version 2.0(the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/operators/sum_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::CPUDeviceContext; +using framework::DataLayout; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::stream; +using mkldnn::sum; +using mkldnn::reorder; +using platform::to_void_cast; + +template +class SumMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + auto in_vars = ctx.MultiInputVar("X"); + + const int N = in_vars.size(); + auto out_var = ctx.OutputVar("Out"); + bool in_place = out_var == in_vars[0]; + + if (out_var->IsType()) { + LoDTensor* output = ctx.Output("Out"); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector dst_tz = framework::vectorize2int(output->dims()); + auto src_tz = dst_tz; + memory::format output_format{memory::format::format_undef}; + std::vector scales; + std::vector srcs_mpd; + std::vector srcs_mem; + + PADDLE_ENFORCE(in_vars[0]->IsType(), + "Input[0] must be LoDTensors"); + auto& input0 = in_vars[0]->Get(); + PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN && + input0.format() != memory::format::format_undef, + "Wrong layout/format for inputs[0]"); + + memory::format input_format = input0.format(); + + if (src_tz.size() == 1 && (input_format == memory::format::nchw || + input_format == memory::format::nhwc)) { + input_format = memory::format::x; + } + if (src_tz.size() == 2 && (input_format == memory::format::nchw || + input_format == memory::format::nhwc)) { + input_format = memory::format::nc; + } + + for (int i = in_place ? 
1 : 0; i < N; i++) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "all inputs must be all LoDTensors"); + auto& input = in_vars[i]->Get(); + PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN && + input.format() != memory::format::format_undef, + "Wrong layout/format for inputs"); + + if (input.numel() == 0) { + continue; + } + + const T* input_data = input.data(); + + auto src_md = + memory::desc(src_tz, memory::data_type::f32, input_format); + auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine); + auto src_mem = memory(src_mpd, to_void_cast(input_data)); + srcs_mpd.push_back(src_mpd); + srcs_mem.push_back(src_mem); + scales.push_back(1.0); + } + + auto dst_md = + memory::desc(dst_tz, memory::data_type::f32, memory::format::any); + + auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); + + std::shared_ptr dst_mem; + if (in_place) { + dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); + } else { + dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); + } + std::vector inputs; + for (size_t i = 0; i < srcs_mem.size(); ++i) { + inputs.push_back(srcs_mem[i]); + } + + auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem); + output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd); + + primitive reorder_prim; + std::shared_ptr target_mem; + if (in_place) { + output_format = input_format; + target_mem.reset(new memory( + {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine}, + output_data)); + reorder_prim = reorder(*dst_mem, *target_mem); + } + + std::vector pipeline; + pipeline.push_back(sum_prim); + if (in_place) pipeline.push_back(reorder_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); + } else if (out_var->IsType()) { + // TODO(@mozga-intel) Add MKLDNN SelectedRows support + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto& in_sel0 = in_vars[0]->Get(); + auto& rows = in_sel0.rows(); + in0.reset(new framework::SelectedRows(rows, in_sel0.height())); + in0->mutable_value()->ShareDataWith(in_sel0.value()); + } + + auto get_selected_row = [&](size_t i) -> const SelectedRows& { + if (i == 0 && in0) { + return *in0.get(); + } else { + return in_vars[i]->Get(); + } + }; + auto* out = ctx.Output("Out"); + out->mutable_rows()->clear(); + auto* out_value = out->mutable_value(); + + // Runtime InferShape + size_t first_dim = 0; + for (int i = 0; i < N; i++) { + auto& sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); + } + auto in_dim = + framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); + + out_value->Resize(framework::make_ddim(in_dim)); + + // if all the input sparse vars are empty, no need to + // merge these vars. + if (first_dim == 0UL) { + return; + } + out_value->mutable_data(ctx.GetPlace()); + math::SelectedRowsAddTo functor; + int64_t offset = 0; + for (int i = 0; i < N; i++) { + auto& sel_row = get_selected_row(i); + if (sel_row.rows().size() == 0) { + continue; + } + PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); + functor(ctx.template device_context(), sel_row, + offset, out); + offset += sel_row.value().numel(); + } + } else if (out_var->IsType()) { + // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support + auto& out_array = *out_var->GetMutable(); + for (size_t i = in_place ? 
1 : 0; i < in_vars.size(); ++i) { + PADDLE_ENFORCE(in_vars[i]->IsType(), + "Only support all inputs are TensorArray"); + auto& in_array = in_vars[i]->Get(); + + for (size_t i = 0; i < in_array.size(); ++i) { + if (in_array[i].numel() != 0) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (out_array[i].numel() == 0) { + framework::TensorCopy(in_array[i], in_array[i].place(), + ctx.device_context(), &out_array[i]); + out_array[i].set_lod(in_array[i].lod()); + } else { + PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); + auto in = EigenVector::Flatten(in_array[i]); + auto result = EigenVector::Flatten(out_array[i]); + result.device(*ctx.template device_context() + .eigen_device()) = result + in; + } + } + } + } + } else { + PADDLE_THROW("Unexpected branch, output variable type is %s", + out_var->Type().name()); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::SumMKLDNNOpKernel); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 863baba9ea7663d0b21875e0b423dc4a6ce2d59a..fe7c7039c7dec714e265ede1b7167fd800ddc2f7 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { using framework::Tensor; @@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); + + framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + if (x_vars[0]->IsType()) { int dtype = -1; for (auto& x_var : x_vars) { @@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel { "Sum operator should have at least one tensor"); return framework::OpKernelType( - static_cast(dtype), - ctx.device_context()); + static_cast(dtype), ctx.GetPlace(), + layout, library); } else if (x_vars[0]->IsType()) { for (auto& var : x_vars) { auto& value = var->Get().value(); if (value.IsInitialized()) { return framework::OpKernelType(framework::ToDataType(value.type()), - ctx.device_context()); + ctx.device_context(), layout, library); } } // if input sparse vars are not initialized, use an default kernel type. 
return framework::OpKernelType(framework::proto::VarType::FP32, - ctx.device_context()); + ctx.device_context(), layout, library); } else if (x_vars[0]->IsType()) { for (auto& x_var : x_vars) { auto& array = x_var->Get(); for (auto& each : array) { if (each.numel() != 0) { return framework::OpKernelType(framework::ToDataType(each.type()), - ctx.device_context()); + ctx.device_context(), layout, + library); } } } @@ -116,6 +133,9 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Sum operator. @@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference { framework::BlockDesc* block) const override { auto& inputs = op_desc.Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; - for (auto& name : op_desc.Input("X")) { VLOG(10) << name << " " << block->FindRecursiveOrCreateVar(name).GetType(); @@ -206,6 +225,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); + REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, ops::SumKernel, diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc index c703d11eeccf8418250f00c801f47418ee9c85ae..a2d44284e9de1ace42cabbce82e0b45929432d7b 100644 --- a/paddle/fluid/operators/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/tensor_array_read_write_op.cc @@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } + auto *out_tensor = &out->at(offset); + out_tensor->set_lod(x_tensor.lod()); if (x_tensor.memory_size() > 0) { - auto *out_tensor = &out->at(offset); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); TensorCopy(x_tensor, place, dev_ctx, out_tensor); - out_tensor->set_lod(x_tensor.lod()); } else { VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " "nothing has been written to output array[" diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 4b1208c4376b48e25866fc510f3a6d2ea06e7610..647cfc0a0af2be85e2868c6f68cab962c6631a8d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -14,11 +14,14 @@ #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/operators/tensorrt_engine_op.h" +#include +#include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { namespace operators { @@ -66,17 +69,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { } // namespace template -void paddle::operators::TensorRTEngineKernel::Prepare( +void TensorRTEngineKernel::Prepare( const framework::ExecutionContext &context) const { VLOG(4) << "Prepare engine"; // Get the ProgramDesc and pass to convert. 
framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); - max_batch_ = context.Attr("max_batch"); + int max_batch = context.Attr("max_batch"); auto max_workspace = context.Attr("max_workspace"); - engine_ = Singleton::Global().Create( - max_batch_, max_workspace, &stream_); - engine_->InitNetwork(); + auto params = context.Attr>("parameters"); + std::unordered_set parameters; + for (const auto ¶m : params) { + parameters.insert(param); + } + + // TODO(Superjomn) replace this with a different stream + auto *engine = Singleton::Global().Create( + max_batch, max_workspace, nullptr /*engine hold its own stream*/, + context.Attr("engine_uniq_key")); + engine->InitNetwork(); framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); // Add inputs @@ -87,24 +98,23 @@ void paddle::operators::TensorRTEngineKernel::Prepare( PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); auto shape = var->GetShape(); - engine_->DeclareInput( + engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(var->GetShape())); } - // TODO(Superjomn) parameters should be passed after analysised from outside. inference::Singleton::Global().ConvertBlock( - block_desc, {}, context.scope(), engine_); + block_desc, parameters, context.scope(), engine); // Add outputs VLOG(4) << "declare outputs"; for (auto &output : context.Outputs("Ys")) { VLOG(4) << "declare output " << output; - engine_->DeclareOutput(output); + engine->DeclareOutput(output); } - engine_->FreezeNetwork(); + engine->FreezeNetwork(); } class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { @@ -113,6 +123,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Xs", "A list of inputs.").AsDuplicable(); AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); + AddAttr("engine_uniq_key", "unique key for the TRT engine."); AddAttr("max_batch", "the maximum batch size."); AddAttr("max_workspace", "the maximum batch size."); AddComment("TensorRT engine operator."); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 4b089601ff76eedd87bb3a52a38c4d22d4a94bf6..1602a913aeebe43fabe2f9c9036edd18ac4c70fd 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -16,6 +16,9 @@ #ifdef PADDLE_WITH_CUDA +#include +#include + #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" @@ -23,6 +26,9 @@ namespace paddle { namespace operators { +using inference::Singleton; +using inference::tensorrt::TRT_EngineManager; + class TensorRTEngineOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -47,16 +53,19 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - if (!engine_) { + VLOG(4) << "TensorRTEngineKernel executing"; + auto engine_name = context.Attr("engine_uniq_key"); + if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); } + auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); // Try to determine a batch_size auto& tensor0 = 
inference::analysis::GetFromScope( context.scope(), input_names.front()); int batch_size = tensor0.dims()[0]; - PADDLE_ENFORCE_LE(batch_size, max_batch_); + PADDLE_ENFORCE_LE(batch_size, context.Attr("max_batch")); // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { @@ -64,20 +73,20 @@ class TensorRTEngineKernel : public framework::OpKernel { auto& t = inference::analysis::GetFromScope( context.scope(), x); if (platform::is_cpu_place(t.place())) { - engine_->SetInputFromCPU(x, static_cast(t.data()), - t.memory_size()); + engine->SetInputFromCPU(x, static_cast(t.data()), + t.memory_size()); } else { - engine_->SetInputFromGPU(x, static_cast(t.data()), - t.memory_size()); + engine->SetInputFromGPU(x, static_cast(t.data()), + t.memory_size()); } } // Execute the engine. PADDLE_ENFORCE_GT(batch_size, 0); - engine_->Execute(batch_size); + engine->Execute(batch_size); // Convert output tensor from engine to fluid for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. - nvinfer1::ITensor* trt_t = engine_->GetITensor(y); + nvinfer1::ITensor* trt_t = engine->GetITensor(y); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. std::vector ddim(dims.d, dims.d + dims.nbDims); @@ -89,27 +98,22 @@ class TensorRTEngineKernel : public framework::OpKernel { auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); if (platform::is_cpu_place(fluid_t->place())) { // TODO(Superjomn) change this float to dtype size. - engine_->GetOutputInCPU( + engine->GetOutputInCPU( y, fluid_t->mutable_data(platform::CPUPlace()), size * sizeof(float)); } else { - engine_->GetOutputInGPU( + engine->GetOutputInGPU( y, fluid_t->mutable_data(platform::CUDAPlace()), size * sizeof(float)); } } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(*engine->stream()); } protected: // Build the engine. void Prepare(const framework::ExecutionContext& context) const; - - private: - mutable cudaStream_t stream_; - mutable inference::tensorrt::TensorRTEngine* engine_{nullptr}; - mutable int max_batch_{0}; }; } // namespace operators diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 6f383de259b270038c32296b59007f6c7d895f12..82a16361e40513aeaf6f510e450f58989369fcdb 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" @@ -51,37 +52,10 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, *var = *desc.Proto(); } -template -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const T& data); - -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const std::string& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(data); -} -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const int& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::INT); - attr->set_i(data); -} -template <> -void SetAttr(framework::proto::OpDesc* op, const std::string& name, - const int64_t& data) { - auto* attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::LONG); - attr->set_l(data); -} - } // namespace +using inference::analysis::SetAttr; + TEST(TensorRTEngineOp, manual) { framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); @@ -123,11 +97,15 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", 30); + SetAttr(engine_op_desc.Proto(), "max_batch", 100); SetAttr(engine_op_desc.Proto(), "max_workspace", 1 << 10); + SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); + SetAttr>(engine_op_desc.Proto(), "parameters", + std::vector({})); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + LOG(INFO) << "engine_op " << engine_op.get(); framework::Scope scope; platform::CPUPlace place; @@ -145,6 +123,87 @@ TEST(TensorRTEngineOp, manual) { engine_op->Run(scope, place); } +void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + using shape_t = std::vector; + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + + auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name, + const std::string& z_name, bool x_created, + const shape_t& x_shape, const shape_t& y_shape, + const shape_t& z_shape) { + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({x_name})); + fc->SetInput("Y", std::vector({y_name})); + fc->SetOutput("Out", std::vector({z_name})); + + // Set inputs' variable shape in BlockDesc + if (!x_created) { + AddTensorToBlockDesc(block_, x_name, + std::vector({batch_size, input_dim, 1, 1})); + } + AddTensorToBlockDesc(block_, y_name, + std::vector({input_dim, output_dim})); + AddTensorToBlockDesc(block_, z_name, + std::vector({batch_size, output_dim})); + + // Prepare variables. 
+ if (!x_created) { + CreateCPUTensor(&scope, x_name, std::vector(x_shape)); + } + CreateCPUTensor(&scope, y_name, std::vector(y_shape)); + CreateCPUTensor(&scope, z_name, std::vector(z_shape)); + + // It is wired, need to copy manually. + *block_->add_ops() = *fc->Proto(); + }; + + // Test with 4 layer FC + AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim}, + {input_dim, output_dim}, {batch_size, output_dim}); + AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + + LOG(INFO) << "create tensorrt desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("tensorrt_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z3"})); + + SetAttr(engine_op_desc.Proto(), "subgraph", + block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch", batch_size); + SetAttr(engine_op_desc.Proto(), "max_workspace", 2 << 10); + SetAttr>( + engine_op_desc.Proto(), "parameters", + std::vector({"y0", "y1", "y2", "y3"})); + SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "b_engine"); + + auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto()); + + // Execute them. + engine_op->Run(scope, place); +} + +// Test with a larger FC layer. +TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index 5015b1005569ba70b147ebb795243e24ab81ea5c..e2b7b6b8e447381229e4ad594b7974bc0aa159d5 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -37,11 +37,11 @@ USE_NO_KERNEL_OP(listen_and_serv); namespace f = paddle::framework; namespace p = paddle::platform; namespace m = paddle::operators::math; -namespace detail = paddle::operators::detail; +namespace distributed = paddle::operators::distributed; namespace string = paddle::string; -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; void StartServer() { f::Scope scope; @@ -57,14 +57,14 @@ void StartServer() { g_req_handler->SetProgram(&empty_program); g_req_handler->SetExecutor(&executor); - g_rpc_service->RegisterRPC(detail::kRequestSend, g_req_handler.get()); + g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get()); g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - g_rpc_service->SetCond(detail::kRequestSend); - g_rpc_service->WaitBarrier(detail::kRequestSend); + g_rpc_service->SetCond(distributed::kRequestSend); + g_rpc_service->WaitBarrier(distributed::kRequestSend); LOG(INFO) << "got nccl id and stop server..."; g_rpc_service->ShutDown(); @@ -72,7 +72,7 @@ void StartServer() { } TEST(SendNcclId, RPCServer) { - g_req_handler.reset(new detail::RequestSendHandler(true)); + g_req_handler.reset(new distributed::RequestSendHandler(true)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); std::thread server_thread(StartServer); @@ -91,7 +91,8 @@ TEST(SendNcclId, RPCServer) { std::string ep = string::Sprintf("127.0.0.1:%d", port); - detail::RPCClient* client = detail::RPCClient::GetInstance(); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(); LOG(INFO) << "connect to server" << ep; client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc index 78fee77df8151221459b0afa0d6789bfe82cfda5..75d6181749e4e9bd81a3c02de69caf0acd81eef9 100644 --- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc @@ -35,10 +35,10 @@ class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { AddComment(R"DOC( -Uniform random operator +UniformRandomBatchSizeLike operator. This operator initializes a tensor with the same batch_size as the Input tensor - with random values sampled from a uniform distribution. +with random values sampled from a uniform distribution. 
)DOC"); AddAttr("min", diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 137ea91caedabc3167146d91b063dbe9e2e2b931..edd1baa4ace4e246190afcd12b0716f1dd38e243 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -86,32 +86,24 @@ class UniformRandomOp : public framework::OperatorWithKernel { class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The output tensor of uniform random op"); + AddOutput("Out", "The output tensor of uniform random op"); AddComment(R"DOC( -Uniform random operator. - This operator initializes a tensor with random values sampled from a -uniform distribution. +uniform distribution. The random result is in set [min, max]. )DOC"); - AddAttr>("shape", - "(vector) The shape of the output tensor"); - AddAttr("min", - "(float, default -1.0) " - "Minimum value of uniform random") + AddAttr>("shape", "The shape of the output tensor"); + AddAttr("min", "Minimum value of uniform random. [default -1.0].") .SetDefault(-1.0f); - AddAttr("max", - "(float, default 1.0) " - "Maximun value of uniform random") + AddAttr("max", "Maximun value of uniform random. [default 1.0].") .SetDefault(1.0f); AddAttr("seed", - "(int, default 0) " "Random seed used for generating samples. " "0 means use a seed generated by the system." "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time.") + "generate the same random numbers every time. [default 0].") .SetDefault(0); - AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") + AddAttr("dtype", "Output tensor data type. [default 5(FP32)].") .SetDefault(framework::proto::VarType::FP32); } }; diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 175c3ac5d79f24e47d21417df8e3eaeb4d5b2335..733157ea05ed39434b9a750e3a94ea548f512ce6 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { @@ -135,15 +136,14 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (og_outside.Type().hash_code() == - typeid(framework::LoDTensor).hash_code()) { + if (framework::IsType(og_outside.Type())) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (og_outside.Type().hash_code() == - typeid(framework::LoDTensorArray).hash_code()) { + } else if (framework::IsType( + og_outside.Type())) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase { ->set_lod(inside_tensor.lod()); } } - auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); + {{"Out", {pg_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); sum_op->Run(cur_scope, dev_place); cur_scope.Rename(new_inside_name, inside_grad_name); } diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index b29035bafd34fa81dc6b59691142fe74439202b8..20037d0764056c2a093af801c9cc1eb788dd46d6 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -28,6 +28,9 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) +cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) +cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) + IF(WITH_GPU) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) ELSE() @@ -42,10 +45,12 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc DEPS malloc - place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +cc_library(device_context SRCS device_context.cc init.cc DEPS malloc + place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) +cc_test(init_test SRCS init_test.cc DEPS device_context) + nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) @@ -53,5 +58,5 @@ cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framewo cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -nv_test(float16_gpu_test SRCS float16_test.cu) -cc_test(float16_test SRCS float16_test.cc) +nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) +cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..77ecb170111d63f23312d06fa8a8172bc45f2a4e --- /dev/null +++ b/paddle/fluid/platform/cpu_helper.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +namespace paddle { +namespace platform { + +void SetNumThreads(int num_threads) { +#ifdef PADDLE_USE_OPENBLAS + int real_num_threads = num_threads > 1 ? num_threads : 1; + openblas_set_num_threads(real_num_threads); +#elif defined(PADDLE_WITH_MKLML) + int real_num_threads = num_threads > 1 ? num_threads : 1; + platform::dynload::MKL_Set_Num_Threads(real_num_threads); +#else + PADDLE_ENFORCE(false, "To be implemented."); +#endif +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_helper.h b/paddle/fluid/platform/cpu_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..78fc392b632ef92d4ae08de2051041fc0bf6778b --- /dev/null +++ b/paddle/fluid/platform/cpu_helper.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +//! Set the number of threads in use. +void SetNumThreads(int num_threads); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_helper_test.cc b/paddle/fluid/platform/cpu_helper_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..dc1b2b56cd98ca6259c46a76231dbc99482970c1 --- /dev/null +++ b/paddle/fluid/platform/cpu_helper_test.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/cpu_helper.h" + +#include "gtest/gtest.h" + +TEST(CpuHelper, SetNumThread) { + paddle::platform::SetNumThreads(1); + paddle::platform::SetNumThreads(4); +} diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 40dc7c9a0b6a40f2419ace3ce7e0e5e82bc95c1a..f832d72b53e8d06a32d5c0ac2ecf7130aa28a666 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -28,9 +28,15 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1, "Default use 100% of CPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); -DEFINE_uint64( - initial_cpu_memory_in_mb, 500, - "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit."); +DEFINE_uint64(initial_cpu_memory_in_mb, +#ifdef PADDLE_WITH_MKLDNN + /* Aligned with mozga-intel, MKLDNN need at least 5000 MB + * to obtain the best performance*/ + 5000, +#else + 500, +#endif + "Initial CPU memory for PaddlePaddle, in MD unit."); DEFINE_double( fraction_of_cuda_pinned_memory_to_use, 0.5, @@ -59,10 +65,7 @@ inline size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return std::min( - static_cast(FLAGS_fraction_of_cpu_memory_to_use * - CpuTotalPhysicalMemory()), - static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); } size_t CpuMinChunkSize() { @@ -71,8 +74,11 @@ size_t CpuMinChunkSize() { } size_t CpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. - return CpuMaxAllocSize() / 32; + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, + // or the initial_cpu_memory_in_mb. + return std::min( + static_cast(CpuMaxAllocSize() / 32), + static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); } size_t CUDAPinnedMaxAllocSize() { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6c50ab2685c56bafe146c67fe2ef081ee4c55628..2cc26da013f59f5b7ee1747d57baca9c1c0efe2c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include #include #include @@ -35,7 +36,7 @@ DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); using PtrType = std::unique_ptr; - std::unordered_set set; + std::set set; for (auto& p : places) { set.insert(p); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 292ffef1aef12732812b8c5b0020cad73b1d06fc..88e0383146c1adf2752a362091996bad9cfcce5e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -27,12 +27,12 @@ limitations under the License. 
*/ #include #endif +#include +#include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" -#include "glog/logging.h" - namespace paddle { namespace platform { @@ -201,9 +201,7 @@ class DeviceContextPool { private: static DeviceContextPool* pool; - std::unordered_map, PlaceHash> - device_contexts_; + std::map> device_contexts_; DISABLE_COPY_AND_ASSIGN(DeviceContextPool); }; diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index fa806aba6d8747beebc3eed2c661b326dd62fd76..171d2979a0218ad5e22112190a59866b3e0b617f 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -69,19 +69,3 @@ TEST(Device, DeviceContextPool) { ASSERT_NE(dev_ctx, nullptr); } } - -int main(int argc, char** argv) { - std::vector places; - - places.emplace_back(paddle::platform::CPUPlace()); - int count = paddle::platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::CUDAPlace(i)); - } - - VLOG(0) << " DeviceCount " << count; - paddle::platform::DeviceContextPool::Init(places); - - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 364c4901b297dbd647faae85b01f682a1daace9c..9da787a4073fa002f75154f7c4fba54e9ed8efa6 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,14 +1,23 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) + +# There is no macOS version of NCCL. +if (NOT APPLE) + list(APPEND CUDA_SRCS nccl.cc) +endif() + if (TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() - configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if (WITH_MKLML) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) +endif() +# TODO(TJ): add iomp, mkldnn? diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 19c01dc5a968c7e1d2b0f15cf9a0e8427004e58b..93bf7c13516ffa4baca6a30f1daf946939726d85 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -36,8 +36,6 @@ DEFINE_string(cuda_dir, "", DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); -DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); - DEFINE_string(nccl_dir, "", "Specify path for loading nccl library, such as libcublas, " "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " @@ -49,6 +47,8 @@ DEFINE_string( tensorrt_dir, "", "Specify path for loading tensorrt library, such as libnvinfer.so."); +DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); + namespace paddle { namespace platform { namespace dynload { @@ -76,6 +76,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, VLOG(3) << "Try to find library: " << dso_path << " from default system path."; // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to @@ -97,6 +98,10 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, } #endif + if (nullptr == dso_handle) { + LOG(WARNING) << "Can not find library: " << dso_path + << ". Please try to add the lib path to LD_LIBRARY_PATH."; + } return dso_handle; } @@ -182,14 +187,6 @@ void* GetWarpCTCDsoHandle() { #endif } -void* GetLapackDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so"); -#endif -} - void* GetNCCLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib"); @@ -206,6 +203,14 @@ void* GetTensorRtDsoHandle() { #endif } +void* GetMKLMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 0de3559b6088086cb52c254535b6ec42da7dd724..84fd2ce9987628a5ed29e4125a03dedb96e416c1 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -23,9 +23,9 @@ void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); void* GetCurandDsoHandle(); void* GetWarpCTCDsoHandle(); -void* GetLapackDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f61a5e09b3243cbdf570ba7c28a260f181d8848 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag mklml_dso_flag; +void* mklml_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLML_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h new file mode 100644 index 0000000000000000000000000000000000000000..17acefe8cde01809572e4c86cbdccfed9a477a51 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag mklml_dso_flag; +extern void* mklml_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mklml routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using mklmlFunc = decltype(&::__name); \ + std::call_once(mklml_dso_flag, []() { \ + mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ + }); \ + static void* p_##_name = dlsym(mklml_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) + +#define MKLML_ROUTINE_EACH(__macro) \ + __macro(cblas_sgemm); \ + __macro(cblas_saxpy); \ + __macro(cblas_scopy); \ + __macro(cblas_sgemv); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm); \ + __macro(cblas_daxpy); \ + __macro(cblas_dcopy); \ + __macro(cblas_dgemv); \ + __macro(cblas_dgemm_batch); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(MKL_Set_Num_Threads) + +MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); + +#undef DYNAMIC_LOAD_MKLML_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7b8c29e1e642ec6bb4023afd8c083311b8b31812..566485cd3c383640047d97f40b452735e8c8c171 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -44,8 +44,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" +#ifndef __APPLE__ #include "paddle/fluid/platform/dynload/nccl.h" -#endif +#endif // __APPLE__ +#endif // PADDLE_WITH_CUDA namespace paddle { namespace platform { @@ -100,6 +102,15 @@ struct EnforceNotMet : public std::exception { const char* what() const noexcept { return err_str_.c_str(); } }; +struct EOFException : public std::exception { + std::string err_str_; + EOFException(const char* err_msg, const char* f, int l) { + err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l); + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + // Because most enforce conditions would evaluate to true, we can use // __builtin_expect to instruct the C++ compiler to generate code that // always forces branch prediction of true. @@ -111,7 +122,11 @@ template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { if (UNLIKELY(!(stat))) { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -121,8 +136,12 @@ template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { if (UNLIKELY(e)) { +#ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(e, thrust::cuda_category(), string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -130,8 +149,12 @@ template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { if (stat != CURAND_STATUS_SUCCESS) { +#ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -141,8 +164,12 @@ inline typename std::enable_if::type throw_on_error( if (stat == CUDNN_STATUS_SUCCESS) { return; } else { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -171,20 +198,30 @@ inline typename std::enable_if::type throw_on_error( } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { err = "CUBLAS: license error, "; } +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(err + string::Sprintf(args...)); +#else + LOG(FATAL) << err << string::Sprintf(args...); +#endif } +#ifndef __APPLE__ template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { if (stat == ncclSuccess) { return; } else { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + string::Sprintf(args...)); +#else + LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) + << string::Sprintf(args...); +#endif } } - +#endif // __APPLE__ #endif // PADDLE_WITH_CUDA template @@ -200,6 +237,7 @@ inline void throw_on_error(T e) { __FILE__, __LINE__); \ } while (false) +#ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(...) \ do { \ try { \ @@ -209,7 +247,15 @@ inline void throw_on_error(T e) { __FILE__, __LINE__); \ } \ } while (false) +#else +#define PADDLE_ENFORCE(...) 
::paddle::platform::throw_on_error(__VA_ARGS__); +#endif +#define PADDLE_THROW_EOF() \ + do { \ + throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ + __LINE__); \ + } while (false) /* * Some enforce helpers here, usage: * int a = 1; diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 57d751cc00b5f11f1ba1a3b0c9a6b7ce9e79f586..0e8684581a93f076b1a077cc52e966d3c88cf078 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -210,3 +210,14 @@ TEST(ENFORCE_USER_DEFINED_CLASS, NE) { Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); } + +TEST(EOF_EXCEPTION, THROW_EOF) { + bool caught_eof = false; + try { + PADDLE_THROW_EOF(); + } catch (paddle::platform::EOFException error) { + caught_eof = true; + EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "There is no next data.")); + } + EXPECT_TRUE(caught_eof); +} diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index a589e32b61a9b6a44bdc4529eee715d987d6922c..ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -13,8 +13,8 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/init.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 577fc24ceb1d3c83cc0546dc5db9c8c7c1f01f86..1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/utils/Logging.h" +#include "paddle/legacy/utils/Logging.h" #define ARITHMETIC_KERNEL(op_type, sign) \ __global__ void op_type(const half* in1, const half* in2, half* out) { \ diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b776528414735e8a7c1e3763e7ccb662bb9f285 --- /dev/null +++ b/paddle/fluid/platform/init.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include // for strdup +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/piece.h" + +namespace paddle { +namespace framework { + +std::once_flag gflags_init_flag; +std::once_flag p2p_init_flag; + +void InitGflags(std::vector argv) { + std::call_once(gflags_init_flag, [&]() { + argv.insert(argv.begin(), "dummy"); + int argc = argv.size(); + char **arr = new char *[argv.size()]; + std::string line; + for (size_t i = 0; i < argv.size(); i++) { + arr[i] = &argv[i][0]; + line += argv[i]; + line += ' '; + } + google::ParseCommandLineFlags(&argc, &arr, true); + VLOG(1) << "Init commandline: " << line; + }); +} + +void InitP2P(std::vector devices) { +#ifdef PADDLE_WITH_CUDA + std::call_once(p2p_init_flag, [&]() { + int count = devices.size(); + for (int i = 0; i < count; ++i) { + for (int j = 0; j < count; ++j) { + if (devices[i] == devices[j]) continue; + int can_acess = -1; + PADDLE_ENFORCE( + cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]), + "Failed to test P2P access."); + if (can_acess != 1) { + LOG(WARNING) << "Cannot enable P2P access from " << devices[i] + << " to " << devices[j]; + } else { + cudaSetDevice(devices[i]); + cudaDeviceEnablePeerAccess(devices[j], 0); + } + } + } + }); +#endif +} + +void InitDevices(bool init_p2p) { + /*Init all available devices by default */ + std::vector devices; +#ifdef PADDLE_WITH_CUDA + try { + int count = platform::GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; + } +#else + LOG(WARNING) + << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; +#endif + InitDevices(init_p2p, devices); +} + +void InitDevices(bool init_p2p, const std::vector devices) { + std::vector places; + int count = 0; +#ifdef PADDLE_WITH_CUDA + try { + count = platform::GetCUDADeviceCount(); + } catch (const std::exception &exp) { + LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; + } +#else + LOG(WARNING) + << "'CUDA' is not supported, Please re-compile with WITH_GPU option"; +#endif + + for (size_t i = 0; i < devices.size(); ++i) { + if (devices[i] >= count || devices[i] < 0) { + LOG(WARNING) << "Invalid devices id."; + continue; + } + places.emplace_back(platform::CUDAPlace(devices[i])); + } + if (init_p2p) { + InitP2P(devices); + } + places.emplace_back(platform::CPUPlace()); + platform::DeviceContextPool::Init(places); +#ifndef PADDLE_WITH_MKLDNN + platform::SetNumThreads(1); +#endif +} + +void InitGLOG(const std::string &prog_name) { + // glog will not hold the ARGV[0] inside. + // Use strdup to alloc a new string. 
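// [Editor's note, illustrative only; not part of this patch.] As the comment above
// restates, glog keeps the program-name pointer it is given rather than copying the
// string, so the temporary from prog_name.c_str() is duplicated with strdup before the
// call below. A minimal start-up sketch using the helpers defined in this file, assuming
// a CPU-only build; the flag name is the one defined in cpu_info.cc, the program name is
// hypothetical:
//   paddle::framework::InitGflags({"--fraction_of_cpu_memory_to_use=0.5"});
//   paddle::framework::InitGLOG("my_trainer");
//   paddle::framework::InitDevices(true /* init_p2p */);  // registers CPU (and any GPU) places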
+ google::InitGoogleLogging(strdup(prog_name.c_str())); + google::InstallFailureSignalHandler(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/init.h b/paddle/fluid/platform/init.h similarity index 100% rename from paddle/fluid/framework/init.h rename to paddle/fluid/platform/init.h diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..eef1470a90c7da15efff965fc8f66dfa616ba25f --- /dev/null +++ b/paddle/fluid/platform/init_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" + +TEST(InitDevices, CPU) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifndef PADDLE_WITH_CUDA + InitDevices(true); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U); +#endif +} + +TEST(InitDevices, CUDA) { + using paddle::framework::InitDevices; + using paddle::platform::DeviceContextPool; + +#ifdef PADDLE_WITH_CUDA + int count = paddle::platform::GetCUDADeviceCount(); + InitDevices(true); + DeviceContextPool& pool = DeviceContextPool::Instance(); + ASSERT_EQ(pool.size(), 1U + static_cast(count)); +#endif +} diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index de711b7d23ef01d57a62087c552ea090f01f0386..33fec2c1073819d88d85a8872227adcb9df3e8f4 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" @@ -99,5 +100,155 @@ inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { memory.get_primitive_desc().desc().data.format); } +inline mkldnn::memory::format GetMKLDNNFormat( + const mkldnn::sum::primitive_desc& memory) { + return static_cast( + memory.dst_primitive_desc().desc().data.format); +} + +class MKLDNNHandler { + public: + MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + key_(base_key), + is_reusing_(false) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_src_mem_p"); + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); + } + + std::shared_ptr AcquireDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); + } + + std::shared_ptr AcquireDiffSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::primitive_desc mdp, void* ptr, + const std::string& suffix) { + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mdp, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory(const mkldnn::memory::desc& md, + void* ptr, + const std::string& suffix) { + /*Generate key*/ + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. 
So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory( + mkldnn::memory::primitive_desc& mpd, // NOLINT + mkldnn::memory::primitive_desc& user_mpd, // NOLINT + const std::shared_ptr user_memory_p, + const std::string& suffix, + std::vector& pipeline) { // NOLINT + // create reorder primitive if the input format is not the preferred one + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (target_memory_p == nullptr) { + target_memory_p = user_memory_p; + std::shared_ptr reorder_p; + if (mpd != user_mpd) { + target_memory_p = std::make_shared(mpd); + + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + dev_ctx_.SetBlob(local_key, target_memory_p); + } else { + // Make reorder if needed + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + pipeline.push_back(*reorder_p); + } + is_reusing_ = true; + } + return target_memory_p; + } + + static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT + const std::string& suffix) { + auto dims2str = [](const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + + return dims2str(operand_dims) + suffix; + } + + protected: + const MKLDNNDeviceContext& dev_ctx_; + mkldnn::engine engine_; + std::string key_; + bool is_reusing_; +}; + +inline mkldnn::memory::format MKLDNNFormatForSize( + size_t dims_size, mkldnn::memory::format data_format) { + if (dims_size == 1) { + return mkldnn::memory::format::x; + } else if (dims_size == 2) { + return mkldnn::memory::format::nc; + } + return data_format; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index ad54a878996bd36f2d714f6554b44c89dae3fd0c..e3ee504f3d042d6a99036e34507c4c8bee306750 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -30,6 +30,7 @@ struct CPUPlace { // needed for variant equality comparison inline bool operator==(const CPUPlace &) const { return true; } inline bool operator!=(const CPUPlace &) const { return false; } + inline bool operator<(const CPUPlace &) const { return false; } }; struct CUDAPlace { @@ -42,6 +43,7 @@ struct CUDAPlace { return device == o.device; } inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } + inline bool operator<(const CUDAPlace &o) const { return device < o.device; } int device; }; @@ -52,6 +54,7 @@ struct CUDAPinnedPlace { // needed for variant equality comparison inline bool operator==(const CUDAPinnedPlace &) const { return true; } inline bool operator!=(const CUDAPinnedPlace &) const { return false; } + inline bool operator<(const CUDAPinnedPlace &) const { return false; } }; struct IsCUDAPlace : public boost::static_visitor { @@ -89,18 +92,6 @@ bool is_cuda_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); -struct PlaceHash { - std::size_t operator()(const Place &p) const { - constexpr size_t num_dev_bits = 4; - std::hash ihash; - size_t dev_id 
= 0; - if (is_gpu_place(p)) { - dev_id = boost::get(p).device; - } - return ihash(dev_id << num_dev_bits | p.which()); - } -}; - std::ostream &operator<<(std::ostream &, const Place &); template diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4fef351c2118e43697606c90a616cd870e78cd77..89ca4f781273e99bbb83216c238dfc5c88c0a22b 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,13 +2,13 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method + DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method parallel_executor ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method + DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method parallel_executor ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 08a2f185e117718d07ba984f76dfe5bf8229c33c..831f30e35fd3e01ce0f0524f6f85dd59494f5353 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -18,10 +18,13 @@ namespace paddle { namespace pybind { void BindException(pybind11::module* m) { + static pybind11::exception eof(*m, "EOFException"); static pybind11::exception exc(*m, "EnforceNotMet"); pybind11::register_exception_translator([](std::exception_ptr p) { try { if (p) std::rethrow_exception(p); + } catch (const platform::EOFException& e) { + eof(e.what()); } catch (const platform::EnforceNotMet& e) { exc(e.what()); } diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index bcf6d4dd3087060c016e53722cde80704ef2e834..fcd3356d44ee592233c3883d439d0677714900b8 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) { .value("STRINGS", pd::proto::AttrType::STRINGS) .value("BOOL", pd::proto::AttrType::BOOLEAN) .value("BOOLS", pd::proto::AttrType::BOOLEANS) - .value("BLOCK", pd::proto::AttrType::BLOCK); + .value("BLOCK", pd::proto::AttrType::BLOCK) + .value("BLOCKS", pd::proto::AttrType::BLOCKS); pybind11::class_ op_desc(*m, "OpDesc", ""); op_desc @@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) { .def("set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) + .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_serialized_attr", [](pd::OpDesc &self, const std::string &name, const pybind11::bytes &seriralized) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index bd5c613f8cf794df5dfeb7517ed4350f9b3b6099..0c523b6f176345c0407b8541c04fb8c3b27f7c60 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -34,7 +33,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/const_value.h" @@ -144,28 +145,75 @@ PYBIND11_PLUGIN(core) { py::class_(m, "LoDTensor") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) - .def( - "__init__", - [](LoDTensor &instance, const std::vector> &lod) { - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - new (&instance) LoDTensor(new_lod); - }) + .def("__init__", + [](LoDTensor &instance, const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE( + CheckLoD(new_offset_lod, -1), + "the provided recursive_sequence_lengths info is invalid"); + new (&instance) LoDTensor(new_offset_lod); + }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) + // We implement offset based LOD in C++ while we use length based with + // Python API. So we changed set_lod to set_recursive_sequence_lengths to + // avoid misuse. + // The discussion is here: + // https://github.com/PaddlePaddle/Paddle/issues/10855 .def("set_lod", [](LoDTensor &self, const std::vector> &lod) { + // the input lod is offset-based level-of-detail info LoD new_lod; new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()), + "the provided lod info is invalid"); self.set_lod(new_lod); }) - .def("lod", [](LoDTensor &self) -> std::vector> { - auto lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; + .def("set_recursive_sequence_lengths", + [](LoDTensor &self, const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + "the provided recursive_sequence_lengths info is invalid"); + self.set_lod(new_offset_lod); + }) + .def("lod", + [](LoDTensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }) + // Set above comments of set_lod. 
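// [Editor's note, illustrative only; not part of this patch.] A sketch of the two
// equivalent LoD descriptions as seen from Python, assuming the usual
// paddle.fluid.core module path and a tensor whose outer dimension is 5 (numpy
// assumed imported as np). Two sequences of lengths 2 and 3 give the length-based
// form [[2, 3]], which ConvertToOffsetBasedLoD turns into the offset-based form
// [[0, 2, 5]]:
//   t = fluid.core.LoDTensor()
//   t.set(np.random.rand(5, 1).astype('float32'), fluid.core.CPUPlace())
//   t.set_recursive_sequence_lengths([[2, 3]])   # length-based: 2 + 3 == 5 rows
//   # t.lod() now reports the offset-based form [[0, 2, 5]]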
+ .def("recursive_sequence_lengths", + [](LoDTensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }) + .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the LoDTensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); }); py::class_(m, "SelectedRows") @@ -248,7 +296,38 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference); py::class_(m, "Reader", "") - .def("reset", &framework::ReaderHolder::ReInit); + .def("reset", &framework::ReaderHolder::ResetAll); + + using LoDTensorBlockingQueue = + ::paddle::operators::reader::LoDTensorBlockingQueue; + using LoDTensorBlockingQueueHolder = + ::paddle::operators::reader::LoDTensorBlockingQueueHolder; + py::class_(m, "LoDTensorBlockingQueue", "") + .def("push", + [](LoDTensorBlockingQueue &self, + const std::vector &lod_tensor_vec) { + pybind11::gil_scoped_release release; + return self.Push(lod_tensor_vec); + }) + .def("size", &LoDTensorBlockingQueue::Size) + .def("capacity", &LoDTensorBlockingQueue::Cap) + .def("close", &LoDTensorBlockingQueue::Close) + .def("is_closed", &LoDTensorBlockingQueue::IsClosed); + + m.def("init_lod_tensor_blocking_queue", + [](Variable &var, size_t capacity, + const std::vector> &shapes) + -> LoDTensorBlockingQueue * { + std::vector dims(shapes.size()); + std::transform(shapes.begin(), shapes.end(), dims.begin(), + [](const std::vector &shape) { + return make_ddim(shape); + }); + auto *holder = var.GetMutable(); + holder->InitOnce(capacity, dims); + return holder->GetQueue().get(); + }, + py::return_value_policy::reference); py::class_(m, "Scope", "") .def("var", @@ -414,11 +493,14 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) #ifdef PADDLE_WITH_DISTRIBUTE - .def("complete", &Executor::Complete) + .def("begin_pass", &Executor::BeginPass) + .def("end_pass", &Executor::EndPass) #endif - .def("run", - (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & - Executor::Run); + .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, + int block_id, bool create_local_scope, bool create_vars) { + pybind11::gil_scoped_release release; + self.Run(prog, scope, block_id, create_local_scope, create_vars); + }); m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); @@ -562,7 +644,11 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { self.debug_graphviz_path_ = path; - }); + }) + .def_property( + "enable_data_balance", + [](const BuildStrategy &self) { return self.enable_data_balance_; }, + [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }); pe.def(py::init &, const std::unordered_set &, @@ -584,7 +670,12 @@ All parameter, weight, gradient are variables in Paddle. 
&ParallelExecutor::FeedTensorsIntoLocalScopes) .def("feed_and_split_tensor_into_local_scopes", &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", &ParallelExecutor::Run); + .def("run", [](ParallelExecutor &self, + const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + pybind11::gil_scoped_release release; + self.Run(fetch_tensors, fetched_var_name); + }); BindRecordIOWriter(&m); return m.ptr(); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 93b09ed6922b32a5531224acc470daf0d97f95bd..3e2ea1ef88b03f5b2576c1cee2b5d26a439943da 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -97,7 +97,7 @@ struct CastToPyBufferImpl { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) { auto buffer_info = details::CastToPyBufferImpl()(tensor); + uint8_t, platform::float16>()(tensor); return buffer_info; } @@ -146,7 +146,7 @@ void PyCPUTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCPUTensorSetFromArray( +inline void PyCPUTensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -185,7 +185,7 @@ void PyCUDATensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDATensorSetFromArray( +inline void PyCUDATensorSetFromArray( framework::Tensor *self, pybind11::array_t @@ -224,7 +224,7 @@ void PyCUDAPinnedTensorSetFromArray( template <> // This following specialization maps uint16_t in the parameter type to // platform::float16. -void PyCUDAPinnedTensorSetFromArray( +inline void PyCUDAPinnedTensorSetFromArray( framework::Tensor *self, pybind11::array_t diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 062095a1c3e977c0bcc89346ead765acb023bcf7..47de23377398423dabf3b0ed5b670e564f57cdfb 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -83,6 +83,13 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...)); } +template +std::string Sprintf(const Args&... args) { + std::ostringstream oss; + Fprintf(oss, ""); + return oss.str(); +} + template std::string Sprintf(const char* fmt, const Args&... 
args) { std::ostringstream oss; diff --git a/paddle/fluid/string/printf_test.cc b/paddle/fluid/string/printf_test.cc index 678029f93534ab374bd29083f8991d632ccdd5a1..544b12ef3a877a6e84c136433799301edaa4abdf 100644 --- a/paddle/fluid/string/printf_test.cc +++ b/paddle/fluid/string/printf_test.cc @@ -27,4 +27,5 @@ TEST(StringPrintf, StringPrintf) { EXPECT_EQ(std::string("Wednesday, July 27, 14:44"), paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min)); + EXPECT_EQ(std::string(""), paddle::string::Sprintf()); } diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index 813d8386868558bd62a9d5670d540ddeddb2b77d..4425f062efa6eab552caee1a429746528cd66926 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -15,11 +15,11 @@ #include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp deleted file mode 100644 index 2dc931c5d7e727679d435470544e60f9b5ce2bde..0000000000000000000000000000000000000000 --- a/paddle/function/BufferArg.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "BufferArg.h" -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -const SequenceArg& BufferArg::sequence() const { - CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); - return dynamic_cast(*this); -} - -const SparseMatrixArg& BufferArg::sparse() const { - CHECK_EQ(bufferType_, TENSOR_SPARSE); - return dynamic_cast(*this); -} - -SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType) - : BufferArg(sparse, argType), - row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), - col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32), - nnz_(sparse.getElementCnt()), - format_(static_cast(sparse.getFormat())), - type_(static_cast(sparse.getValueType())) { - bufferType_ = TENSOR_SPARSE; -} - -SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType) - : BufferArg(sparse, argType), - row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), - col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32), - nnz_(sparse.getElementCnt()), - format_(static_cast(sparse.getFormat())), - type_(static_cast(sparse.getValueType())) { - bufferType_ = TENSOR_SPARSE; -} - -} // namespace paddle diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h deleted file mode 100644 index 6de8c94e778c8d1439b2a2aa3c581a5a3cf70261..0000000000000000000000000000000000000000 --- a/paddle/function/BufferArg.h +++ /dev/null @@ -1,364 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "TensorShape.h" -#include "TensorType.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -enum BufferType { - TENSOR_UNKNOWN = 0, - TENSOR_NORMAL = 1, - TENSOR_SEQUENCE_ID = 2, - TENSOR_SEQUENCE_DATA = 3, - TENSOR_SPARSE = 4 -}; - -class BufferArg; -class SequenceArg; -class SparseMatrixArg; - -/** - * \brief BufferArg used as the argument type of Function. - * - * The arguments of the Paddle Function have four Buffer types. - * 1. BufferArg for a dense Buffer of any dimension. - * 2. SequenceIdArg for a Buffer of sequence start positions. - * 3. SequenceArg for a Buffer of sequence data. - * 4. SparseMatrixArg for a Buffer of sparse matrix. - * - * Buffer shape - * For most buffers, the first dimension `shape()[0]` represents - * the size of the mini-batch. - * - * Buffer argType - * There is an ArgType property for the BufferArg used as Function Output. - * Whether the result of the Function calculation is assigned to the - * output Buffer or added to the output Buffer is determined by the - * argType_ property of the output BufferArg. - */ - -// ArgType is only used by output BufferArg. -// For input argument, argType_ is ignored. -// For output argument, need to set the argType_ of the BufferArg. 
-enum ArgType { - UNSPECIFIED = 0, - ASSIGN_TO = 1, - ADD_TO = 2, -}; -class BufferArg { - public: - void setArgType(ArgType argType) { argType_ = argType; } - - ArgType getArgType() const { return argType_; } - - public: - BufferArg(ValueType valueType, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) { - bufferType_ = TENSOR_NORMAL; - } - - BufferArg(void* buf, - ValueType valueType, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) { - bufferType_ = TENSOR_NORMAL; - } - - BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) { - bufferType_ = TENSOR_NORMAL; - } - - BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(matrix.getData()))), - valueType_(DataType::value), - shape_(2), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - shape_.setDim(0, matrix.getHeight()); - shape_.setDim(1, matrix.getWidth()); - } - - BufferArg(const Matrix& matrix, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(matrix.getData()))), - valueType_(DataType::value), - shape_(shape), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - CHECK_EQ(matrix.getElementCnt(), shape.getElements()); - } - - BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(vector.getData()))), - valueType_(DataType::value), - shape_(1), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - shape_.setDim(0, vector.getSize()); - } - - BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED) - : buf_( - const_cast(reinterpret_cast(vector.getData()))), - valueType_(VALUE_TYPE_INT32), - shape_(1), - argType_(argType) { - bufferType_ = TENSOR_NORMAL; - shape_.setDim(0, vector.getSize()); - } - - template - typename Tensor::Matrix matrix() const { - CHECK(buf_); - CHECK(valueType_ == DataType::value); - // CHECK(deviceType_ == DType); - CHECK_EQ((size_t)2, shape_.ndims()); - return typename Tensor::Matrix( - reinterpret_cast(buf_), shape_[0], shape_[1]); - } - - template - typename Tensor::Vector vector() const { - CHECK(buf_); - CHECK(valueType_ == DataType::value); - // CHECK(deviceType_ == DType); - CHECK_EQ((size_t)1, shape_.ndims()); - return typename Tensor::Vector( - shape_[0], reinterpret_cast(buf_)); - } - - virtual ~BufferArg() {} - - template - T* data() const { - return reinterpret_cast(buf_); - } - - void* data() const { return buf_; } - ValueType valueType() const { return valueType_; } - BufferType bufferType() const { return bufferType_; } - const TensorShape& shape() const { return shape_; } - bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; } - bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; } - virtual size_t numElements() const { return shape_.getElements(); } - - const SequenceArg& sequence() const; - const SparseMatrixArg& sparse() const; - - protected: - void* buf_; - ValueType valueType_; - TensorShape shape_; - BufferType bufferType_{TENSOR_UNKNOWN}; - ArgType argType_{UNSPECIFIED}; - // TODO(tianbing), add deviceType_ - // leading dimensions. 
The size is dims_.size() - // Dims lds_; -}; - -// sequence start positions in a mini-batch of sequences -// shape_.ndims() == 1 -// valueType_ = int32 -// if a < b then value_.buf_[a] < value_.buf_[b] -class SequenceIdArg : public BufferArg { - public: - SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED) - : BufferArg(VALUE_TYPE_INT32, shape, argType) { - bufferType_ = TENSOR_SEQUENCE_ID; - CHECK_EQ(shape_.ndims(), 1UL); - CHECK_GE(shape_[0], 1UL); - numSeqs_ = shape_[0] - 1; - } - - SequenceIdArg(void* buf, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { - bufferType_ = TENSOR_SEQUENCE_ID; - CHECK_EQ(shape_.ndims(), 1UL); - numSeqs_ = shape_[0] - 1; - } - - SequenceIdArg(const IVector& vector) : BufferArg(vector) { - bufferType_ = TENSOR_SEQUENCE_ID; - numSeqs_ = shape_[0] - 1; - } - - ~SequenceIdArg() {} - - size_t numSeqs() const { return numSeqs_; } - - private: - size_t numSeqs_; -}; - -// sequences data -// For mini-batch calculate, -// one batch can contain more than one sequence of data. -// SequenceArg can be used to represent sequences that contain multiple -// unequal lengths. -class SequenceArg : public BufferArg { - public: - SequenceArg(ValueType valueType, - const TensorShape& shape, - ArgType argType = UNSPECIFIED) - : BufferArg(valueType, shape, argType), - startPositions_(TensorShape({shape[0]})) { - bufferType_ = TENSOR_SEQUENCE_DATA; - } - - SequenceArg(void* buf, - ValueType valueType, - const TensorShape& shape, - const SequenceIdArg& startPositions, - ArgType argType = UNSPECIFIED) - : BufferArg(buf, valueType, shape, argType), - startPositions_(startPositions) { - bufferType_ = TENSOR_SEQUENCE_DATA; - } - - SequenceArg(const Matrix& matrix, - const IVector& vector, - ArgType argType = UNSPECIFIED) - : BufferArg(matrix, argType), startPositions_(vector) { - bufferType_ = TENSOR_SEQUENCE_DATA; - } - - ~SequenceArg() {} - - void* getIdBuf() const { return startPositions_.data(); } - size_t numSeqs() const { return startPositions_.numSeqs(); } - SequenceIdArg& getSequenceId() { return startPositions_; } - const SequenceIdArg& getSequenceId() const { return startPositions_; } - - private: - SequenceIdArg startPositions_; -}; - -// sparse matrix -// valueType_ == float or double -// shape_.ndims() == 2 -class SparseMatrixArg : public BufferArg { - public: - SparseMatrixArg(void* buf, - ValueType valueType, - const TensorShape& shape, - const BufferArg& row, - const BufferArg& col, - size_t nnz, - SparseFormat format, - SparseValueType type, - ArgType argType = UNSPECIFIED) - : BufferArg(buf, valueType, shape, argType), - row_(row), - col_(col), - nnz_(nnz), - format_(static_cast(format)), - type_(static_cast(type)) { - bufferType_ = TENSOR_SPARSE; - CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); - CHECK_EQ(shape_.ndims(), 2UL); - CHECK_EQ(row_.shape().ndims(), 1UL); - CHECK_EQ(col_.shape().ndims(), 1UL); - if (format_ == T_SPARSE_CSR) { - CHECK_EQ(nnz, col.shape()[0]); - } else if (format_ == T_SPARSE_CSC) { - CHECK_EQ(nnz, row.shape()[0]); - } - } - - SparseMatrixArg(ValueType valueType, - const TensorShape& shape, - size_t nnz, - SparseFormat format, - SparseValueType type, - ArgType argType = UNSPECIFIED) - : BufferArg(valueType, shape, argType), - row_(BufferArg(nullptr, VALUE_TYPE_INT32)), - col_(BufferArg(nullptr, VALUE_TYPE_INT32)), - nnz_(nnz), - format_(static_cast(format)), - type_(static_cast(type)) { - bufferType_ = TENSOR_SPARSE; - 
CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); - CHECK_EQ(shape_.ndims(), 2UL); - - /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr - row_ = (format_ == T_SPARSE_CSR - ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1}) - : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})); - /// len of col_ : width + 1 (CSC) or nnz (CSR), buf_ == nullptr - col_ = (format_ == T_SPARSE_CSR - ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}) - : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1})); - } - - SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); - - SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); - - template - typename Tensor::SparseMatrix SparseMatrix() const { - CHECK(buf_); - CHECK(valueType_ == DataType::value); - // CHECK(deviceType_ == DType); - CHECK_EQ(2UL, shape_.ndims()); - return typename Tensor::SparseMatrix( - reinterpret_cast(buf_), - reinterpret_cast(row_.data()), - reinterpret_cast(col_.data()), - shape_[0], - shape_[1], - nnz_, - static_cast(type_), - static_cast(format_), - false); - } - - ~SparseMatrixArg() {} - - void* getRowBuf() const { return row_.data(); } - - void* getColBuf() const { return col_.data(); } - - size_t nnz() const { return nnz_; } - - size_t numElements() const override { return nnz_; } - - SparseDataFormat dataFormat() const { return format_; } - - SparseDataType dataType() const { return type_; } - - private: - BufferArg row_; - BufferArg col_; - size_t nnz_; - SparseDataFormat format_; - SparseDataType type_; -}; - -} // namespace paddle diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp deleted file mode 100644 index 1a6e0110afb64c8b4f164d71e31e5f9bfcdee4a8..0000000000000000000000000000000000000000 --- a/paddle/function/BufferArgTest.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "BufferArg.h" -#include -#include "paddle/math/MemoryHandle.h" - -namespace paddle { - -TEST(BufferTest, BufferArg) { - TensorShape shape({8, 10}); - CpuMemoryHandle memory(shape.getElements() * - sizeOfValuType(VALUE_TYPE_FLOAT)); - BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape); - EXPECT_EQ(buffer.data(), memory.getBuf()); -} - -TEST(BufferTest, SequenceIdArg) { - TensorShape shape({10}); - CpuMemoryHandle memory(shape.getElements() * - sizeOfValuType(VALUE_TYPE_INT32)); - SequenceIdArg buffer(memory.getBuf(), shape); - EXPECT_EQ(buffer.data(), memory.getBuf()); - EXPECT_EQ(buffer.numSeqs(), 9U); -} - -} // namespace paddle diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp deleted file mode 100644 index 1187842452460ac3fd71f48150fab6467f93dc6c..0000000000000000000000000000000000000000 --- a/paddle/function/ContextProjectionOp.cpp +++ /dev/null @@ -1,412 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
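For the SparseMatrixArg constructors above, the two index buffers are sized by format: CSR keeps height + 1 row offsets plus nnz column indices, while CSC keeps nnz row indices plus width + 1 column offsets. A small standalone sketch of that sizing rule with assumed dimensions:

#include <cstdio>

int main() {
  const size_t height = 4, width = 6, nnz = 7;  // hypothetical sparse 4x6 matrix
  const bool isCSR = true;                      // T_SPARSE_CSR vs. T_SPARSE_CSC
  // Mirrors: row_ = CSR ? TensorShape{height + 1} : TensorShape{nnz}
  //          col_ = CSR ? TensorShape{nnz}        : TensorShape{width + 1}
  size_t rowLen = isCSR ? height + 1 : nnz;
  size_t colLen = isCSR ? nnz : width + 1;
  std::printf("row buffer: %zu entries, col buffer: %zu entries\n", rowLen, colLen);
  return 0;
}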
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ContextProjectionOp.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" - -namespace paddle { -/** - * Context Projection Forward with CPU Matrix Device. - * - */ -template <> -void ContextProjectionForward(CpuMatrix& out_mat, - const CpuMatrix& input_mat, - const CpuMatrix& weight_mat, - const CpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t begin_pad) { - const int* starts = seq_vec.getData(); - const size_t num_sequences = seq_vec.getSize() - 1; - for (size_t i = 0; i < num_sequences; ++i) { - for (size_t j = 0; j < context_length; ++j) { - int begin = starts[i] + context_start + j; - int end = starts[i + 1] + context_start + j; - int dst_begin = starts[i]; - int dst_end = starts[i + 1]; - if (begin < starts[i]) { - int64_t pad_size = - std::min(starts[i] - begin, starts[i + 1] - starts[i]); - MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size); - if (weight_mat) { - MatrixPtr sub = - const_cast(weight_mat).subMatrix(j, pad_size); - mat->addAtOffset(*sub, j * input_mat.getWidth()); - } - dst_begin = starts[i] + pad_size; - begin = starts[i]; - } - if (end > starts[i + 1]) { - int64_t pad_size = - std::min(end - starts[i + 1], starts[i + 1] - starts[i]); - MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size); - if (weight_mat) { - MatrixPtr sub = - const_cast(weight_mat) - .subMatrix(begin_pad + context_start + j - pad_size, - pad_size); - mat->addAtOffset(*sub, j * input_mat.getWidth()); - } - dst_end = starts[i + 1] - pad_size; - end = starts[i + 1]; - } - if (end <= begin) continue; - MatrixPtr src = - const_cast(input_mat).subMatrix(begin, end - begin); - MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin); - dst->addAtOffset(*src, j * input_mat.getWidth()); - } - } -} - -/** - * Paddle Function for Context Projection Forward. - * Calculate the output layer value sequence after context projection. - * - * What is Context Projection for a sequence? - * For example, assumed input (x) has 4 words and the dimension of each word - * representation is 2. 
If we use zero to pad instead of learned weight to pad, - * and the context_lenth is 3, the output (y) is: - * - * @code - * x = [a1, a2; - * b1, b2; - * c1, c2; - * d1, d2] - * y = [0, 0, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, d1, d2; - * c1, c2, d1, d2, 0, 0] - * @endcode - * - * \param outputs[0].matrix output layer value, n * (d * l) - * \param outputs[0].vector start position sequence, n * 1 - * \param inputs[0].matrix input layer value, n * d - * \param inputs[0].vector start position sequence, n * 1 - * \param inputs[1].matrix input layer weight, pad * d - */ -template -class ContextProjectionForwardFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - begin_pad_ = config.get("begin_pad"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK(1UL == inputs.size() || 2UL == inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here"; - const auto val_seqs = dynamic_cast(inputs[0]); - auto out_seq = dynamic_cast(outputs[0]); - - CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data()); - CHECK_EQ(out_seq.shape().ndims(), 2UL); - CHECK_EQ(val_seqs.shape().ndims(), 2UL); - /// dim of output = dim of input * context_length - CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_); - /// input and output has the same batch_size - CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]); - if (2UL == inputs.size()) { - CHECK_EQ(inputs[1].shape().ndims(), 2UL); - /// dim of input == dim of weight - CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]); - } - - CHECK_EQ(out_seq.getArgType(), ADD_TO); - auto out_mat = out_seq.matrix(); - const auto in_mat = val_seqs.matrix(); - const auto w_mat = - (2UL == inputs.size() && inputs[1].data()) - ? inputs[1].matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - const auto seq_vec = val_seqs.getSequenceId().vector(); - - ContextProjectionForward(out_mat, - in_mat, - w_mat, - seq_vec, - context_length_, - context_start_, - begin_pad_); - } - - private: - size_t context_length_; - int context_start_; - size_t begin_pad_; -}; - -/** - * Context Projection Backward with CPU Matrix Device. - * - */ -template <> -void ContextProjectionBackward(const CpuMatrix& out_grad_mat, - CpuMatrix& in_grad_mat, - CpuMatrix& w_grad_mat, - const CpuIVector& seq_vec, - size_t context_length, - int context_start, - size_t begin_pad, - bool is_padding, - size_t total_pad) { - size_t input_dim = in_grad_mat ? in_grad_mat.getWidth() - : w_grad_mat ? 
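The layout produced by ContextProjectionForward, shown in the @code example above, can be reproduced with a short standalone sketch that concatenates the context_length neighbouring rows for each position and zero-pads outside the sequence (dimensions and values here are assumed, not taken from the patch):

#include <cstdio>
#include <vector>

int main() {
  const int rows = 4, dim = 2, contextLength = 3, contextStart = -1;
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8};       // 4 x 2 input, row-major
  std::vector<float> y(rows * dim * contextLength, 0.f);  // 4 x 6 output
  for (int r = 0; r < rows; ++r) {
    for (int j = 0; j < contextLength; ++j) {
      int src = r + contextStart + j;        // neighbouring row index
      if (src < 0 || src >= rows) continue;  // zero padding outside the sequence
      for (int d = 0; d < dim; ++d)
        y[(r * contextLength + j) * dim + d] = x[src * dim + d];
    }
  }
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < dim * contextLength; ++c)
      std::printf("%g ", y[r * dim * contextLength + c]);
    std::printf("\n");  // first row prints 0 0 1 2 3 4, matching the example above
  }
  return 0;
}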
w_grad_mat.getWidth() : 0; - const int* starts = seq_vec.getData(); - size_t num_sequences = seq_vec.getSize() - 1; - for (size_t i = 0; i < num_sequences; ++i) { - for (size_t j = 0; j < context_length; ++j) { - int begin = starts[i] + context_start + j; - int end = starts[i + 1] + context_start + j; - int dst_begin = starts[i]; - int dst_end = starts[i + 1]; - if (begin < starts[i]) { - int64_t pad_size = - std::min(starts[i] - begin, starts[i + 1] - starts[i]); - if (is_padding && w_grad_mat) { - MatrixPtr mat = const_cast(out_grad_mat) - .subMatrix(starts[i], pad_size); - MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); - sub->addAtOffset(*mat, j * input_dim); - } - dst_begin = starts[i] + pad_size; - begin = starts[i]; - } - if (end > starts[i + 1]) { - int64_t pad_size = - std::min(end - starts[i + 1], starts[i + 1] - starts[i]); - if (is_padding && w_grad_mat) { - MatrixPtr mat = const_cast(out_grad_mat) - .subMatrix(starts[i + 1] - pad_size, pad_size); - MatrixPtr sub = w_grad_mat.subMatrix( - begin_pad + context_start + j - pad_size, pad_size); - sub->addAtOffset(*mat, j * input_dim); - } - dst_end = starts[i + 1] - pad_size; - end = starts[i + 1]; - } - if (end <= begin) continue; - if (!in_grad_mat) continue; - MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); - MatrixPtr dst = const_cast(out_grad_mat) - .subMatrix(dst_begin, dst_end - dst_begin); - src->addAtOffset(*dst, j * input_dim); - } - } -} - -/** - * Context Projection Backward Function. - * Update the weight gradient and input layer gradient with backprop - * - * \param inputs[0].matrix output layer grad, n * (d * l) - * \param inputs[0].vector start position sequence, n * 1 - * \param outputs[0].matrix input layer grad, n * d - * \param outputs[0].vector start position sequence, n * 1 - * \param outputs[1] weight grad, pad * d - */ -template -class ContextProjectionBackwardFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - begin_pad_ = config.get("begin_pad"); - is_padding_ = config.get("is_padding"); - total_pad_ = config.get("total_pad"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK(1UL == outputs.size() || 2UL == outputs.size()); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here"; - const auto in_seq = dynamic_cast(inputs[0]); - auto out_seq = dynamic_cast(outputs[0]); - CHECK(in_seq.data() && in_seq.getSequenceId().data()); - CHECK_EQ(in_seq.shape().ndims(), 2UL); - CHECK_EQ(out_seq.shape().ndims(), 2UL); - CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL); - - /// input and output grad has the same batch_size - CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]); - /// dim of output grad = dim of input grad * context_length - CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); - CHECK_EQ(out_seq.getArgType(), ADD_TO); - - if (2UL == outputs.size()) { - CHECK_EQ(outputs[1].shape().ndims(), 2UL); - /// dim of input grad == dim of weight - CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]); - CHECK_EQ(outputs[1].getArgType(), ADD_TO); - } - - const auto seq_vec = in_seq.getSequenceId().vector(); - const auto out_grad_mat = in_seq.matrix(); - auto in_grad_mat = - !out_seq.data() ? typename Tensor::Matrix(nullptr, 0, 0) - : out_seq.matrix(); - auto w_grad_mat = - (2UL == outputs.size() && outputs[1].data()) - ? 
outputs[1].matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - - ContextProjectionBackward(out_grad_mat, - in_grad_mat, - w_grad_mat, - seq_vec, - context_length_, - context_start_, - begin_pad_, - is_padding_, - total_pad_); - } - - private: - size_t context_length_; - int context_start_; - size_t begin_pad_; - bool is_padding_; - size_t total_pad_; -}; - -/** - * Context Projection Backward Data Function - * Update input layer grad - * input: sequence of output layer grad - * output: sequence of input layer grad - * - * \param outputs[0].matrix input layer grad, n * d - * \param outputs[0].vector start position sequence, n * 1 - * \param inputs[0].matrix output layer grad, n * (d * l) - * \param inputs[0].vector start positon sequence, n * 1 - */ -template -class ContextProjectionBackwardDataFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here"; - const auto in_seq = dynamic_cast(inputs[0]); - const auto out_seq = dynamic_cast(outputs[0]); - - CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data()); - CHECK_EQ(out_seq.shape().ndims(), 2UL); - CHECK_EQ(in_seq.shape().ndims(), 2UL); - CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); - /// output layer grad dim == input layer grad dim * context_length_ - CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_); - /// input and output has the same batch_size - CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - const auto out_grad_mat = in_seq.matrix(); - const auto seq_vec = in_seq.getSequenceId().vector(); - auto in_grad_mat = out_seq.matrix(); - - ContextProjectionBackwardData( - out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_); - } - - private: - size_t context_length_; - int context_start_; -}; - -/** - * Context Projection Backward Weight Function - * Update weight grad by backprop - * input: sequence of output layer grad - * output: weight grad - * - * \param outputs[0] weight grad, pad * d - * \param inputs[0].matrix output layer grad, n * (d * l) - * \param inputs[0].vecotr start positon sequence, n * 1 - */ -template -class ContextProjectionBackwardWeightFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - context_length_ = config.get("context_length"); - context_start_ = config.get("context_start"); - begin_pad_ = config.get("begin_pad"); - total_pad_ = config.get("total_pad"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here"; - const auto in_seq = dynamic_cast(inputs[0]); - CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data()); - CHECK_EQ(outputs[0].shape().ndims(), 2UL); - CHECK_EQ(in_seq.shape().ndims(), 2UL); - CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); - CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]); - /// output layer grad dim == weight dim * context_length_ - CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - const auto seq_vec = 
in_seq.getSequenceId().vector(); - const auto out_grad_mat = in_seq.matrix(); - auto w_grad_mat = outputs[0].matrix(); - ContextProjectionBackwardWeight(out_grad_mat, - w_grad_mat, - seq_vec, - context_length_, - context_start_, - total_pad_, - begin_pad_); - } - - private: - size_t context_length_; - int context_start_; - size_t begin_pad_; - size_t total_pad_; -}; - -REGISTER_TYPED_FUNC(ContextProjectionForward, - CPU, - ContextProjectionForwardFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackward, - CPU, - ContextProjectionBackwardFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(ContextProjectionForward, - GPU, - ContextProjectionForwardFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackward, - GPU, - ContextProjectionBackwardFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackwardData, - GPU, - ContextProjectionBackwardDataFunc); -REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, - GPU, - ContextProjectionBackwardWeightFunc); -#endif -} // namespace paddle diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp deleted file mode 100644 index d805c3ae927321fc74946e202b98401b6b3cd0f7..0000000000000000000000000000000000000000 --- a/paddle/function/ContextProjectionOpTest.cpp +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "FunctionTest.h" -#include "paddle/math/Matrix.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -void testMatrixProjectionForward(int context_start, - size_t context_length, - bool is_padding, - size_t batch_size, - size_t input_dim) { - size_t pad = std::max(0, -context_start) + - std::max(0, (int)(context_start + context_length - 1)); - if (pad == 0) is_padding = false; - - CpuGpuFuncCompare test( - "ContextProjectionForward", - FuncConfig() - .set("context_length", context_length) - .set("context_start", context_start) - .set("begin_pad", (size_t)std::max(0, -context_start))); - - // prepare input arguments - test.addSequence(SequenceIdArg(TensorShape{batch_size})); - test.addInputs( - SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim})); - if (is_padding) { // weight - test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim})); - } - test.addOutputs( - SequenceArg(VALUE_TYPE_FLOAT, - TensorShape{batch_size, input_dim * context_length}), - ADD_TO); - - // run Function - test.run(); -} - -void testMatrixProjectionBackward(int context_start, - size_t context_length, - bool is_padding, - size_t batch_size, - size_t input_dim) { - size_t pad = std::max(0, -context_start) + - std::max(0, (int)(context_start + context_length - 1)); - if (pad == 0) is_padding = false; - - CpuGpuFuncCompare test( - "ContextProjectionBackward", - FuncConfig() - .set("context_length", context_length) - .set("context_start", context_start) - .set("begin_pad", (size_t)std::max(0, -context_start)) - .set("is_padding", is_padding) - .set("total_pad", pad)); - - // prepare input arguments - test.addSequence(SequenceIdArg(TensorShape{batch_size})); - test.addInputs(SequenceArg( - VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length})); - test.addOutputs( - SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}), - ADD_TO); - if (is_padding) { // weight - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}), - ADD_TO); - } - - // run Function - test.run(); -} - -TEST(ContextProjection, Projection) { - for (auto context_start : {-5, -3, -1, 0, 3}) { - for (auto context_length : {1, 2, 5, 7}) { - for (auto trainable_padding : {false, true}) { - for (auto batch_size : {1, 2, 5, 20, 100}) { - for (auto input_dim : {15, 32, 63, 128, 200}) { - VLOG(3) << " context_start=" << context_start - << " context_length=" << context_length - << " trainable_padding=" << trainable_padding - << " batch_size=" << batch_size - << " input_dim=" << input_dim; - testMatrixProjectionForward(context_start, - context_length, - trainable_padding, - batch_size, - input_dim); - testMatrixProjectionBackward(context_start, - context_length, - trainable_padding, - batch_size, - input_dim); - } - } - } - } - } -} diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp deleted file mode 100644 index 2c25e1af44965d30591faeccc9a181e36c7e0a0f..0000000000000000000000000000000000000000 --- a/paddle/function/CosSimOp.cpp +++ /dev/null @@ -1,240 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
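The pad size used by these tests adds the rows needed before and after a sequence for the chosen context window; a quick standalone check of that arithmetic with assumed parameters:

#include <algorithm>
#include <cstdio>

int main() {
  int context_start = -2;  // hypothetical values, as in the test loops
  int context_length = 5;
  size_t pad = std::max(0, -context_start) +
               std::max(0, context_start + context_length - 1);
  // left pad = 2 rows before the sequence, right pad = 2 rows after it.
  std::printf("total padding rows: %zu\n", pad);  // prints 4
  return 0;
}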
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CosSimOp.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" - -namespace paddle { -/** - * Cosine Similarity for CpuMatrix - * - * \param out_mat, output value, size: nSamples * 1. - * \param in1_mat, input value 1, size: nSamples * dim. - * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param scale, default 1.0 - * - */ -template <> -void CosSimForward(CpuMatrix& out_mat, - const CpuMatrix& in1_mat, - const CpuMatrix& in2_mat, - real scale) { - CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData()); - size_t num_samples = out_mat.getHeight(); - size_t dim = in1_mat.getWidth(); - /// column vector [nSamples, 1] - real* out = out_mat.getData(); - const real* x = in1_mat.getData(); - const real* y = in2_mat.getData(); - - /// in2 might only have one row or full rows - CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples); - size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim; - for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) { - real square_sum_x = 0; - real square_sum_y = 0; - real xy = 0; - for (size_t j = 0; j < dim; ++j) { - square_sum_x += x[j] * x[j]; - square_sum_y += y[j] * y[j]; - xy += x[j] * y[j]; - } - CHECK(square_sum_x > 0 && square_sum_y > 0); - out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); - } -} - -/** - * Cosine Similarity - * for each row i, - * out[i] = scale * cos(input1[i], input2[i]) - * = scale * /sqrt(|input1[i]|^2 * |input2[i]|^2) - * when input2 only has one row, then for each row i, - * out[i] = cos(input1[i], input2[0]) - * - * \param inputs[0] input matrix 1, size: nSamples * dim. - * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param outputs[0] output matrix, size : nSamples * 1. - */ - -template -class CosSimForwardFunc : public FunctionBase { - void init(const FuncConfig& config) override { - scale_ = config.get("scale"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(inputs.size(), 2UL); - CHECK_EQ(outputs.size(), 1UL); - - CHECK_EQ(inputs[0].shape().ndims(), 2UL); - CHECK_EQ(inputs[1].shape().ndims(), 2UL); - CHECK_EQ(outputs[0].shape().ndims(), 2UL); - - CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); - CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); - CHECK_EQ(outputs[0].shape()[1], 1UL); - - CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data()); - - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - auto out_mat = outputs[0].matrix(); - const auto in1_mat = inputs[0].matrix(); - const auto in2_mat = inputs[1].matrix(); - - CosSimForward(out_mat, in1_mat, in2_mat, scale_); - } - - private: - real scale_; -}; - -/** - * Cosine Similarity Derivative for CpuMatrix - * - * \param in1_grad forward input grad 1, size: nSamples * dim. - * \param in2_grad forward input grad 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). - * - * \param out_grad backward loss output grad, size : nSamples * 1. - * \param out_val forward output value, size: nSamples * 1. - * \param in1_val forward input value 1, size: nSamples * dim. 
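Per row, CosSimForward above computes scale * <x, y> / (|x| * |y|); a minimal standalone reference of the same formula for one pair of rows (values assumed):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y = {4.f, 5.f, 6.f};
  float scale = 1.f, xy = 0.f, xx = 0.f, yy = 0.f;
  for (size_t j = 0; j < x.size(); ++j) {
    xy += x[j] * y[j];  // dot product
    xx += x[j] * x[j];  // squared norm of x
    yy += y[j] * y[j];  // squared norm of y
  }
  float out = scale * xy / (std::sqrt(xx) * std::sqrt(yy));
  std::printf("cosine similarity: %f\n", out);  // ~0.974632
  return 0;
}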
- * \param in2_val forward input value 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). - * \param scale, default 1.0 - */ -template <> -void CosSimBackward(const CpuMatrix& out_grad, - const CpuMatrix& out_val, - const CpuMatrix& in1_val, - const CpuMatrix& in2_val, - CpuMatrix& in1_grad, - CpuMatrix& in2_grad, - real scale) { - CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && - in2_val.getData() && in1_grad.getData() && in2_grad.getData()); - CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required"; - - const real* grad = out_grad.getData(); - const real* out = out_val.getData(); - const real* prev_out_x = in1_val.getData(); - const real* prev_out_y = in2_val.getData(); - real* prev_grad_x = in1_grad.getData(); - real* prev_grad_y = in2_grad.getData(); - - size_t num_samples = out_grad.getHeight(); - size_t dim = in1_val.getWidth(); - CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight()); - CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples); - size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim; - for (size_t i = 0; i < num_samples; ++i, - prev_out_x += dim, - prev_out_y += inc, - prev_grad_x += dim, - prev_grad_y += inc) { - real square_sum_x = 0; - real square_sum_y = 0; - real xy = 0; - for (size_t j = 0; j < dim; ++j) { - square_sum_x += prev_out_x[j] * prev_out_x[j]; - square_sum_y += prev_out_y[j] * prev_out_y[j]; - xy += prev_out_x[j] * prev_out_y[j]; - } - CHECK(square_sum_x > 0 && square_sum_y > 0); - if (xy == 0) { - real reciprocal = - 1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); - for (size_t j = 0; j < dim; ++j) { - prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal; - prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal; - } - } else { - real reciprocal_xy = 1.0f / xy; - real reciprocal_square_sum_x = 1.0f / square_sum_x; - real reciprocal_square_sum_y = 1.0f / square_sum_y; - for (size_t j = 0; j < dim; ++j) { - prev_grad_x[j] += - out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy - - prev_out_x[j] * reciprocal_square_sum_x); - prev_grad_y[j] += - out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy - - prev_out_y[j] * reciprocal_square_sum_y); - } - } - } -} - -/** - * Cosine Similarity backward Derivative - * - * \param outputs[0] forward input grad 1, size: nSamples * dim. - * \param outputs[1] forward input grad 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). - * - * \param inputs[0] backward loss output grad, size : nSamples * 1. - * \param inputs[1] forward output value, size: nSamples * 1. - * \param inputs[2] forward input value 1, size: nSamples * dim. - * \param inputs[3] forward input value 2, - * size: n2 * dim (n2 == 1 or n2 == nSamples). 
- */ -template -class CosSimBackwardFunc : public FunctionBase { - void init(const FuncConfig& config) override { - scale_ = config.get("scale"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(inputs.size(), 4UL); - CHECK_EQ(outputs.size(), 2UL); - /// dim of out_grad and out_val == 1, column vector - CHECK_EQ(inputs[0].shape()[1], 1UL); - CHECK_EQ(inputs[1].shape()[1], 1UL); - /// nSamples of out_grad == out_val == in_val1 == in_grad1 - CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]); - CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]); - CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]); - /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2 - CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]); - CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]); - CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]); - - CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() && - inputs[3].data() && outputs[0].data() && outputs[1].data()); - - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - CHECK_EQ(outputs[1].getArgType(), ADD_TO); - - const auto out_grad = inputs[0].matrix(); - const auto out_val = inputs[1].matrix(); - const auto in1_val = inputs[2].matrix(); - const auto in2_val = inputs[3].matrix(); - auto in1_grad = outputs[0].matrix(); - auto in2_grad = outputs[1].matrix(); - - CosSimBackward( - out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_); - } - - private: - real scale_; -}; - -REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc); -REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc); -REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc); -#endif -} // namespace paddle diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp deleted file mode 100644 index 42b02da0cb07a57e030a3edb08bea23203efd688..0000000000000000000000000000000000000000 --- a/paddle/function/CosSimOpTest.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
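The general branch of CosSimBackward above applies d out / d x[j] = out * (y[j] / <x, y> - x[j] / |x|^2); a standalone finite-difference check of that expression (scale = 1, values assumed):

#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity of two rows (scale = 1), used for a finite-difference check.
static float cosSim(const std::vector<float>& x, const std::vector<float>& y) {
  float xy = 0.f, xx = 0.f, yy = 0.f;
  for (size_t j = 0; j < x.size(); ++j) {
    xy += x[j] * y[j];
    xx += x[j] * x[j];
    yy += y[j] * y[j];
  }
  return xy / (std::sqrt(xx) * std::sqrt(yy));
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}, y = {4.f, 5.f, 6.f};
  float xy = 0.f, xx = 0.f;
  for (size_t j = 0; j < x.size(); ++j) { xy += x[j] * y[j]; xx += x[j] * x[j]; }
  float out = cosSim(x, y);
  // Analytic gradient w.r.t. x[0], matching the non-orthogonal branch above.
  float analytic = out * (y[0] / xy - x[0] / xx);
  // Central finite difference for comparison.
  const float eps = 1e-3f;
  std::vector<float> xp = x, xm = x;
  xp[0] += eps;
  xm[0] -= eps;
  float numeric = (cosSim(xp, y) - cosSim(xm, y)) / (2 * eps);
  std::printf("analytic %f vs numeric %f\n", analytic, numeric);
  return 0;
}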
*/ - -#include -#include "FunctionTest.h" -#include "paddle/math/Matrix.h" - -using namespace paddle; // NOLINT - -void testCosSimForward(size_t height_x, - size_t height_y, - size_t width, - real scale) { - CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale)); - // prepare input arguments - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}), - ASSIGN_TO); - // run Function - test.run(); -} - -void testCosSimBackward(size_t height_x, - size_t height_y, - size_t width, - real scale) { - CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale)); - // prepare input arguments - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}), - ADD_TO); - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}), - ADD_TO); - // run Function - test.run(); -} - -TEST(Matrix, cosSim) { - for (auto height_x : {10, 100, 1000}) { - for (auto height_y : {1, height_x}) { - for (auto width : {10, 100, 1000}) { - for (auto scale : {1.0, 2.0}) { - testCosSimForward(height_x, height_y, width, scale); - testCosSimBackward(height_x, height_y, width, scale); - } - } - } - } -} diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp deleted file mode 100644 index 5bd98910fe838751935f8ef2387ce96e755c6df1..0000000000000000000000000000000000000000 --- a/paddle/function/CropOp.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "CropOp.h" -#include "paddle/function/TensorShape.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void Crop(real* outputs, - const real* inputs, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { - std::vector crop_corner = - conf.get>("crop_corner"); - int cCrop = crop_corner[1]; - int hCrop = crop_corner[2]; - int wCrop = crop_corner[3]; - - int num = inShape[0]; - int inC = inShape[1]; - int inH = inShape[2]; - int inW = inShape[3]; - - int outC = outShape[1]; - int outH = outShape[2]; - int outW = outShape[3]; - - for (int n = 0; n < num; n++) { - for (int c = 0; c < outC; c++) { - for (int h = 0; h < outH; h++) { - int outoff = ((n * outC + c) * outH + h) * outW; - int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop; - memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real)); - } - } - } -} - -template <> -void CropGrad(const real* inGrad, - real* outGrad, - const TensorShape inShape, - const TensorShape outShape, - const FuncConfig& conf) { - std::vector crop_corner = - conf.get>("crop_corner"); - int cCrop = crop_corner[1]; - int hCrop = crop_corner[2]; - int wCrop = crop_corner[3]; - - int num = outShape[0]; - int outC = outShape[1]; - int outH = outShape[2]; - int outW = outShape[3]; - - int inC = inShape[1]; - int inH = inShape[2]; - int inW = inShape[3]; - - for (int n = 0; n < num; n++) { - for (int c = 0; c < inC; c++) { - for (int h = 0; h < inH; h++) { - int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop; - int inoff = ((n * inC + c) * inH + h) * inW; - CpuVector inG = CpuVector(inW, const_cast(inGrad + inoff)); - CpuVector outG = CpuVector(inW, outGrad + outoff); - outG += inG; - } - } - } -} - -/** - * \brief Crop input according to the specify corner and shape. - * The input and output is a 4D tensor. In CropFunc, we only - * crop the 2nd to 4th dimension. - * - * Argument in this Function: - * \param pad_ A struct object contains the cropping corner and shape. - * \param inputs A 4D tensor, only one input. - * \param outputs A 4D tensor, the output value after cropping. - * - * For example, - * Input(2,2,2,3) = [ - * [ [[1,2,3], [3,4,5]], - * [[2,3,5], [1,6,7]] ], - * [ [[4,3,1], [1,8,7]], - * [[3,8,9], [2,3,5]] ] - * ] # the input shape is (2,2,2,3) - * - * pad_: if corner = (0,1,1) and crop_shape = (2,1,2) - * Output(2,2,1,2) = [ - * [ [[4,5]], - * [[6,7]] ], - * [ [[8,7]], - * [[3,5]] ] - * ] # the input shape is (2,2,2,3) - */ -template -class CropFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - TensorShape inShape = inputs[0].shape(); - TensorShape outShape = outputs[0].shape(); - - Crop(outputs[0].data(), - inputs[0].data(), - inShape, - outShape, - conf_); - } - - private: - FuncConfig conf_; -}; - -/** - * \brief The backward propagation of cropping Function. - * - * Argument in this Function: - * \param crop_ The same meaning as it in CropFunc. - * \param inputs The gradient with respect to the output value of CropFunc. - * \param outputs The gradient with respect to the input value of CropFunc. 
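The worked example in the CropFunc comment above follows directly from the offset arithmetic of the CPU kernel; a standalone sketch applying corner (cCrop, hCrop, wCrop) = (0, 1, 1) and crop shape (2, 1, 2) to the same 2x2x2x3 input:

#include <cstdio>
#include <cstring>

int main() {
  // Input shape (num, C, H, W) = (2, 2, 2, 3), output shape (2, 2, 1, 2).
  float in[2][2][2][3] = {{{{1, 2, 3}, {3, 4, 5}}, {{2, 3, 5}, {1, 6, 7}}},
                          {{{4, 3, 1}, {1, 8, 7}}, {{3, 8, 9}, {2, 3, 5}}}};
  float out[2][2][1][2];
  const int cCrop = 0, hCrop = 1, wCrop = 1;
  for (int n = 0; n < 2; ++n)
    for (int c = 0; c < 2; ++c)
      for (int h = 0; h < 1; ++h)
        // Same row-copy the kernel performs via memcpy at the shifted offset.
        std::memcpy(out[n][c][h],
                    &in[n][c + cCrop][h + hCrop][wCrop],
                    2 * sizeof(float));
  // Expect [[[[4, 5]], [[6, 7]]], [[[8, 7]], [[3, 5]]]], as in the comment above.
  for (int n = 0; n < 2; ++n)
    for (int c = 0; c < 2; ++c)
      std::printf("%g %g\n", out[n][c][0][0], out[n][c][0][1]);
  return 0;
}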
- */ - -template -class CropGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - TensorShape outShape = outputs[0].shape(); - TensorShape inShape = inputs[0].shape(); - - CropGrad(inputs[0].data(), - outputs[0].data(), - inShape, - outShape, - conf_); - } - - private: - FuncConfig conf_; -}; - -REGISTER_TYPED_FUNC(Crop, CPU, CropFunc); -REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(Crop, GPU, CropFunc); -REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp deleted file mode 100644 index 7ff9227e5c2702d9d5334db501730b57ec10bfe3..0000000000000000000000000000000000000000 --- a/paddle/function/CrossMapNormalOp.cpp +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CrossMapNormalOp.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void CrossMapNormal(real* outputs, - real* denoms, - const real* inputs, - size_t numSamples, - size_t channels, - size_t height, - size_t width, - size_t size, - real scale, - real pow) { - size_t oneImage = height * width; - size_t oneSample = channels * oneImage; - - CpuVector outputsV(numSamples * oneSample, outputs); - CpuVector inputsV(numSamples * oneSample, const_cast(inputs)); - CpuVector denomsV(numSamples * oneSample, denoms); - - // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow) - // x represents inputs - // f(x) represents outputs - // denoms save the intermediate result for backward - denomsV = denomsV.constant(1.0); - const int start = -((int)size - 1) / 2; - const int end = (int)size + start; - for (size_t i = 0; i < numSamples; i++) { - real* oneDenom = denoms + i * oneSample; - real* oneInput = const_cast(inputs) + i * oneSample; - for (int c = 0; c < (int)channels; c++) { - CpuVector denom(oneImage, oneDenom + c * oneImage); - for (int s = start; s < end; s++) { - if (c + s >= 0 && c + s < (int)channels) { - CpuVector input(oneImage, oneInput + (c + s) * oneImage); - denom += input.square() * scale; - } - } - } - } - - outputsV = inputsV * denomsV.pow(-pow); -} - -template <> -void CrossMapNormalGrad(real* inputsGrad, - const real* inputsValue, - const real* outputsValue, - const real* outputsGrad, - const real* denoms, - size_t numSamples, - size_t channels, - size_t height, - size_t width, - size_t size, - real scale, - real pow) { - size_t oneSample = channels * height * width; - std::function oneImage = [=](real* data, - size_t offset) { - return CpuVector(height * width, data + offset); - }; - - const int start = -((int)size) / 2; - const int end = (int)size + start; - const real ratio = -(real)2 * scale * pow; - for (size_t i 
= 0; i < numSamples; i++) { - size_t sOffset = i * oneSample; - real* oneInputGrad = inputsGrad + sOffset; - real* oneInputValue = const_cast(inputsValue) + sOffset; - real* oneDenom = const_cast(denoms) + sOffset; - real* oneOutputGrad = const_cast(outputsGrad) + sOffset; - real* oneOutputValue = const_cast(outputsValue) + sOffset; - - for (int c = 0; c < (int)channels; c++) { - size_t cOffset = c * height * width; - CpuVector inputGrad = oneImage(oneInputGrad, cOffset); - CpuVector inputValue = oneImage(oneInputValue, cOffset); - CpuVector denom = oneImage(oneDenom, cOffset); - CpuVector outputGrad = oneImage(oneOutputGrad, cOffset); - - inputGrad = inputGrad + denom.pow(-pow) * outputGrad; - for (int s = start; s < end; s++) { - if (c + s >= 0 && c + s < (int)channels) { - size_t offset = (c + s) * height * width; - CpuVector output = oneImage(oneOutputValue, offset); - CpuVector outputGrad = oneImage(oneOutputGrad, offset); - CpuVector denom = oneImage(oneDenom, offset); - - inputGrad += ((outputGrad * output * ratio) / denom) * inputValue; - } - } - } - } -} - -/** - * \brief Normalization with across maps. - * - * This Function comes from the paper - * "ImageNet Classification with Deep Convolutional Neural Networks". - * - * The original formula is: - * - * Input(i, x, y) - * Output(i, x, y) = ---------------------------------------------- - * -- upper - * (k + alpha * > (Input(j, x, y))^2) ^ (beta) - * -- j = lower - * - * upper is `min(C, c + N/2)` - * lower if `max(0, c - N/2)` - * - * Function implementation: - * - * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4. - * And the meaning of each dimension(0-3) is respectively batch size, - * feature maps, rows and columns. - * - * Input and Output in the above formula is for each map(i) of one image, and - * Input(i, x, y), Output(i, x, y) represents an element in an image. - * - * C is the number of feature maps of one image, and N is a hyper-parameters - * is configured when Function is initialized. The sum in the denominator - * is the sum of the same position in the neighboring maps. - * - * In the implementation of Function, k is equal to 1, - * so Function has no argument for k. - * - * Function Arguments: - * - * \param size_ represent N - * \param scale_ represent alpha - * \param pow_ represent beta - * \param inputs[0] represent Input - * \param outputs[0] represent Output - * \param outputs[1] represent The denominator in the formula(except beta) - * - * Note: - * Save output[1] is to simplify the backward calculation. - * TODO, if only consider the forward calculation, we can optimize to - * remove the output[1]. - */ -template -class CrossMapNormalFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - // function arguments - size_ = config.get("size"); - scale_ = config.get("scale"); - pow_ = config.get("pow"); - - // number of inputs and outputs - numInputs_ = 1; - numOutputs_ = 2; - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - check(inputs, outputs); - // ArgType check still on here, - // not sure whether it is better to put inside the check. 
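The forward pass above evaluates, per pixel, input * (1 + scale * sum of squared inputs over size neighbouring channels)^(-pow); a scalar standalone reference of that channel window for one pixel position (activations assumed):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // One pixel position across 6 feature maps (hypothetical activations).
  std::vector<float> channelVals = {0.5f, 1.0f, -0.5f, 2.0f, 1.5f, 0.f};
  const int channels = (int)channelVals.size(), size = 5;
  const float scale = 0.001f, pow_ = 0.75f;
  const int start = -((int)size - 1) / 2;  // same window as CrossMapNormal
  const int end = (int)size + start;
  for (int c = 0; c < channels; ++c) {
    float denom = 1.0f;  // k == 1, as noted in the comment above
    for (int s = start; s < end; ++s)
      if (c + s >= 0 && c + s < channels)
        denom += scale * channelVals[c + s] * channelVals[c + s];
    float out = channelVals[c] * std::pow(denom, -pow_);
    std::printf("channel %d: %f\n", c, out);
  }
  return 0;
}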
- CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO); - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - CrossMapNormal(outputs[0].data(), - outputs[1].data(), - inputs[0].data(), - batchSize, - maps, - rows, - columns, - size_, - scale_, - pow_); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - - CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); - CHECK(inputs[0].shape() == outputs[0].shape()); - CHECK(inputs[0].shape() == outputs[1].shape()); - } - - // Only need the shape of the input, can calculate the - // floating-point operation. - size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ((size_t)numInputs_, inputs.size()); - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - // number of floating-point operations - // an approximate value - size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3); - - return ops; - } - - private: - size_t size_; - real scale_; - real pow_; -}; - -/** - * \brief Backward calculation for normalization with across maps. - * - * Function implementation: - * - * The implementation of this Function is derived from the - * CrossMapNormalFunc implementation. - * - * InputGrad = OutputGrad * denoms ^ (-beta) - * -- upper - * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue - * -- lower - * - * The data of inputs/outputs format is the same as the forward interface - * and is NCHW. - * - * The upper and lower is the same as forward. The logic of the sum - * is also the same as forward. - * - * Function Arguments: - * - * \param size_ represent N - * \param scale_ represent alpha - * \param pow_ represent beta - * \param inputs[0] represent InputValue, inputs[0] of CrossMapNormalFunc - * \param inputs[1] represent OutputValue, outputs[0] of CrossMapNormalFunc - * \param inputs[2] represent OutputGrad - * \param inputs[3] represent denoms, outputs[1] of CrossMapNormalFunc - * This is the intermediate result that is - * preserved in the forward calculation. - * \param outputs[0] represent InputGrad - */ -template -class CrossMapNormalGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - // function arguments - size_ = config.get("size"); - scale_ = config.get("scale"); - pow_ = config.get("pow"); - - // number of inputs and outputs - numInputs_ = 4; - numOutputs_ = 1; - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - check(inputs, outputs); - if (outputs[0].getArgType() != ADD_TO) { - // Currently, some algorithm implementations are ASSIGN_TO mode, - // if need to support the ADD_TO calculation, need to clear the output. 
- typename Tensor::Vector tmp( - outputs[0].shape().getElements(), outputs[0].data()); - tmp.zero(); - } - - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - CrossMapNormalGrad(outputs[0].data(), - inputs[0].data(), - inputs[1].data(), - inputs[2].data(), - inputs[3].data(), - batchSize, - maps, - rows, - columns, - size_, - scale_, - pow_); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - - CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); - CHECK(inputs[0].shape() == inputs[1].shape()); - CHECK(inputs[0].shape() == inputs[2].shape()); - CHECK(inputs[0].shape() == inputs[3].shape()); - CHECK(inputs[0].shape() == outputs[0].shape()); - } - - // Only need the shape of one input, can calculate the - // floating-point operation. - size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_LT((size_t)1, inputs.size()); - size_t batchSize = inputs[0].shape()[0]; - size_t maps = inputs[0].shape()[1]; - size_t rows = inputs[0].shape()[2]; - size_t columns = inputs[0].shape()[3]; - - // number of floating-point operations - // an approximate value - size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2); - - return ops; - } - - private: - size_t size_; - real scale_; - real pow_; -}; - -REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc); -REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc); -REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu deleted file mode 100644 index 2c0e71b19b22abac25d273d8bbeddc330e67f8b0..0000000000000000000000000000000000000000 --- a/paddle/function/DepthwiseConvOpGpu.cu +++ /dev/null @@ -1,376 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "DepthwiseConvOp.h" -#include "paddle/math/BaseMatrix.h" - -namespace paddle { - -// CUDA kernel to compute the depthwise convolution forward pass -template -__global__ void ConvolutionDepthwiseForward(const int nthreads, - const T* const inputData, - const T* const filterData, - const int batchSize, - const int outputChannels, - const int outputHeight, - const int outputWidth, - const int inputChannels, - const int inputHeight, - const int inputWidth, - const int filterMultiplier, - const int filterHeight, - const int filterWidth, - const int strideH, - const int strideW, - const int paddingH, - const int paddingW, - T* const outputData) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - if (index < nthreads) { - const int batch = index / outputChannels / outputHeight / outputWidth; - const int c_out = (index / outputHeight / outputWidth) % outputChannels; - const int h_out = (index / outputWidth) % outputHeight; - const int w_out = index % outputWidth; - - const int c_in = c_out / filterMultiplier; - const T* weight = filterData + c_out * filterHeight * filterWidth; - T value = 0; - const int h_in_start = -paddingH + h_out * strideH; - const int w_in_start = -paddingW + w_out * strideW; - const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; - const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; - if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && - (w_in_end < inputWidth)) { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - const int offset = - ((batch * inputChannels + c_in) * inputHeight + h_in) * - inputWidth + - w_in; - value += (*weight) * inputData[offset]; - ++weight; - } - } - } else { - for (int kh = 0; kh < filterHeight; ++kh) { - for (int kw = 0; kw < filterWidth; ++kw) { - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && - (w_in < inputWidth)) { - const int offset = - ((batch * inputChannels + c_in) * inputHeight + h_in) * - inputWidth + - w_in; - value += (*weight) * inputData[offset]; - } - ++weight; - } - } - } - outputData[index] = value; - } -} - -// CUDA kernel to compute the depthwise convolution backprop w.r.t input. -template -__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, - const T* const top_diff, - const T* const weight_data, - const int num, - const int outputChannels, - const int outputHeight, - const int outputWidth, - const int inputChannels, - const int inputHeight, - const int inputWidth, - const int filterMultiplier, - const int filterHeight, - const int filterWidth, - const int strideH, - const int strideW, - const int paddingH, - const int paddingW, - T* const bottom_diff) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int batch = index / inputChannels / inputHeight / inputWidth; - const int c_in = (index / inputHeight / inputWidth) % inputChannels; - const int h_in = (index / inputWidth) % inputHeight; - const int w_in = index % inputWidth; - - const int c_out_start = c_in * filterMultiplier; - - int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; - h_out_start = 0 > h_out_start ? 
0 : h_out_start; - int h_out_end = (h_in + paddingH) / strideH; - h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; - int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; - w_out_start = 0 > w_out_start ? 0 : w_out_start; - int w_out_end = (w_in + paddingW) / strideW; - w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; - c_out++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + paddingH - h_out * strideH; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + paddingW - w_out * strideW; - const int filter_offset = c_out * filterHeight * filterWidth + - filter_h * filterWidth + filter_w; - const int top_diff_offset = - ((batch * outputChannels + c_out) * outputHeight + h_out) * - outputWidth + - w_out; - value += top_diff[top_diff_offset] * weight_data[filter_offset]; - } - } - } - bottom_diff[index] += value; - } -} - -// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. -template -__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, - const int nthreads, - const T* const top_diff, - const T* const inputData, - const int num, - const int outputChannels, - const int outputHeight, - const int outputWidth, - const int inputChannels, - const int inputHeight, - const int inputWidth, - const int filterMultiplier, - const int filterHeight, - const int filterWidth, - const int strideH, - const int strideW, - const int paddingH, - const int paddingW, - T* const buffer_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int h_out = (index / outputWidth) % outputHeight; - const int w_out = index % outputWidth; - const int kh = - (index / filterWidth / outputHeight / outputWidth) % filterHeight; - const int kw = (index / outputHeight / outputWidth) % filterWidth; - const int h_in = -paddingH + h_out * strideH + kh; - const int w_in = -paddingW + w_out * strideW + kw; - if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && - (w_in < inputWidth)) { - const int c_out = - index / (filterHeight * filterWidth * outputHeight * outputWidth); - const int c_in = c_out / filterMultiplier; - const int batch = num_i; - const int top_offset = - ((batch * outputChannels + c_out) * outputHeight + h_out) * - outputWidth + - w_out; - const int bottom_offset = - ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + - w_in; - buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; - } else { - buffer_data[index] = 0; - } - } -} - -template -class DepthwiseConvFunctor { - public: - void operator()(const T* inputData, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* outputData) { - int outputSize = batchSize * outputChannels * outputHeight * outputWidth; - - size_t blocks = (outputSize + 1024 - 1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - ConvolutionDepthwiseForward<<>>( - outputSize, - inputData, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - 
filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - outputData); - } -}; - -template -class DepthwiseConvGradInputFunctor { - public: - void operator()(const T* outputGrad, - const T* filterData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* inputGrad) { - int inputSize = batchSize * inputChannels * inputHeight * inputWidth; - - size_t blocks = (inputSize + 1024 - 1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - ConvolutionDepthwiseInputBackward - // NOLINT_NEXT_LINE(whitespace/operators) - <<>>(inputSize, - outputGrad, - filterData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - inputGrad); - } -}; - -template -class DepthwiseConvGradFilterFunctor { - public: - void operator()(const T* outputGrad, - const T* inputData, - int batchSize, - int outputChannels, - int outputHeight, - int outputWidth, - int inputChannels, - int inputHeight, - int inputWidth, - int filterMultiplier, - int filterHeight, - int filterWidth, - int strideH, - int strideW, - int paddingH, - int paddingW, - T* colData, - T* filterGrad) { - int colDataSize = outputChannels * filterHeight * filterWidth * - outputHeight * outputWidth; - - size_t blocks = (colDataSize + 1024 - 1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, - 1, - filterGrad, - false, - true); - - for (int i = 0; i < batchSize; i++) { - ConvolutionDepthwiseFilterBackward< - T><<>>(i, - colDataSize, - outputGrad, - inputData, - batchSize, - outputChannels, - outputHeight, - outputWidth, - inputChannels, - inputHeight, - inputWidth, - filterMultiplier, - filterHeight, - filterWidth, - strideH, - strideW, - paddingH, - paddingW, - colData); - int K = outputHeight * outputWidth; - int M = colDataSize / K; - - BaseMatrix colMatrix(M, K, colData, false, true); - filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); - } - } -}; - -#ifdef PADDLE_TYPE_DOUBLE -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvGradFilterFunctor; -#else -template class DepthwiseConvGradInputFunctor; -template class DepthwiseConvFunctor; -template class DepthwiseConvGradFilterFunctor; -#endif - -} // namespace paddle diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp deleted file mode 100644 index 8e9dbbd7a154095a7298bb2f59a82d13a60f9bd3..0000000000000000000000000000000000000000 --- a/paddle/function/EigenGemm.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
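The launch configuration used by the depthwise functors above spreads the flattened element index over a two-dimensional grid of 1024-thread blocks; a small host-side sketch of that grid arithmetic with an assumed output size:

#include <cstdio>

int main() {
  // Hypothetical depthwise-conv output: 64 images, 32 channels, 56x56 maps.
  int outputSize = 64 * 32 * 56 * 56;
  size_t blocks = (outputSize + 1024 - 1) / 1024;  // 1024 threads per block
  size_t blockX = 512;                             // blocks laid out 512 along grid.x
  size_t blockY = (blocks + 512 - 1) / 512;        // remaining blocks go to grid.y
  std::printf("grid = (%zu, %zu), 1024 threads/block, covers %zu threads\n",
              blockX, blockY, blockX * blockY * 1024);
  // Inside the kernels the flat index is rebuilt as
  //   (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
  // and the tail is guarded by `if (index < nthreads)`.
  return 0;
}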
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/function/EigenThreadDevice.h" - -namespace paddle { - -template -struct EigenBlasGemm { - typedef Eigen::TensorMap, - Eigen::Aligned> - EigenMatrix; - - static void compute(const bool transA, - const bool transB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { - Eigen::array sizeA; - if (transA) { - sizeA[0] = K; - sizeA[1] = M; - CHECK_EQ(M, lda); - } else { - sizeA[0] = M; - sizeA[1] = K; - CHECK_EQ(K, lda); - } - Eigen::array sizeB; - if (transB) { - sizeB[0] = N; - sizeB[1] = K; - CHECK_EQ(K, ldb); - } else { - sizeB[0] = K; - sizeB[1] = N; - CHECK_EQ(N, ldb); - } - Eigen::array sizeC = {{M, ldc}}; - Eigen::array offsetC = {{0, 0}}; - Eigen::array extentC = {{M, N}}; - - const EigenMatrix a(const_cast(A), sizeA); - const EigenMatrix b(const_cast(B), sizeB); - EigenMatrix c(C, sizeC); - - typedef typename Eigen::Tensor::DimensionPair DimPair; - Eigen::array dims; - dims[0] = DimPair(1, 0); - dims[0].first = transA ? 0 : 1; - dims[0].second = transB ? 1 : 0; - - auto* device = EigenDeviceWarpper::device(); - if (N == ldc) { - if (alpha == T(1) && beta == T(0)) { - c.device(*device) = a.contract(b, dims); - } else if (alpha == T(1) && beta == T(1)) { - c.device(*device) += a.contract(b, dims); - } else { - c.device(*device) = alpha * a.contract(b, dims) + beta * c; - } - } else { - if (alpha == T(1) && beta == T(0)) { - c.slice(offsetC, extentC).device(*device) = a.contract(b, dims); - } else if (alpha == T(1) && beta == T(1)) { - c.slice(offsetC, extentC).device(*device) += a.contract(b, dims); - } else { - c.slice(offsetC, extentC).device(*device) = - alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC); - } - } - EigenDeviceWarpper::free_device(device); - } -}; - -#ifdef PADDLE_TYPE_DOUBLE -template struct EigenBlasGemm; -#else -template struct EigenBlasGemm; -#endif - -} // namespace paddle diff --git a/paddle/function/Function.h b/paddle/function/Function.h deleted file mode 100644 index a6c14ef29b760faa393c37bd2357824a061c7b38..0000000000000000000000000000000000000000 --- a/paddle/function/Function.h +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "BufferArg.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Any.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Error.h" - -namespace paddle { - -/** - * Function Configuration. - * The argument type of Function::init. - */ -class FuncConfig { - public: - template - T get(const std::string& key, Error* err = nullptr) const { - try { - return any_cast(valueMap_.at(key)); - } catch (std::exception& e) { // could be cast or out of range exception. 
- if (err) { - *err = Error(e.what()); - } else { - LOG(FATAL) << "Cannot get key " << key << " with error " << e.what(); - } - return T(); - } - } - - template - FuncConfig& set(const std::string& key, T v, Error* err = nullptr) { - auto it = valueMap_.find(key); - if (it != valueMap_.end()) { // already contains key. - if (err) { - *err = Error("Key %s is already set in FuncConfig", key.c_str()); - } else { - LOG(FATAL) << "Key " << key << " is already set in FuncConfig."; - } - return *this; - } - valueMap_[key] = any(v); - return *this; - } - - protected: - mutable std::unordered_map valueMap_; -}; - -/** - * Argument type for Function::calc(). - * A BufferArgs contains a set of BufferArg, - * because Function can have multiple inputs and outputs. - * - * addArg() with Matix object used to adapt Layer Argument. - * Will create a BufferArg object in addArg(), - * and free in destructor of BufferArgs. - * - * addArg() with BufferArg object, just save BufferArg object address, - * and the caller needs to guarantee the validity of the BufferArg object - * in the BufferArgs life time. - */ -class BufferArgs { - public: - BufferArgs() {} - - ~BufferArgs() { - for (auto arg : _args_) { - delete arg; - } - } - - size_t size() const { return args_.size(); } - - // add argument into BufferArgs - // Tensor can be Matrix, Vector, IVector. - // For inputs, do not need argType. - // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO. - void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) { - _args_.push_back(new BufferArg(arg, argType)); - addArg(*_args_.back()); - } - - void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) { - _args_.push_back(new BufferArg(arg, argType)); - addArg(*_args_.back()); - } - - void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) { - _args_.push_back(new BufferArg(arg, argType)); - addArg(*_args_.back()); - } - - // Add arg into BufferArgs and reshape the arg. - // - // For example, arg represents an image buffer, - // but Matrix can only represent a two-dimensional Tensor. - // So need an extra argument to describe the shape of the image buffer. - void addArg(const Matrix& arg, - const TensorShape& shape, - ArgType argType = UNSPECIFIED); - - void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); - void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); - - void addArg(const Matrix& matrix, - const IVector& vector, - ArgType argType = UNSPECIFIED); - - // get argument - const BufferArg& operator[](size_t num) const { - CHECK_LT(num, args_.size()); - return *args_[num]; - } - - void addArg(BufferArg& arg) { args_.push_back(&arg); } - - void addArg(SequenceIdArg& arg) { args_.push_back(&arg); } - - void addArg(SequenceArg& arg) { args_.push_back(&arg); } - - void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); } - - private: - std::vector args_; - // The BufferArg object is constructed and freed by BufferArgs. - std::vector _args_; -}; - -/** - * \brief Base class for Function. - * The basic Function implementation requires override init and calc interfaces. - * - * The caller needs to ensure the validity of the arguments - * during Function execution. - * - * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO - * and ADD_TO. - * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation - * result of Function assigned to the output BufferArg. 
- * If output.getArgType() == ADD_TO, this is add mode, and the calculation - * result of Function need added to the output BufferArg. - * - * For example: - * ASSIGN_TO: output = Function(inputs) - * ADD_TO: output += Function(inputs) - * If Function has more than one output, each output can have different modes. - */ -class FunctionBase { - public: - virtual ~FunctionBase() {} - - virtual void init(const FuncConfig& config) {} - - virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} - - // This member function is used to check whether the BufferType and shape of - // the inputs and outputs arguments of the Function are correct. - // General calc function which will call this check to do arguments check. - // And before the calc called, the caller can also check their own arguments. - virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {} - - // Calculate the number of floating-point operations of this Function. - // The inputs and outputs arguments do not need to contain the actual data, - // only the shape. - // And some Functions have the same input and output shapes, - // so you may not need to enter the complete number of arguments. - // But entering the full arguments is always correct for this interface. - virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) { - return 0; - } - - int getNumInputs() const { return numInputs_; } - - int getNumOutputs() const { return numOutputs_; } - - static ClassRegistrar funcRegistrar_; - - protected: - // numInputs_ and numOutputs_ represents the maximum - // input and output supported by Function. - // Some functions are optimized for input and output, - // so when comparing the number of arguments, for these functions - // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_ - size_t numInputs_; - size_t numOutputs_; -}; - -#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName - -#define REGISTER_TYPED_FUNC(typeName, deviceName, className) \ - static InitFunction __reg_type_##typeName##deviceName([]() { \ - FunctionBase::funcRegistrar_ \ - .registerClass>( \ - FUNC_NAME(typeName, deviceName)); \ - }) - -} // namespace paddle diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp deleted file mode 100644 index f5e6ca3f515a7fcd1498979703a0a59ddca40742..0000000000000000000000000000000000000000 --- a/paddle/function/FunctionTest.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Function.h" -#include -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -template -void FunctionApi(typename Tensor::Matrix& output, - const typename Tensor::Matrix& input); - -template <> -void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 100U); - EXPECT_EQ(output.getWidth(), 200U); -} - -template <> -void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 10U); - EXPECT_EQ(output.getWidth(), 20U); -} - -template -void Function(const BufferArgs& arguments) { - const auto input = arguments[0].matrix(); - auto output = arguments[1].matrix(); - FunctionApi(output, input); -} - -TEST(Function, BufferArgs) { - CpuMatrix cpuInput = CpuMatrix(100, 200); - CpuMatrix cpuOutput = CpuMatrix(100, 200); - BufferArgs cpuArgments; - cpuArgments.addArg(cpuInput); - cpuArgments.addArg(cpuOutput); - Function(cpuArgments); - - GpuMatrix gpuInput = GpuMatrix(10, 20); - GpuMatrix gpuOutput = GpuMatrix(10, 20); - BufferArgs gpuArgments; - gpuArgments.addArg(gpuInput); - gpuArgments.addArg(gpuOutput); - Function(gpuArgments); -} - -/** - * Some tests case are used to check the consistency between the BufferArg type - * argument received by Function and the original type argument. - * - * Use Case: - * TEST() { - * Matrix matrix(...); - * CheckBufferArg lambda = [=](const BufferArg& arg) { - * // check matrix and arg are equivalent - * EXPECT_EQ(matrix, arg); - * } - * - * BufferArgs argments{matrix...}; - * std::vector checkFunc{lambda...}; - * testBufferArgs(argments, checkFunc); - * } - */ -typedef std::function CheckBufferArg; - -void testBufferArgs(const BufferArgs& inputs, - const std::vector& check) { - EXPECT_EQ(inputs.size(), check.size()); - for (size_t i = 0; i < inputs.size(); i++) { - check[i](inputs[i]); - } -} - -void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { - EXPECT_EQ(inputs.size(), 1U); - check(inputs[0]); -} - -TEST(Arguments, Matrix) { - MatrixPtr matrix = Matrix::create(100, 200); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2U); - EXPECT_EQ(arg.shape()[0], 100U); - EXPECT_EQ(arg.shape()[1], 200U); - EXPECT_EQ(arg.data(), matrix->getData()); - - EXPECT_EQ(arg.matrix().getHeight(), matrix->getHeight()); - EXPECT_EQ(arg.matrix().getWidth(), matrix->getWidth()); - EXPECT_EQ(arg.matrix().getData(), matrix->getData()); - }; - - BufferArgs argments; - argments.addArg(*matrix); - std::vector checkFunc; - checkFunc.push_back(check); - testBufferArgs(argments, checkFunc); -} - -TEST(Arguments, Vector) { - VectorPtr vector = Vector::create(100, false); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 1U); - EXPECT_EQ(arg.shape()[0], 100U); - EXPECT_EQ(arg.data(), vector->getData()); - - CpuVector inVector = arg.vector(); - EXPECT_EQ(inVector.getSize(), vector->getSize()); - EXPECT_EQ(inVector.getData(), vector->getData()); - }; - - BufferArgs argments; - argments.addArg(*vector); - std::vector checkFunc; - checkFunc.push_back(check); - testBufferArgs(argments, checkFunc); -} - -TEST(Arguments, CpuSparseMatrix) { - CpuSparseMatrix sparse(200, 300, 50); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 2U); - EXPECT_EQ(arg.shape()[0], 200U); - EXPECT_EQ(arg.shape()[1], 300U); - EXPECT_EQ(arg.data(), sparse.getData()); - // CHECK_EQ(arg.sparse().nnz(), 50); - // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT); - // 
CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE); - EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows()); - EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols()); - }; - - BufferArgs argments; - argments.addArg(sparse); - std::vector checkFunc; - checkFunc.push_back(check); - testBufferArgs(argments, checkFunc); -} - -TEST(Arguments, BufferArg) { - BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3}); - CheckBufferArg check = [=](const BufferArg& arg) { - EXPECT_EQ(arg.shape().ndims(), 3U); - EXPECT_EQ(arg.shape()[0], 1U); - EXPECT_EQ(arg.shape()[1], 2U); - EXPECT_EQ(arg.shape()[2], 3U); - }; - - BufferArgs argments; - argments.addArg(arg); - testBufferArgs(argments, check); -} - -} // namespace paddle diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h deleted file mode 100644 index 14003d2c885c8f846f9445ad8844869c9112816e..0000000000000000000000000000000000000000 --- a/paddle/function/FunctionTest.h +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Function.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/math/tests/TensorCheck.h" -#include "paddle/testing/TestUtil.h" - -namespace paddle { - -typedef std::shared_ptr BufferArgPtr; - -namespace test { -template -struct Allocator; - -template <> -struct Allocator { - using type = CpuMemoryHandle; -}; - -template <> -struct Allocator { - using type = GpuMemoryHandle; -}; - -// Copy argument1 to argument2 -template -class CopyArgument { - public: - void operator()(const BufferArg& arg1, BufferArg& arg2) { - CHECK_EQ(arg1.valueType(), arg2.valueType()); - CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements()); - - if (arg1.valueType() == VALUE_TYPE_INT32) { - IVectorPtr vector1 = - IVector::create((int*)arg1.data(), - arg1.shape().getElements(), - DType1 == DEVICE_TYPE_CPU ? false : true); - IVectorPtr vector2 = - IVector::create((int*)arg2.data(), - arg2.shape().getElements(), - DType2 == DEVICE_TYPE_CPU ? false : true); - vector2->copyFrom(*vector1); - } else { - VectorPtr vector1 = - Vector::create((real*)arg1.data(), - arg1.shape().getElements(), - DType1 == DEVICE_TYPE_CPU ? false : true); - VectorPtr vector2 = - Vector::create((real*)arg2.data(), - arg2.shape().getElements(), - DType2 == DEVICE_TYPE_CPU ? false : true); - vector2->copyFrom(*vector1); - } - } -}; -} // namespace test - -/** - * \brief A class for comparing two Functions of different implementations. - * For example, can be used to compare the CPU and GPU implementation - * of the function is consistent. - * - * Use case: - * // Initializes a test object, the corresponding cpu and gpu Function - * // are constructed according to FunctionName and FuncConfig. - * CpuGpuFuncCompare test(FunctionName, FuncConfig); - * // Prepare inputs and outputs arguments. - * // Here the input and output can not contain real data, - * // only contains the argument type and shape. 
- * test.addInputs(input1); - * test.addInputs(input2); - * test.addOutputs(output1); - * test.addOutputs(output2); - * // Run. - * // Will according to the type and shape of arguments(inputs_/outputs_), - * // automatic initialization cpu and gpu function required arguments - * // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_). - * // Call the CPU and GPU Function calculation results. - * // Compares CPU and GPU calculation results for consistency. - * test.run(); - */ -template -class Compare2Function { - public: - typedef typename test::Allocator::type Allocator1; - typedef typename test::Allocator::type Allocator2; - typedef typename Tensor::Vector Vector1; - typedef typename Tensor::Vector Vector2; - typedef typename Tensor::SparseMatrix SparseMatrix1; - typedef typename Tensor::SparseMatrix SparseMatrix2; - - Compare2Function(const std::string& name1, - const std::string& name2, - const FuncConfig& config) - : function1_(FunctionBase::funcRegistrar_.createByType(name1)), - function2_(FunctionBase::funcRegistrar_.createByType(name2)) { - function1_->init(config); - function2_->init(config); - initArgsCallback_ = nullptr; - } - - ~Compare2Function() {} - - // input need only contains shape, do not contains data. - void addInputs(const BufferArg& input) { - size_t size = - input.shape().getElements() * sizeOfValuType(input.valueType()); - func1Memory_.emplace_back(std::make_shared(size)); - func2Memory_.emplace_back(std::make_shared(size)); - - func1Inputs_.emplace_back(std::make_shared( - func1Memory_.back()->getBuf(), input.valueType(), input.shape())); - func2Inputs_.emplace_back(std::make_shared( - func2Memory_.back()->getBuf(), input.valueType(), input.shape())); - } - - // assume one copy of sequence is shared by different SequenceArgs - void addSequence(const SequenceIdArg& input) { - CHECK_EQ(input.shape().ndims(), 1UL); - size_t batchSize = input.shape()[0]; - size_t numSeqs = batchSize / 10 + 1; - size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); - func1Memory_.emplace_back(std::make_shared(sizeId)); - func2Memory_.emplace_back(std::make_shared(sizeId)); - seq1_ = std::make_shared(func1Memory_.back()->getBuf(), - TensorShape{numSeqs + 1}); - seq2_ = std::make_shared(func2Memory_.back()->getBuf(), - TensorShape{numSeqs + 1}); - /// init sequence Id - initArg(*seq1_, batchSize); - - copyArg_(*seq1_, *seq2_); - } - - void addInputs(const SequenceArg& input) { - CHECK_EQ(input.shape().ndims(), 2UL); - size_t batchSize = input.shape()[0]; - if (!seq1_ || !seq2_) { // sequence not exist - addSequence(SequenceIdArg(TensorShape{batchSize})); - } - - size_t size = - input.shape().getElements() * sizeOfValuType(input.valueType()); - func1Memory_.emplace_back(std::make_shared(size)); - func2Memory_.emplace_back(std::make_shared(size)); - - /// SequenceArg - func1Inputs_.emplace_back( - std::make_shared(func1Memory_.back()->getBuf(), - input.valueType(), - input.shape(), - *seq1_)); - func2Inputs_.emplace_back( - std::make_shared(func2Memory_.back()->getBuf(), - input.valueType(), - input.shape(), - *seq2_)); - } - - void registerInitCallback(std::function callback) { - initArgsCallback_ = callback; - } - - // output need only contains shape, do not contains data. 
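As with inputs, outputs are declared by shape only; the harness allocates real buffers itself. Before the addOutputs overloads, a minimal standalone sketch of the compare-two-implementations idea this removed harness automates: run two implementations on identical random input and require the outputs to agree within a tolerance. Names here are hypothetical and there are no Paddle dependencies; this is an illustration, not the harness itself.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Reference implementation: y[i] = 2 * x[i] + 1.
static void impl1(const std::vector<float>& x, std::vector<float>& y) {
  for (size_t i = 0; i < x.size(); ++i) y[i] = 2.0f * x[i] + 1.0f;
}

// Alternative implementation of the same formula, written differently.
static void impl2(const std::vector<float>& x, std::vector<float>& y) {
  for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] + x[i] + 1.0f;
}

int main() {
  std::mt19937 rng(0);
  std::uniform_real_distribution<float> dist(0.001f, 1.0f);
  std::vector<float> x(100), y1(100), y2(100);
  for (auto& v : x) v = dist(rng);  // identical random input for both paths
  impl1(x, y1);
  impl2(x, y2);
  for (size_t i = 0; i < x.size(); ++i)
    assert(std::fabs(y1[i] - y2[i]) < 1e-5f);  // outputs must agree elementwise
  std::puts("implementations agree");
  return 0;
}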
- void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) { - size_t size = - output.shape().getElements() * sizeOfValuType(output.valueType()); - func1Memory_.emplace_back(std::make_shared(size)); - func2Memory_.emplace_back(std::make_shared(size)); - - func1Outputs_.emplace_back( - std::make_shared(func1Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - argType)); - func2Outputs_.emplace_back( - std::make_shared(func2Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - argType)); - } - - /// add and init output sparse matrix - void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) { - sparse1_ = std::make_shared( - output.shape()[0], - output.shape()[1], - output.nnz(), - static_cast(output.dataType()), - static_cast(output.dataFormat())); - - sparse2_ = std::make_shared( - output.shape()[0], - output.shape()[1], - output.nnz(), - static_cast(output.dataType()), - static_cast(output.dataFormat())); - - /// init sparse matrix - hl_stream_t stream(HPPL_STREAM_1); - sparse1_->randomizeUniform(); - sparse2_->copyFrom(*sparse1_, stream); - hl_stream_synchronize(stream); - - func1Outputs_.emplace_back( - std::make_shared(*sparse1_, argType)); - func2Outputs_.emplace_back( - std::make_shared(*sparse2_, argType)); - } - - void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) { - CHECK_EQ(output.shape().ndims(), 2UL); - size_t batchSize = output.shape()[0]; - - if (!seq1_ || !seq2_) { // sequence not exist - addSequence(SequenceIdArg(TensorShape{batchSize})); - } - size_t size = - output.shape().getElements() * sizeOfValuType(output.valueType()); - func1Memory_.emplace_back(std::make_shared(size)); - func2Memory_.emplace_back(std::make_shared(size)); - - /// SequenceArg - func1Outputs_.emplace_back( - std::make_shared(func1Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - *seq1_, - argType)); - func2Outputs_.emplace_back( - std::make_shared(func2Memory_.back()->getBuf(), - output.valueType(), - output.shape(), - *seq2_, - argType)); - } - - void addInputs(const SparseMatrixArg& input) { - sparse1_ = std::make_shared( - input.shape()[0], - input.shape()[1], - input.nnz(), - static_cast(input.dataType()), - static_cast(input.dataFormat())); - - sparse2_ = std::make_shared( - input.shape()[0], - input.shape()[1], - input.nnz(), - static_cast(input.dataType()), - static_cast(input.dataFormat())); - - /// init sparse matrix - hl_stream_t stream(HPPL_STREAM_1); - sparse1_->randomizeUniform(); - sparse2_->copyFrom(*sparse1_, stream); - hl_stream_synchronize(stream); - - func1Inputs_.emplace_back(std::make_shared(*sparse1_)); - func2Inputs_.emplace_back(std::make_shared(*sparse2_)); - } - - void run() { - // prepare cpu/gpu arguments - initInputs(); - - initOutputs(); - // function calculate - auto callFunction = [](FunctionBase* function, - std::vector& inputs, - std::vector& outputs) { - BufferArgs inArgs; - BufferArgs outArgs; - for (auto arg : inputs) { - inArgs.addArg(*arg); - } - for (auto arg : outputs) { - outArgs.addArg(*arg); - } - function->calc(inArgs, outArgs); - }; - - callFunction(function1_.get(), func1Inputs_, func1Outputs_); - callFunction(function2_.get(), func2Inputs_, func2Outputs_); - - // check outputs - compareOutputs(); - } - - std::shared_ptr getFunction1() const { return function1_; } - - std::shared_ptr getFunction2() const { return function2_; } - - protected: - // only init cpu argument, gpu argument copy from cpu argument. 
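The initArg overloads below fill CPU-side buffers with small uniform values and, for SequenceIdArg, build a cumulative offsets array whose last entry equals the batch size. A simplified standalone sketch of that offsets construction (hypothetical helper, not the exact length distribution used here):

#include <cstdio>
#include <random>
#include <vector>

int main() {
  const int batchSize = 32;
  const int numSeqs = 4;
  std::mt19937 rng(0);
  std::vector<int> starts(numSeqs + 1);
  int pos = 0;
  for (int i = 0; i < numSeqs; ++i) {
    starts[i] = pos;
    // Leave at least one element for each remaining sequence.
    int maxLen = batchSize - pos - (numSeqs - 1 - i);
    std::uniform_int_distribution<int> pick(1, maxLen);
    pos += (i + 1 == numSeqs) ? maxLen : pick(rng);
  }
  starts[numSeqs] = batchSize;  // final entry closes the last sequence
  for (int s : starts) std::printf("%d ", s);
  std::printf("\n");
  return 0;
}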
- void initArg(BufferArg& arg) { - Vector1 vector(arg.shape().getElements(), (real*)arg.data()); - vector.uniform(0.001, 1); - } - - void initArg(SequenceArg& arg) { - /// init only matrix - Vector1 vector(arg.shape().getElements(), (real*)arg.data()); - vector.uniform(0.001, 1); - } - - void initArg(SequenceIdArg& arg, size_t batchSize) { - size_t numSeqs = arg.numSeqs(); - int* buf = reinterpret_cast(arg.data()); - int pos = 0; - size_t maxLen = 2 * batchSize / numSeqs; - for (int i = 0; i < (int)numSeqs; ++i) { - int len = 1 + uniformRandom(std::min( - maxLen, batchSize - pos - numSeqs + i)); - buf[i] = pos; - pos += len; - VLOG(1) << " len=" << len; - } - buf[numSeqs] = batchSize; - } - - void initInputs() { - for (size_t i = 0; i < func1Inputs_.size(); i++) { - if (func1Inputs_[i]->isSparseArg()) { - continue; /// sparse matrix already init - } - - if (func1Inputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*func1Inputs_[i])); - } else { - initArg(*func1Inputs_[i]); - } - - if (initArgsCallback_ != nullptr) { - initArgsCallback_(*func1Inputs_[i], i); - } - - copyArg_(*func1Inputs_[i], *func2Inputs_[i]); - } - } - - void initOutputs() { - for (size_t i = 0; i < func1Outputs_.size(); i++) { - if (func1Outputs_[i]->isSparseArg()) { - continue; /// sparse matrix already init - } - - if (func1Outputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*func1Outputs_[i])); - } else { - initArg(*func1Outputs_[i]); - } - - copyArg_(*func1Outputs_[i], *func2Outputs_[i]); - } - } - - void compareOutputs() { - for (size_t i = 0; i < func1Outputs_.size(); i++) { - // TODO, Need a BufferCheck used to compare the two buffers. - const auto cpu = func1Outputs_[i]; - const auto gpu = func2Outputs_[i]; - CHECK_EQ(cpu->numElements(), gpu->numElements()); - Vector1 cpuVector(cpu->numElements(), (real*)cpu->data()); - Vector2 gpuVector(gpu->numElements(), (real*)gpu->data()); - autotest::TensorCheckErr(cpuVector, gpuVector); - } - } - - protected: - std::shared_ptr function1_; - std::shared_ptr function2_; - std::vector> func1Memory_; - std::vector> func2Memory_; - std::vector func1Inputs_; - std::vector func1Outputs_; - std::vector func2Inputs_; - std::vector func2Outputs_; - std::shared_ptr sparse1_; - std::shared_ptr sparse2_; - std::shared_ptr seq1_; - std::shared_ptr seq2_; - test::CopyArgument copyArg_; - std::function initArgsCallback_; -}; - -class CpuGpuFuncCompare - : public Compare2Function { - public: - CpuGpuFuncCompare(const std::string& name, const FuncConfig& config) - : Compare2Function(name + "-CPU", name + "-GPU", config) {} - - ~CpuGpuFuncCompare() {} -}; - -} // namespace paddle diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp deleted file mode 100644 index 5b023e2c10e5040a28660d555efceb0e26b40d49..0000000000000000000000000000000000000000 --- a/paddle/function/GemmConvOp.cpp +++ /dev/null @@ -1,522 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ConvOp.h" -#include "GemmFunctor.h" -#include "Im2Col.h" -#include "paddle/math/MemoryHandle.h" - -namespace paddle { - -/* - * \brief Forward calculation of convolution. - */ -template -class GemmConvFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - // TODO(hedaoyuan): Need to define some index macros, - // to avoid useing 0 and 1. - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - real beta; - if (outputs[0].getArgType() == ADD_TO) { - beta = 1.0; - } else { - beta = 0.0; - } - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* inputData = inputs[0].data(); - real* filterData = inputs[1].data(); - real* outputData = outputs[0].data(); - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - - TensorShape colShape; - real* colData = NULL; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - colData = reinterpret_cast(memory_->getBuf()); - } - - Im2ColFunctor im2col; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - for (size_t i = 0; i < batchSize; i++) { - for (size_t g = 0; g < groups_; g++) { - if (needIm2col) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW()); - } else { - colData = inputData + g * inputOffset; - } - int M = outputChannels / groups_; - int N = outputHeight * outputWidth; - int K = inputChannels / groups_ * filterHeight * filterWidth; - BlasGemm::compute(false, - false, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - K, - colData, - N, - beta, - outputData + g * outputOffset, - N); - } - inputData += inputChannels * inputHeight * inputWidth; - outputData += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifdef PADDLE_MOBILE_INFERENCE - -/* - * \brief Forward calculation of convolution, optimized for mobile. 
- */ -template -class GemmConvMobileFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - // TODO(hedaoyuan): Need to define some index macros, - // to avoid useing 0 and 1. - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - real beta; - if (outputs[0].getArgType() == ADD_TO) { - beta = 1.0; - } else { - beta = 0.0; - } - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* inputData = inputs[0].data(); - real* filterData = inputs[1].data(); - real* outputData = outputs[0].data(); - real* colData = NULL; - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - TensorShape colShape; - - // Max col matrix width 4096, Max col matrix size 4M. - size_t outputHeightSteps = - std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); - size_t maxColWidth = outputHeightSteps * outputWidth; - size_t channelSteps = - std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, - (size_t)1), - inputChannels / groups_); - size_t maxColHeight = channelSteps * filterHeight * filterWidth; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - - resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); - colData = reinterpret_cast(memory_->getBuf()); - } - - Im2ColMobileFunctor im2col; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - int nStride = outputHeight * outputWidth; - int kStride = inputChannels / groups_ * filterHeight * filterWidth; - for (size_t i = 0; i < batchSize; i++) { - filterData = inputs[1].data(); - for (size_t g = 0; g < groups_; g++) { - if (needIm2col) { - real beta_ = beta; - for (size_t ic = 0; ic < inputChannels / groups_; - ic += channelSteps) { - int channels = std::min(inputChannels / groups_ - ic, channelSteps); - for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { - int height = std::min(outputHeight - oh, outputHeightSteps); - - int M = outputChannels / groups_; - int N = height * outputWidth; - int K = channels * filterHeight * filterWidth; - // im2col - im2col(inputData, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW(), - channels, - oh, - height, - N); - - // gemm - BlasGemm::compute( - false, - false, - M, - N, - K, - 1.0f, - filterData + ic * filterHeight * filterWidth, - kStride, - colData, - N, - beta_, - outputData + oh * outputWidth, - nStride); - } - beta_ = 1.0; 
- } - } else { - int M = outputChannels / groups_; - int N = outputHeight * outputWidth; - int K = inputChannels / groups_ * filterHeight * filterWidth; - BlasGemm::compute(false, - false, - M, - N, - K, - 1.0f, - filterData, - K, - inputData, - N, - beta, - outputData, - N); - } - inputData += inputOffset; - outputData += outputOffset; - filterData += filterOffset; - } - } - - memory_.reset(); - } -}; - -#endif - -/* - * \brief Backward input calculation of convolution. - */ -template -class GemmConvGradInputFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& output = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& input = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - // Since the implementation of Col2ImFunctor is ADD_TO, - // this function only supports ADD_TO mode. - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - const TensorShape& output = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& input = outputs[0].shape(); - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* outputGrad = inputs[0].data(); - real* filterData = inputs[1].data(); - real* inputGrad = outputs[0].data(); - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - - TensorShape colShape; - real* colData = NULL; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - colData = reinterpret_cast(memory_->getBuf()); - } - - Col2ImFunctor col2im; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - for (size_t i = 0; i < batchSize; i++) { - for (size_t g = 0; g < groups_; g++) { - int K = outputChannels / groups_; - int N = outputHeight * outputWidth; - int M = inputChannels / groups_ * filterHeight * filterWidth; - real scale = 0.0f; - if (!needIm2col) { - colData = inputGrad + g * inputOffset; - scale = 1.0f; - } - BlasGemm::compute(true, - false, - M, - N, - K, - 1.0f, - filterData + g * filterOffset, - M, - outputGrad + g * outputOffset, - N, - scale, - colData, - N); - if (needIm2col) { - col2im(inputGrad + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW()); - } - } - inputGrad += inputChannels * inputHeight * inputWidth; - outputGrad += outputChannels * outputHeight * outputWidth; - } - } -}; - -/* - * \brief Backward filter calculation of convolution. 
- */ -template -class GemmConvGradFilterFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& output = inputs[0].shape(); - const TensorShape& input = inputs[1].shape(); - const TensorShape& filter = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - const TensorShape& output = inputs[0].shape(); - const TensorShape& input = inputs[1].shape(); - const TensorShape& filter = outputs[0].shape(); - - real beta; - if (outputs[0].getArgType() == ADD_TO) { - beta = 1.0; - } else { - beta = 0.0; - } - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - real* outputGrad = inputs[0].data(); - real* inputData = inputs[1].data(); - real* filterGrad = outputs[0].data(); - bool needIm2col = isNeedIm2col(filter); - - TensorShape imShape = - TensorShape({inputChannels / groups_, inputHeight, inputWidth}); - - TensorShape colShape; - real* colData = NULL; - - if (needIm2col) { - colShape = TensorShape({inputChannels / groups_, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - resizeBuffer(colShape.getElements()); - colData = reinterpret_cast(memory_->getBuf()); - } - - Im2ColFunctor im2col; - size_t inputOffset = imShape.getElements(); - size_t outputOffset = - (outputChannels / groups_) * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - for (size_t i = 0; i < batchSize; i++) { - for (size_t g = 0; g < groups_; g++) { - if (needIm2col) { - im2col(inputData + g * inputOffset, - imShape, - colData, - colShape, - strideH(), - strideW(), - paddingH(), - paddingW(), - dilationH(), - dilationW()); - } else { - colData = inputData + g * inputOffset; - } - int M = outputChannels / groups_; - int K = outputHeight * outputWidth; - int N = inputChannels / groups_ * filterHeight * filterWidth; - BlasGemm::compute(false, - true, - M, - N, - K, - 1.0f, - outputGrad + g * outputOffset, - K, - colData, - K, - i == 0 ? 
beta : 1.0f, - filterGrad + g * filterOffset, - N); - } - inputData += inputChannels * inputHeight * inputWidth; - outputGrad += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifdef PADDLE_MOBILE_INFERENCE -REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction); -#else -REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); -#endif -REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); -REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); -REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction); -REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction); -#endif - -} // namespace paddle diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp deleted file mode 100644 index 0b1fe1b67d8fd6caf86a08bc05e250b1936e9f85..0000000000000000000000000000000000000000 --- a/paddle/function/GemmFunctor.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "GemmFunctor.h" -#include "paddle/math/MathFunctions.h" - -namespace paddle { - -template -struct BlasGemm { - static void compute(const bool transA, - const bool transB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { -#ifdef PADDLE_USE_EIGEN_FOR_BLAS - EigenBlasGemm::compute( - transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); -#else - gemm(transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -#endif - } -}; - -template -struct BlasGemm { - static void compute(const bool transA, - const bool transB, - const int M, - const int N, - const int K, - const T alpha, - const T* A, - const int lda, - const T* B, - const int ldb, - const T beta, - T* C, - const int ldc) { - hl_matrix_mul((T*)A, - transA == false ? HPPL_OP_N : HPPL_OP_T, - (T*)B, - transB == false ? HPPL_OP_N : HPPL_OP_T, - C, - M, - N, - K, - alpha, - beta, - lda, - ldb, - ldc); - } -}; - -template struct BlasGemm; -template struct BlasGemm; - -} // namespace paddle diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp deleted file mode 100644 index 967c5b91536608364b4181707b843799b1764c3f..0000000000000000000000000000000000000000 --- a/paddle/function/Im2ColTest.cpp +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Im2Col.h" -#include -#include "Function.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/tests/TensorCheck.h" - -namespace paddle { - -template -void TestIm2ColFunctor() { - for (size_t channels : {1, 5, 32}) { - for (size_t inputHeight : {5, 33, 100}) { - for (size_t inputWidth : {5, 32, 96}) { - for (size_t filterHeight : {1, 5}) { - for (size_t filterWidth : {3, 7}) { - for (size_t stride : {1, 2}) { - for (size_t padding : {0, 1}) { - for (size_t dilation : {1, 3}) { - size_t filterSizeH = (filterHeight - 1) * dilation + 1; - size_t filterSizeW = (filterWidth - 1) * dilation + 1; - if (inputHeight + 2 * padding < filterSizeH || - inputWidth + 2 * padding < filterSizeW) - break; - if (padding >= filterSizeH || padding >= filterSizeW) break; - size_t outputHeight = - (inputHeight - filterSizeH + 2 * padding) / stride + 1; - size_t outputWidth = - (inputWidth - filterSizeW + 2 * padding) / stride + 1; - - TensorShape imShape = - TensorShape({channels, inputHeight, inputWidth}); - TensorShape colShape1 = TensorShape({channels, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - TensorShape colShape2 = TensorShape({outputHeight, - outputWidth, - channels, - filterHeight, - filterWidth}); - - size_t height = channels * filterHeight * filterWidth; - size_t width = outputHeight * outputWidth; - VectorPtr input1 = - Vector::create(imShape.getElements(), false); - VectorPtr input2 = - Vector::create(imShape.getElements(), false); - MatrixPtr output1 = - Matrix::create(height, width, false, false); - MatrixPtr output2 = - Matrix::create(width, height, false, false); - input1->uniform(0.001, 1); - input2->copyFrom(*input1); - - Im2ColFunctor im2Col1; - Im2ColFunctor im2Col2; - im2Col1(input1->getData(), - imShape, - output1->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation); - im2Col2(input2->getData(), - imShape, - output2->getData(), - colShape2, - stride, - stride, - padding, - padding, - dilation, - dilation); - - // The transposition of the result of ColFormat == kCFO - // is equal to the result of ColFormat == kOCF. 
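The check that follows relies on the kCFO column buffer ({channels, filterH, filterW, outH, outW}) being the transpose of the kOCF buffer ({outH, outW, channels, filterH, filterW}). For orientation, a minimal single-channel im2col sketch in the kCFO layout; this is a standalone, hypothetical helper, not the Paddle functor:

#include <cstdio>
#include <vector>

// Single-channel im2col: image [H x W] -> columns [kH*kW x outH*outW].
// Padding positions are written as 0.
static std::vector<float> im2col(const std::vector<float>& img, int H, int W,
                                 int kH, int kW, int stride, int pad,
                                 int* outH, int* outW) {
  *outH = (H + 2 * pad - kH) / stride + 1;
  *outW = (W + 2 * pad - kW) / stride + 1;
  std::vector<float> col((size_t)kH * kW * (*outH) * (*outW), 0.0f);
  for (int kh = 0; kh < kH; ++kh)
    for (int kw = 0; kw < kW; ++kw)
      for (int oh = 0; oh < *outH; ++oh)
        for (int ow = 0; ow < *outW; ++ow) {
          int ih = oh * stride - pad + kh;
          int iw = ow * stride - pad + kw;
          float v = (ih >= 0 && ih < H && iw >= 0 && iw < W)
                        ? img[ih * W + iw] : 0.0f;
          // Row index = filter offset, column index = output position.
          col[((kh * kW + kw) * (*outH) + oh) * (*outW) + ow] = v;
        }
  return col;
}

int main() {
  int H = 4, W = 4, outH = 0, outW = 0;
  std::vector<float> img(H * W);
  for (int i = 0; i < H * W; ++i) img[i] = (float)i;
  std::vector<float> col = im2col(img, H, W, 3, 3, 1, 1, &outH, &outW);
  std::printf("col is %d x %d\n", 3 * 3, outH * outW);
  return 0;
}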
- MatrixPtr test; - output2->transpose(test, true); - autotest::TensorCheckErr(*output1, *test); - - Col2ImFunctor col2Im1; - Col2ImFunctor col2Im2; - - col2Im1(input1->getData(), - imShape, - output1->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation); - col2Im2(input2->getData(), - imShape, - output2->getData(), - colShape2, - stride, - stride, - padding, - padding, - dilation, - dilation); - autotest::TensorCheckErr(*input1, *input2); - } - } - } - } - } - } - } - } -} - -TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor(); } - -#ifdef PADDLE_WITH_CUDA - -TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } - -#endif - -template -void TestIm2ColMobileFunctor() { - for (size_t channels : {32}) { - for (size_t inputHeight : {33, 100}) { - for (size_t inputWidth : {32, 96}) { - for (size_t filterHeight : {5}) { - for (size_t filterWidth : {7}) { - for (size_t stride : {2}) { - for (size_t padding : {1}) { - for (size_t dilation : {1, 3}) { - size_t filterSizeH = (filterHeight - 1) * dilation + 1; - size_t filterSizeW = (filterWidth - 1) * dilation + 1; - if (inputHeight + 2 * padding < filterSizeH || - inputWidth + 2 * padding < filterSizeW) - break; - if (padding >= filterSizeH || padding >= filterSizeW) break; - size_t outputHeight = - (inputHeight - filterSizeH + 2 * padding) / stride + 1; - size_t outputWidth = - (inputWidth - filterSizeW + 2 * padding) / stride + 1; - - TensorShape imShape = - TensorShape({channels, inputHeight, inputWidth}); - TensorShape colShape1 = TensorShape({channels, - filterHeight, - filterWidth, - outputHeight, - outputWidth}); - - size_t height = channels * filterHeight * filterWidth; - size_t width = outputHeight * outputWidth; - VectorPtr input1 = - Vector::create(imShape.getElements(), false); - VectorPtr input2 = - Vector::create(imShape.getElements(), false); - MatrixPtr output1 = - Matrix::create(height, width, false, false); - MatrixPtr output2 = - Matrix::create(height, width, false, false); - input1->uniform(0.001, 1); - input2->copyFrom(*input1); - - Im2ColFunctor im2Col1; - Im2ColMobileFunctor im2Col2; - im2Col1(input1->getData(), - imShape, - output1->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation); - im2Col2(input2->getData(), - imShape, - output2->getData(), - colShape1, - stride, - stride, - padding, - padding, - dilation, - dilation, - channels, - 0, - outputHeight, - outputHeight * outputWidth); - - autotest::TensorCheckEqual(*output1, *output2); - } - } - } - } - } - } - } - } -} - -TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } - -} // namespace paddle diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp deleted file mode 100644 index 7bf36c8050a8c33d836ce98dc7f3cf6d3de38d55..0000000000000000000000000000000000000000 --- a/paddle/function/MulOp.cpp +++ /dev/null @@ -1,347 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "MulOp.h" -#include "GemmFunctor.h" -#include "paddle/math/SIMDFunctions.h" -#include "paddle/utils/ThreadLocal.h" - -namespace { -inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { - for (unsigned int i = 0; i < len; ++i) { - a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i]; - } -} - -inline void colVecAddTo( - real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { - for (unsigned int i = 0; i < len; ++i) { - a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c; - } -} -} // namespace - -namespace paddle { -/// sparse matrix (+)= dense matrix * dense matrix -template <> -void MulOp(CpuSparseMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK_EQ(out.getValueType(), FLOAT_VALUE); - if (scaleT == 0) { - out.zeroMem(); - } - const real* A = a.getData(); - const real* B = b.getData(); - real* C = out.getValue(); - int* rows = out.getRows(); - int* cols = out.getCols(); - size_t width = out.getWidth(); - size_t height = out.getHeight(); - - /// SPARSE_CSC, {a any, b not trans} - if (out.getFormat() == SPARSE_CSC) { - /// b not trans and a any - CHECK(!bTrans); - size_t m = !aTrans ? a.getWidth() : a.getHeight(); - for (size_t i = 0; i < width; i++) { - size_t start = out.getColStartIdx(i); - size_t end = out.getColStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t rowIdx = rows[j]; - for (size_t k = 0; k < m; k++) { - sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) * - B[k * width + i]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - return; - } - - /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans} - if (out.getFormat() == SPARSE_CSR) { - /// a and b can not both transpose - CHECK(!(aTrans && bTrans)); - size_t m = a.getWidth(); - for (size_t i = 0; i < height; i++) { - size_t start = out.getRowStartIdx(i); - size_t end = out.getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += (!aTrans ? A[i * m + k] : A[k * height + i]) * - (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]); - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - return; - } -} - -/// dense matrix (+)= dense matrix * dense matrix -template <> -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - BlasGemm::compute( - aTrans, - bTrans, - out.getHeight(), - out.getWidth(), - !aTrans ? a.getWidth() : a.getHeight(), - scaleAB, - a.getData(), - a.getStride(), - b.getData(), - b.getStride(), - scaleT, - out.getData(), - out.getStride()); -} - -/// dense matrix (+)= sparse matrix * dense matrix -template <> -void MulOp(CpuMatrix& out, - const CpuSparseMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - if (scaleT == 0) { - out.zeroMem(); - } - const real* B = b.getData(); - real* C = out.getData(); - if (out.getWidth() % 32 == 0) { - CHECK_EQ((size_t)B % 32, 0UL); - CHECK_EQ((size_t)C % 32, 0UL); - } - - int* cols = a.getCols(); - real* values = a.getValue(); - for (size_t i = 0; i < a.getHeight(); ++i) { - const int start = a.getRowStartIdx(i); - const int end = a.getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]), - !aTrans ? const_cast(b).getRow(cols[j]) - : const_cast(b).getRow(i), - (a.getValueType() == FLOAT_VALUE) ? 
values[j] : (real)1.0, - out.getWidth()); - } - } -} - -/// dense matrix (+)= dense matrix * sparse matrix -template <> -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - if (scaleT == 0) { - out.zeroMem(); - } - real* A = const_cast(a.getData()); - real* B = const_cast(b.getValue()); - real* C = out.getData(); - int* rows = b.getRows(); - int* cols = b.getCols(); - - /// SPARSE_CSC format - if (b.getFormat() == SPARSE_CSC) { - for (size_t j = 0; j < b.getWidth(); ++j) { - int start = b.getColStartIdx(j); - int end = b.getColStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(!bTrans ? C + j : C + rows[i], - !bTrans ? A + rows[i] : A + j, - (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], - out.getHeight(), - out.getWidth(), - a.getWidth()); - } - } - return; - } - - /// SPARSE_CSR format - if (b.getFormat() == SPARSE_CSR) { - for (size_t j = 0; j < b.getHeight(); ++j) { - int start = b.getRowStartIdx(j); - int end = b.getRowStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(!bTrans ? C + cols[i] : C + j, - !bTrans ? A + j : A + cols[i], - (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], - out.getHeight(), - out.getWidth(), - a.getWidth()); - } - } - return; - } -} - -/** - * mul operator - * out = scaleT * out + scaleAB * (A * B) - * here, scaleT in {0, 1}, scaleAB == 1, - * out = A * B, ASSIGN_TO - * out += A * B, ADD_TO - * - * - * \param outputs[0] output matrix (out), M * N, - * could be either Sparse or Dense Matrix - * M is num of rows, N is num of columns - * \param inputs[0] first input matrix (A), M * K (if non-trans) - * could be either Sparse or Dense Matrix - * M is num of rows, K is num of columns - * \param inputs[1] second input matrix (B), K * N (if non-trans) - * could be either Sparse or Dense Matrix - * K is num of rows, N is num of columns - * - * Support eight Mul operators, with both GPU and CPU devices - * For each device, four Mul operators are supported: - * 1. dense (out) = dense (A) * dense (B) - * 2. dense (out) = sparse (A) * dense (B) - * sparse matrix only support SPARSE_CSR format - * 3. dense (out) = dense (A) * sparse (B) - * sparse matrix support SPARSE_CSC and SPARSE_CSR formats - * 4. sparse (out) = dense (A) * dense (B) - * sparse matrix support SPARSE_CSC and SPARSE_CSR formats - * - */ -template -class MulFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { - aTrans_ = config.get("aTrans"); - bTrans_ = config.get("bTrans"); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK(!aTrans_ || !bTrans_) - << "Not support both a and b are transpose matrices"; - - CHECK_EQ((size_t)2, inputs.size()); - CHECK_EQ((size_t)1, outputs.size()); - CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data()); - CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); - CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); - - size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1]; - size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0]; - size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1]; - size_t bCol = !bTrans_ ? 
inputs[1].shape()[1] : inputs[1].shape()[0]; - /// C = A * B, or C += A * B, for matrix format - CHECK_EQ(aCol, bRow); - CHECK_EQ(aRow, outputs[0].shape()[0]); - CHECK_EQ(bCol, outputs[0].shape()[1]); - - /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO) - real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0; - - /// support dense = not both sparse * sparse - /// or sparse = dense * dense - CHECK((!outputs[0].isSparseArg() && - !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) || - (outputs[0].isSparseArg() && !inputs[0].isSparseArg() && - !inputs[1].isSparseArg())); - - auto outMat = outputs[0].matrix(); - /// dense matrix = dense matrix * dense matrix - if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && - !outputs[0].isSparseArg()) { - MulOp(outMat, - inputs[0].matrix(), - inputs[1].matrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - - /// dense matrix = dense matrix * sparse matrix - if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() && - !outputs[0].isSparseArg()) { - CHECK(!aTrans_) << "Not supported a transpose"; - MulOp(outMat, - inputs[0].matrix(), - inputs[1].sparse().SparseMatrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - - /// dense matrix = sparse matrix * dense matrix - if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() && - !outputs[0].isSparseArg()) { - CHECK(!bTrans_) << "Not supported b transpose"; - CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR) - << "Only supported SPARSE_CSR format for sparse matrix a"; - MulOp(outMat, - inputs[0].sparse().SparseMatrix(), - inputs[1].matrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - - /// sparse matrix = dense matrix * dense matrix - auto outSparseMat = outputs[0].sparse().SparseMatrix(); - if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && - outputs[0].isSparseArg()) { - MulOp(outSparseMat, - inputs[0].matrix(), - inputs[1].matrix(), - 1.0, // scaleAB - scaleT, - aTrans_, - bTrans_); - return; - } - } - - private: - bool aTrans_; - bool bTrans_; -}; - -REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc); -#endif -} // namespace paddle diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h deleted file mode 100644 index e6057be4e54b3cc2b3502b9a93825d4b53037c91..0000000000000000000000000000000000000000 --- a/paddle/function/MulOp.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "Function.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" - -namespace paddle { -/// CPU, dense matrix (+)= dense matrix * dense matrix -template -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// CPU, dense matrix (+)= sparse matrix * dense matrix -template -void MulOp(CpuMatrix& out, - const CpuSparseMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// CPU, dense matrix (+)= dense matrix * sparse matrix -template -void MulOp(CpuMatrix& out, - const CpuMatrix& a, - const CpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// CPU, sparse matrix (+)= dense matrix * dense matrix -template -void MulOp(CpuSparseMatrix& out, - const CpuMatrix& a, - const CpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, dense matrix (+)= dense matrix * dense matrix -template -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, dense matrix (+)= sparse matrix * dense matrix -template -void MulOp(GpuMatrix& out, - const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, dense matrix (+)= dense matrix * sparse matrix -template -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -/// GPU, sparse matrix (+)= dense matrix * dense matrix -template -void MulOp(GpuSparseMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans); - -} // namespace paddle diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu deleted file mode 100644 index d63416a8e45346089bac23100742b8afc99b8e77..0000000000000000000000000000000000000000 --- a/paddle/function/MulOpGpu.cu +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MulOp.h" -#include "hl_base.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" - -namespace paddle { -/// dense matrix (+)= dense matrix * dense matrix -template <> -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - hl_matrix_mul(const_cast(a.getData()), - !aTrans ? HPPL_OP_N : HPPL_OP_T, - const_cast(b.getData()), - !bTrans ? HPPL_OP_N : HPPL_OP_T, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - !aTrans ? 
a.getWidth() : a.getHeight(), - scaleAB, - scaleT, - a.getStride(), - b.getStride(), - out.getStride()); -} - -/// dense matrix (+)= sparse matrix * dense matrix -template <> -void MulOp(GpuMatrix& out, - const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(out.isContiguous()); - CHECK(b.isContiguous()); - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - hl_matrix_csr_mul_dense(a.sMatrix_.get(), - aTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(b.getData()), - HPPL_OP_N, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - b.getHeight(), - scaleAB, - scaleT); -} - -/// dense matrix (+)= dense matrix * sparse matrix -template <> -void MulOp(GpuMatrix& out, - const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(out.isContiguous()); - CHECK(a.isContiguous()); - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - - if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(const_cast(a.getData()), - HPPL_OP_N, - b.sMatrix_.get(), - bTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - a.getWidth(), - scaleAB, - scaleT); - } else { - hl_matrix_dense_mul_csr(const_cast(a.getData()), - HPPL_OP_N, - b.sMatrix_.get(), - bTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(out.getData()), - out.getHeight(), - out.getWidth(), - a.getWidth(), - scaleAB, - scaleT); - } -} - -/// sparse matrix (+)= dense matrix * dense matrix -template <> -void MulOp(GpuSparseMatrix& out, - const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT, - bool aTrans, - bool bTrans) { - CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; - hl_sparse_matrix_mul(const_cast(a.getData()), - aTrans ? HPPL_OP_T : HPPL_OP_N, - const_cast(b.getData()), - bTrans ? HPPL_OP_T : HPPL_OP_N, - out.sMatrix_.get(), - out.getHeight(), - out.getWidth(), - !bTrans ? b.getHeight() : b.getWidth(), - scaleAB, - scaleT); -} - -} // namespace paddle diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp deleted file mode 100644 index 4e1ebd749c0cd083c025e43a321d6992a11786ff..0000000000000000000000000000000000000000 --- a/paddle/function/MulOpTest.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "FunctionTest.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/math/tests/test_matrixUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -/** - * C += A * B, A, B, C dense matrix - * dense = dense * dense - */ -void testFuncDDDMatrix( - bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) { - real scaleT = 1.0; - size_t heightA = (transa == false) ? dimM : dimK; - size_t widthA = (transa == false) ? dimK : dimM; - size_t heightB = (transb == false) ? dimK : dimN; - size_t widthB = (transb == false) ? 
dimN : dimK; - size_t heightC = dimM; - size_t widthC = dimN; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb)); - // prepare input arguments - /// matrix A : HA * WA - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA})); - /// matrix B: HB * WB - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB})); - - /// output matrix C: HC * WC - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}), - scaleT == 1.0 ? ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MulOp, DDDMatrixMul) { - LOG(INFO) << "function test for dense = dense * dense matrix"; - for (const auto transa : {false, true}) { - for (const auto transb : {false, true}) { - for (const auto dimM : {1, 10, 100}) { - for (const auto dimN : {1, 10}) { - for (const auto dimK : {8}) { - if (transa && transb) { - continue; - } - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " transa=" << transa << " transb=" << transb - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK; - testFuncDDDMatrix(transa, transb, dimM, dimN, dimK); - } - } - } - } - } -} - -/** - * C += A * B, B, C dense, A sparse - * dense = sparse * dense - */ -void testFuncDSparseDMatrix( - size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { - real scaleT = 1.0; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); - // prepare input arguments - /// sparse matrix A : M * K - test.addInputs(SparseMatrixArg( - VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE)); - /// matrix B: K * N - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); - - /// output matrix C: M * N - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), - scaleT == 1.0 ? ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MuLOp, DSparseDMul) { - LOG(INFO) << "function test for dense = sparse * dense matrix"; - for (const auto dimM : {10, 100, 1000}) { - for (const auto dimN : {10, 100}) { - for (const auto dimK : {3, 10}) { - for (const auto nnz : {3, 10}) { - for (const auto FORMAT : {SPARSE_CSR}) { - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK - << " nnz=" << std::setw(5) << nnz - << " format=" << std::setw(5) << FORMAT; - testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT); - } - } - } - } - } -} - -/** - * C += A * B, A, C dense, B sparse - * dense = dense * sparse - */ -void testFuncDDSparseMatrix( - size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { - real scaleT = 1.0; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); - // prepare input arguments - /// matrix A : M * K - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); - - /// matrix B: K * N - test.addInputs(SparseMatrixArg( - VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE)); - - /// output matrix C: M * N - test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), - scaleT == 1.0 ? 
ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MulOp, DDSparseMul) { - LOG(INFO) << "function test for dense = dense * sparse matrix"; - for (const auto dimM : {10, 100, 1000}) { - for (const auto dimN : {10, 100}) { - for (const auto dimK : {3, 10}) { - for (const auto nnz : {3, 10}) { - for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) { - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK - << " nnz=" << std::setw(5) << nnz - << " format=" << std::setw(5) << FORMAT; - testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT); - } - } - } - } - } -} - -/** - * C += A * B, A sparse, B, C dense - * sparse = dense * dense - */ -void testFuncSparseDDMatrix( - size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { - real scaleT = 1.0; - // init Test object - CpuGpuFuncCompare test( - "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); - // prepare input arguments - /// matrix A : M * K - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); - - /// matrix B: K * N - test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); - - /// output sparse matrix C: M * N - test.addOutputs( - SparseMatrixArg( - VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE), - scaleT == 1.0 ? ADD_TO : ASSIGN_TO); - // run Function - test.run(); -} - -TEST(MulOp, SparseDDMul) { - LOG(INFO) << "function test for sparse = dense * dense matrix"; - for (const auto dimM : {10, 100, 1000}) { - for (const auto dimN : {10, 100}) { - for (const auto dimK : {3, 10}) { - for (const auto nnz : {3, 10}) { - for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) { - VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') - << " dimM=" << std::setw(5) << dimM - << " dimN=" << std::setw(5) << dimN - << " dimK=" << std::setw(5) << dimK - << " nnz=" << std::setw(5) << nnz - << " format=" << std::setw(5) << FORMAT; - testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT); - } - } - } - } - } -} diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp deleted file mode 100644 index 5d7515e8c053439b95fb18de3c8ffe70705600a3..0000000000000000000000000000000000000000 --- a/paddle/function/PadOp.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PadOp.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void Pad(real* outputs, - const real* inputs, - const int num, - const int inC, - const int inH, - const int inW, - const PadConf& pad) { - int cstart = pad.channel[0], cend = pad.channel[1]; - int hstart = pad.height[0], hend = pad.height[1]; - int wstart = pad.width[0], wend = pad.width[1]; - int outC = inC + cstart + cend; - int outH = inH + hstart + hend; - int outW = inW + wstart + wend; - for (int i = 0; i < num; i++) { - for (int c = 0; c < inC; c++) { - for (int h = 0; h < inH; h++) { - int inoff = ((i * inC + c) * inH + h) * inW; - int outoff = - ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; - memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real)); - } - } - } -} - -template <> -void PadGrad(real* inGrad, - const real* outGrad, - const int num, - const int inC, - const int inH, - const int inW, - const PadConf& pad) { - int cstart = pad.channel[0], cend = pad.channel[1]; - int hstart = pad.height[0], hend = pad.height[1]; - int wstart = pad.width[0], wend = pad.width[1]; - int outC = inC + cstart + cend; - int outH = inH + hstart + hend; - int outW = inW + wstart + wend; - for (int i = 0; i < num; i++) { - for (int c = 0; c < inC; c++) { - for (int h = 0; h < inH; h++) { - int inoff = ((i * inC + c) * inH + h) * inW; - int outoff = - ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; - CpuVector inG = CpuVector(inW, inGrad + inoff); - CpuVector outG = CpuVector(inW, const_cast(outGrad + outoff)); - inG += outG; - } - } - } -} - -static inline PadConf castToPadConf(const FuncConfig& conf) { - return {conf.get>("channel"), - conf.get>("height"), - conf.get>("width")}; -} - -/** - * \brief Padding zeros to input according to the specify dimension. - * The struct pad_ contains the padding size in each dimension. - * The input and output is a 4D tensor. In PadFunc, we only - * pad zeros to the 2nd to 4th dimension. - * - * Argument in this Function: - * \param pad_ A struct object contains the padding size in each dimension. - * It has six integers. The channelStart and channelEnd indicate - * how many zeros to add before and after the input in channel - * dimension. And the heightStart and heightEnd indicate padding - * in height dimension. The widthStart and widthEnd indicate the - * padding in width dimension. - * \param inputs A 4D tensor, only one input. - * \param outputs A 4D tensor, the output value after padding. - * - * For example, - * Input(2,2,2,3) = [ - * [ [[1,2,3], [3,4,5]], - * [[2,3,5], [1,6,7]] ], - * [ [[4,3,1], [1,8,7]], - * [[3,8,9], [2,3,5]] ] - * ] # the shape is (1,2,2,3) - * - * pad_: if channelStart = channelEnd = 1, others are 0. - * Output(2,4,2,3) = [ - * [ [[0,0,0], [0,0,0]], - * [[1,2,3], [3,4,5]], - * [[2,3,5], [1,6,7]], - * [[0,0,0], [0,0,0]] ], - * [ [[0,0,0], [0,0,0]], - * [[4,3,1], [1,8,7]], - * [[3,8,9], [2,3,5]], - * [[0,0,0], [0,0,0]] ] - * ] # the shape is (2,4,2,3) - * - * pad_: if widthStart = 1, widthEnd = 2, others are 0. - * Output(2,2,2,6) = [ - * [ [[0,1,2,3,0,0], [0,3,4,5,0,0]], - * [[0,2,3,5,0,0], [0,1,6,7,0,0]] ], - * [ [[0,4,3,1,0,0], [0,1,8,7,0,0]], - * [[0,3,8,9,0,0], [0,2,3,5,0,0]] ], - * ] # the shape is (2,2,2,6) - * - * pad_: if heightStart = 1, heightEnd = 1, others are 0. 
- * Output(2,2,4,3) = [ - * [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]], - * [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ], - * [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]], - * [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ], - * ] # the shape is (2,2,4,3) - */ - -template -class PadFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - size_t num = inputs[0].shape()[0]; - size_t inC = inputs[0].shape()[1]; - size_t inH = inputs[0].shape()[2]; - size_t inW = inputs[0].shape()[3]; - typename Tensor::Vector vec(outputs[0].shape().getElements(), - outputs[0].data()); - vec.zero(); - - Pad(outputs[0].data(), - inputs[0].data(), - num, - inC, - inH, - inW, - pad_); - } - - private: - PadConf pad_; -}; - -/** - * \brief The backward propagation of padding Function. Remove the elements - * in the padding positions of forward. - * - * Argument in this Function: - * \param pad_ The same meaning as it in PadFunc. - * \param inputs The gradient with respect to the output value of PadFunc. - * \param outputs The gradient with respect to the input value of PadFunc. - */ - -template -class PadGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - - size_t num = outputs[0].shape()[0]; - size_t inC = outputs[0].shape()[1]; - size_t inH = outputs[0].shape()[2]; - size_t inW = outputs[0].shape()[3]; - - if (outputs[0].getArgType() != ADD_TO) { - // for unit test - typename Tensor::Vector tmp( - outputs[0].shape().getElements(), outputs[0].data()); - tmp.zero(); - } - - PadGrad(outputs[0].data(), - inputs[0].data(), - num, - inC, - inH, - inW, - pad_); - } - - private: - PadConf pad_; -}; - -REGISTER_TYPED_FUNC(Pad, CPU, PadFunc); -REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(Pad, GPU, PadFunc); -REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp deleted file mode 100644 index 129e9334582fad011c259e8ab8268b00a7fab7b6..0000000000000000000000000000000000000000 --- a/paddle/function/RowConvOp.cpp +++ /dev/null @@ -1,225 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
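PadFunc above zero-fills the output and then copies each input row into its shifted position; the heart of it is the linear-offset arithmetic in the Pad specialisation. A small standalone sketch of the same row copy for one NCHW tensor, with illustrative names rather than the original BufferArg plumbing.

#include <cstddef>
#include <cstring>
#include <vector>

// Copy an NCHW input into a zero-initialised padded output, adding
// cstart/hstart/wstart zeros before (and cend/hend/wend after) the
// channel, height and width dimensions respectively.
void padCopy(std::vector<float>& out, const std::vector<float>& in,
             int num, int inC, int inH, int inW,
             int cstart, int cend, int hstart, int hend, int wstart, int wend) {
  const int outC = inC + cstart + cend;
  const int outH = inH + hstart + hend;
  const int outW = inW + wstart + wend;
  out.assign(static_cast<std::size_t>(num) * outC * outH * outW, 0.f);
  for (int i = 0; i < num; ++i)
    for (int c = 0; c < inC; ++c)
      for (int h = 0; h < inH; ++h) {
        const int inoff = ((i * inC + c) * inH + h) * inW;
        const int outoff =
            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
        std::memcpy(&out[outoff], &in[inoff], inW * sizeof(float));
      }
}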
*/ - -#include "RowConvOp.h" -#include -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void RowConv(CpuMatrix& out, - const CpuMatrix& in, - const CpuMatrix& filter, - const CpuIVector& seq) { - const int* starts = seq.getData(); - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - for (size_t i = 0; i < numSeq; ++i) { - size_t begin = starts[i]; - size_t end = starts[i + 1]; - for (size_t j = begin; j < end; ++j) { - MatrixPtr x; - MatrixPtr w; - if ((j + contextLength) < end) { - x = (const_cast(in)).subMatrix(j, contextLength); - w = (const_cast(filter)).subMatrix(0, contextLength); - } else { - x = (const_cast(in)).subMatrix(j, end - j); - w = (const_cast(filter)).subMatrix(0, end - j); - } - MatrixPtr y = out.subMatrix(j, 1); - y->addDotMulVMM(*x, *w); - } - } -} - -template <> -void RowConvGrad(const CpuMatrix& outG, - const CpuMatrix& in, - const CpuMatrix& filter, - CpuMatrix& inG, - CpuMatrix& filterG, - const CpuIVector& seq) { - // gradient w.r.t filter - const int* starts = seq.getData(); - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - if (filterG) { - for (size_t i = 0; i < numSeq; ++i) { - size_t begin = starts[i]; - size_t end = starts[i + 1]; - size_t steps = end - begin; - for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) { - MatrixPtr x = - (const_cast(in)).subMatrix(begin + j, steps - j); - MatrixPtr dy = - (const_cast(outG)).subMatrix(begin, steps - j); - MatrixPtr dw = filterG.subMatrix(j, 1); - dw->addDotMulVMM(*dy, *x); - } - } - } - - // gradient w.r.t input feature - if (inG) { - for (size_t i = 0; i < numSeq; ++i) { - size_t begin = starts[i]; - size_t end = starts[i + 1]; - size_t steps = end - begin; - for (size_t j = 0; j < steps; ++j) { - MatrixPtr dx = inG.subMatrix(begin + j, 1); - for (size_t t = 0; t < contextLength; ++t) { - if (int(j - t) >= 0) { - MatrixPtr dy = - (const_cast(outG)).subMatrix(begin + j - t, 1); - MatrixPtr w = (const_cast(filter)).subMatrix(t, 1); - dx->addDotMul(*dy, *w, 1.0, 1.0); - } - } - } - } - } -} - -/** - * \brief The row convolution is called lookahead convolution. It is firstly - * introduced in deep-speech2 system. The bidirectional RNN that learns - * representation for a sequence by performing a forward and a backward pass - * through the entire sequence. However, unlike unidirectional RNNs, - * bidirectional RNNs are challenging to deploy in an online and low-latency - * setting. The lookahead convolution incorporates information from future - * subsequences in a computationally efficient manner to improve unidirectional - * recurrent neural networks. - * - * The connection of row convolution is different form the 1D sequence - * convolution. Assumed that, the future context-length is k, that is to say, - * it can get the output at timestep t by using the the input feature from t-th - * timestep to (t+k)-th timestep. Assumed that the hidden dim of input - * activations are d, the activations r_t for the new layer at time-step t are: - * - * - * -- k + 1 - * r(t,i) = > W(i,j) * h(t+j-1, i), for (1 <= i <= d) - * -- j = 1 - * - * - * The weight shape is: (k + 1) x d - * Function Arguments: - * - * \param inputs[0] The input activations. - * \param inputs[0] The filter (or weight) and shape is (k+1) x d. - * \param outputs[1] The output activations. - * - * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in - * English - * and Mandarin. 
https://arxiv.org/abs/1512.02595 - */ - -template -class RowConvFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - // check - CHECK_EQ(2UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - // TODO(qingqing): support ASSIGN_TO. - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) - << "SequenceArg required here."; - const auto in = dynamic_cast(inputs[0]); - auto out = dynamic_cast(outputs[0]); - auto w = inputs[1]; - CHECK(in.data() && out.data() && in.getSequenceId().data()); - CHECK_EQ(in.shape().ndims(), 2UL); - CHECK(in.shape() == out.shape()); - CHECK_EQ(w.shape()[1], in.shape()[1]); - - auto outMat = out.matrix(); - const auto inMat = in.matrix(); - const auto wMat = w.matrix(); - const auto seqId = in.getSequenceId().vector(); - - RowConv(outMat, inMat, wMat, seqId); - } -}; - -/** - * \brief The backward of row convolution function. This function calculated - * the gradient w.r.t filter and the gradient w.r.t input activations(or data). - * - * Argument in this Function: - * - * \param inputs[0] The gradient w.r.t output activations. - * \param inputs[1] The input activations. - * \param inputs[2] The filter (or weight) and shape is (k+1) x d. - * \param outputs[0] The gradient w.r.t input activations. - * \param outputs[1] The gradient w.r.r filter. - * - * Abbreviation: - * w.r.t: with respect to. - */ - -template -class RowConvGradFunc : public FunctionBase { - // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - // check - CHECK_EQ(3UL, inputs.size()); - CHECK_EQ(2UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - CHECK_EQ(outputs[1].getArgType(), ADD_TO); - CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() && - outputs[0].isSequenceArg()) - << "SequenceArg required here."; - - const auto outGrad = dynamic_cast(inputs[0]); - const auto in = dynamic_cast(inputs[1]); - const auto w = inputs[2]; - auto inGrad = dynamic_cast(outputs[0]); - auto wGrad = outputs[1]; - - CHECK_EQ(in.shape().ndims(), 2UL); - CHECK(in.shape() == inGrad.shape()); - CHECK(in.shape() == outGrad.shape()); - CHECK_EQ(wGrad.shape()[1], in.shape()[1]); - - const auto outGMat = outGrad.matrix(); - const auto inMat = in.matrix(); - const auto wMat = w.matrix(); - auto inGMat = inGrad.data() - ? inGrad.matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - auto wGMat = wGrad.data() - ? wGrad.matrix() - : typename Tensor::Matrix(nullptr, 0, 0); - const auto seqId = in.getSequenceId().vector(); - - RowConvGrad(outGMat, inMat, wMat, inGMat, wGMat, seqId); - } -}; - -REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc); -REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc); -REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu deleted file mode 100644 index f820ee9a9713ce17547aa03945dc3c291ef50a59..0000000000000000000000000000000000000000 --- a/paddle/function/RowConvOpGpu.cu +++ /dev/null @@ -1,373 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
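The lookahead (row) convolution described above combines each timestep with up to contextLength future timesteps, elementwise per hidden dimension, truncating at the sequence end. A scalar sketch over a single sequence, assuming row-major [steps x dim] activations and a [context x dim] filter; names are illustrative.

#include <algorithm>
#include <vector>

// out(t, i) += sum_{j=0}^{context-1} filter(j, i) * in(t + j, i),
// truncated where t + j would run past the end of the sequence.
void rowConv(std::vector<float>& out, const std::vector<float>& in,
             const std::vector<float>& filter,
             int steps, int dim, int context) {
  for (int t = 0; t < steps; ++t) {
    const int span = std::min(context, steps - t);
    for (int j = 0; j < span; ++j)
      for (int i = 0; i < dim; ++i)
        out[t * dim + i] += filter[j * dim + i] * in[(t + j) * dim + i];
  }
}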
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/cuda/include/hl_base.h" -#include "paddle/function/RowConvOp.h" - -namespace paddle { - -template -__global__ void KeRowConv(real* y, - const real* x, - const real* w, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sw[BLOCK_H][BLOCK_W]; - - for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; - } - - __syncthreads(); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - real sum = 0; - int off = (start + j) * width; - for (int t = 0; t < context; ++t) { - if ((start + j + t) < end) { - int xoff = off + t * width; - real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; - sum += sw[t][tidx] * xVal; - } - } - if (gidx + tidx < width) { - y[off + gidx + tidx] += sum; - } - } - } -} - -__global__ void KeRowConv2(real* y, - const real* x, - const real* w, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - int off = (start + j) * width; - real sum = 0; - for (int t = 0; t < context && (start + j + t) < end; ++t) { - int xoff = off + t * width; - real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; - real wd = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; - sum += wd * xd; - } - if (gidx + tidx < width) { - y[off + gidx + tidx] += sum; - } - } - } -} - -template <> -void RowConv(GpuMatrix& out, // NOLINT - const GpuMatrix& in, - const GpuMatrix& filter, - const GpuIVector& seq) { - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - const size_t height = in.getHeight(); - const size_t width = in.getWidth(); - - real* y = out.getData(); - const real* x = in.getData(); - const real* w = filter.getData(); - const int* starts = seq.getData(); - - dim3 dimBlock(32, 32); - dim3 dimGrid(DIVUP(width, dimBlock.x), 1); - - if (contextLength <= 32) { - KeRowConv<32, 32><<>>( - y, x, w, starts, height, width, numSeq, contextLength); - } else { - KeRowConv2<<>>( - y, x, w, starts, height, width, numSeq, contextLength); - } - CHECK_SYNC("RowConv"); -} - -template -__global__ void KeRowConvBwWeight(real* dw, - const real* x, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sh_x[BLOCK_W][BLOCK_H]; - __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1]; - __shared__ real sh_dw[CONTEXT][BLOCK_W]; - - if (tidy < context) { - sh_dw[tidy][tidx] = 0.0; - } - __syncthreads(); - - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; - for (int j = tidy; j < size; j += BLOCK_H) { - int xoff = gidx + tidx; - int yoff = start + j; - - // transpose - sh_x[tidx][tidy] = - (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = - (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; - __syncthreads(); - if (tidy < (context - 1)) { - yoff = yoff - context + 1; - sh_dy[tidx][tidy] = - (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; - } - __syncthreads(); - - for (int t = 0; t < context; t++) { - real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t]; - __syncthreads(); - // warp size and blockDim.x is 32. - - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); - - __syncthreads(); - if (tidx == 0) { - sh_dw[t][tidy] += val; - } - __syncthreads(); - } - } - } - - for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) { - dw[t * width + gidx + tidx] += sh_dw[t][tidx]; - } -} - -template -__global__ void KeRowConvBwWeight2(real* dw, - const real* x, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sh_x[BLOCK_H][BLOCK_W]; - __shared__ real sh_dy[BLOCK_H][BLOCK_W]; - - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - - const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; - for (int j = tidy; j < size; j += BLOCK_H) { - int xoff = gidx + tidx; - int yoff = start + j; - - // transpose - sh_x[tidx][tidy] = - (xoff < width && yoff < end) ? 
x[yoff * width + xoff] : 0.0; - __syncthreads(); - - for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = - (xoff < width && (yoff - t) >= start && yoff - t < end) - ? dy[(yoff - t) * width + xoff] - : 0.0; - __syncthreads(); - - real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; - __syncthreads(); - // warp size and blockDim.x is 32. - for (int offset = 16; offset > 0; offset /= 2) - val += __shfl_down_sync(mask, val, offset); - - __syncthreads(); - - if (tidx == 0 && (gidx + tidy) < width) { - dw[t * width + gidx + tidy] += val; - } - } - } - } -} - -template -__global__ void KeRowConvBwData(real* dx, - const real* w, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - __shared__ real sw[BLOCK_H][BLOCK_W]; - - for (int i = tidy; i < context; i += blky) { - sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; - } - - __syncthreads(); - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - real sum = 0; - int off = (start + j) * width; - for (int t = 0; t < context && (j - t) >= 0; ++t) { - int dyOff = off - t * width; - real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; - sum += sw[t][tidx] * dyVal; - } - if (gidx + tidx < width) { - dx[off + gidx + tidx] += sum; - } - } - } -} - -__global__ void KeRowConvBwData2(real* dx, - const real* w, - const real* dy, - const int* starts, - const int height, - const int width, - const int numSeq, - const int context) { - const int tidx = threadIdx.x; - const int tidy = threadIdx.y; - const int blky = blockDim.y; - const int gidx = blockIdx.x * blockDim.x; - - for (int i = 0; i < numSeq; ++i) { - const int start = starts[i]; - const int end = starts[i + 1]; - const int steps = end - start; - for (int j = tidy; j < steps; j += blky) { - real sum = 0; - int off = (start + j) * width; - for (int t = 0; t < context && (j - t) >= 0; ++t) { - int dyOff = off - t * width; - real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; - real wVal = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; - sum += wVal * dyVal; - } - if (gidx + tidx < width) { - dx[off + gidx + tidx] += sum; - } - } - } -} - -template <> -void RowConvGrad(const GpuMatrix& outG, - const GpuMatrix& in, - const GpuMatrix& filter, - GpuMatrix& inG, // NOLINT - GpuMatrix& filterG, // NOLINT - const GpuIVector& seq) { - const size_t numSeq = seq.getSize() - 1; - const size_t contextLength = filter.getHeight(); - const size_t height = in.getHeight(); - const size_t width = in.getWidth(); - - const real* dy = outG.getData(); - const real* x = in.getData(); - const real* w = filter.getData(); - const int* starts = seq.getData(); - - if (filterG) { - dim3 dimBlock(32, 32); - dim3 dimGrid(DIVUP(width, dimBlock.x), 1); - real* dw = filterG.getData(); - if (contextLength <= 32) { - KeRowConvBwWeight<32, 32, 32><<>>( - dw, x, dy, starts, height, width, numSeq, contextLength); - } else { - KeRowConvBwWeight2<32, 32><<>>( - dw, x, dy, starts, height, width, numSeq, contextLength); - } - } - - if (inG) { - real* dx = inG.getData(); - dim3 dimBlock2(32, 32); - dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); - if (contextLength <= 64) { - KeRowConvBwData<32, 64><<>>( - dx, w, dy, starts, height, width, numSeq, contextLength); - } else { - KeRowConvBwData2<<>>( - dx, w, dy, starts, height, width, numSeq, contextLength); - } - } - - CHECK_SYNC("RowConvGrad"); -} - -} // namespace paddle diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp deleted file mode 100644 index 9a06ef2a96f25b5b7326049df2a708637f319561..0000000000000000000000000000000000000000 --- a/paddle/function/ScaleSubRegionOp.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ScaleSubRegionOp.h" -#include "paddle/function/TensorShape.h" - -namespace paddle { - -template <> -void ScaleSubRegion(real* outputs, - const real* inputs, - const real* indices, - const TensorShape shape, - const FuncConfig& conf) { - real value = conf.get("value"); - - int number = shape[0]; - int channel = shape[1]; - int height = shape[2]; - int width = shape[3]; - - memcpy(outputs, inputs, number * channel * height * width * sizeof(real)); - - for (int n = 0; n < number; ++n) { - // indices start from 1 - int offset = n * 6; - for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) { - for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) { - for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) { - int idx = ((n * channel + c) * height + h) * width + w; - outputs[idx] *= value; - } - } - } - } -} - -template <> -void ScaleSubRegionGrad(const real* inGrad, - real* outGrad, - const real* indices, - const TensorShape shape, - const FuncConfig& conf) { - real value = conf.get("value"); - - int number = shape[0]; - int channel = shape[1]; - int height = shape[2]; - int width = shape[3]; - - for (int n = 0; n < number; ++n) { - for (int c = 0; c < channel; ++c) { - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - int idx = ((n * channel + c) * height + h) * width + w; - int offset = n * 6; - if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) && - h >= (indices[offset + 2] - 1) && - h <= (indices[offset + 3] - 1) && - w >= (indices[offset + 4] - 1) && - w <= (indices[offset + 5] - 1)) { - outGrad[idx] += inGrad[idx] * value; - } else { - outGrad[idx] += inGrad[idx]; - } - } - } - } - } -} - -/** - * \brief For each instance, ScaleSubRegion can be used to multiply a value to - * a specified sub continuous region. By providing start index and end - * index for C/H/W, you can specify the location and shape of the region. - * - * Argument in this Function: - * \param inputs A 4-D tensor with shape [N, C, H, W], only one input. - * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. - * \param outputs A 4-D tensor with same shape as inputs, output value. - */ -template -class ScaleSubRegionFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(2UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - - TensorShape shape = inputs[0].shape(); - - ScaleSubRegion(outputs[0].data(), - inputs[0].data(), - inputs[1].data(), - shape, - conf_); - } - - private: - FuncConfig conf_; -}; - -/** - * \brief The backward propagation of ScaleSubRegion Function. - * - * Argument in this Function: - * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient. - * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. - * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value. 
- */ - -template -class ScaleSubRegionGradFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override { conf_ = config; } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(2UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - TensorShape shape = inputs[0].shape(); - - ScaleSubRegionGrad(inputs[0].data(), - outputs[0].data(), - inputs[1].data(), - shape, - conf_); - } - - private: - FuncConfig conf_; -}; - -REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc); -REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc); -REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp deleted file mode 100644 index 750fb6bf28baf050b1f9f965a1a9b315363e5645..0000000000000000000000000000000000000000 --- a/paddle/function/SwitchOp.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "SwitchOp.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -template <> -void NCHW2NHWC(real* outputs, - const real* inputs, - const int num, - const int inC, - const int inH, - const int inW, - const int argType) { - for (int n = 0; n < num; ++n) { - for (int c = 0; c < inC; ++c) { - for (int h = 0; h < inH; ++h) { - for (int w = 0; w < inW; ++w) { - if (argType == ADD_TO) { - outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++); - } else { - outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++); - } - } - } - } - } -} - -template <> -void NHWC2NCHW(real* outputs, - const real* inputs, - const int num, - const int inH, - const int inW, - const int inC, - const int argType) { - for (int n = 0; n < num; ++n) { - for (int h = 0; h < inH; ++h) { - for (int w = 0; w < inW; ++w) { - for (int c = 0; c < inC; ++c) { - if (argType == ADD_TO) { - outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++); - } else { - outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++); - } - } - } - } - } -} - -/** - * \brief Switch dimension order of image input. - * The input and output is a 4D tensor. Switch order - * 'batch_size,channels, height, width' to - * order 'batch_size, height, width, channels'. - * - * Argument in this Function: - * \param inputs input data with order 'batch_size,channels, height, width'. - * \param outputs output data with order 'batch_size, height, width, channels'. 
- */ -template -class NCHW2NHWCFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - - size_t num = inputs[0].shape()[0]; - size_t inC = inputs[0].shape()[1]; - size_t inH = inputs[0].shape()[2]; - size_t inW = inputs[0].shape()[3]; - NCHW2NHWC(outputs[0].data(), - inputs[0].data(), - num, - inC, - inH, - inW, - outputs[0].getArgType()); - } -}; - -/** - * \brief Switch dimension order of image input. - * The input and output is a 4D tensor. Switch order - * 'batch_size, height, width, channels' to - * order 'batch_size, channels, height, width'. - * - * Argument in this Function: - * \param inputs input data with order 'batch_size, height, width, channels'. - * \param outputs output data with order 'batch_size, channels, height, width'. - */ -template -class NHWC2NCHWFunc : public FunctionBase { - public: - void init(const FuncConfig& config) override {} - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(1UL, inputs.size()); - CHECK_EQ(1UL, outputs.size()); - - size_t num = inputs[0].shape()[0]; - size_t inH = inputs[0].shape()[1]; - size_t inW = inputs[0].shape()[2]; - size_t inC = inputs[0].shape()[3]; - - NHWC2NCHW(outputs[0].data(), - inputs[0].data(), - num, - inH, - inW, - inC, - outputs[0].getArgType()); - } -}; - -REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc); -REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc); -#ifdef PADDLE_WITH_CUDA -REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc); -REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc); -#endif - -} // namespace paddle diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h deleted file mode 100644 index b384591bd8852bbdc61bf9aa678ce613732c369a..0000000000000000000000000000000000000000 --- a/paddle/function/TensorType.h +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
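Both switch functions above are pure layout permutations: the element at (n, c, h, w) in NCHW storage moves to (n, h, w, c) in NHWC storage, and the inverse swaps the two index expressions. A compact sketch of the forward mapping for the ASSIGN_TO case, with illustrative names.

#include <vector>

// Reorder a dense NCHW tensor into NHWC layout.
std::vector<float> nchwToNhwc(const std::vector<float>& in,
                              int num, int c, int h, int w) {
  std::vector<float> out(in.size());
  for (int n = 0; n < num; ++n)
    for (int ci = 0; ci < c; ++ci)
      for (int hi = 0; hi < h; ++hi)
        for (int wi = 0; wi < w; ++wi)
          out[((n * h + hi) * w + wi) * c + ci] =
              in[((n * c + ci) * h + hi) * w + wi];
  return out;
}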
*/ - -#pragma once - -#include "paddle/math/Matrix.h" - -namespace paddle { - -enum ValueType { - VALUE_TYPE_INT32 = 0, - VALUE_TYPE_FLOAT = 1, - VALUE_TYPE_DOUBLE = 2, - VALUE_TYPE_BYTE = 3 -}; - -enum DeviceType { - DEVICE_TYPE_UNSPECIFIED = 0, - DEVICE_TYPE_CPU = 1, - DEVICE_TYPE_GPU = 2 -}; - -enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 }; - -enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 }; - -inline int sizeOfValuType(ValueType valueType) { - if (valueType == VALUE_TYPE_INT32) { - return 4; - } else if (valueType == VALUE_TYPE_FLOAT) { - return 4; - } else if (valueType == VALUE_TYPE_DOUBLE) { - return 8; - } else { - LOG(FATAL) << "Unknown type: " << valueType; - return 0; - } -} - -template -struct DataType; - -template <> -struct DataType { - static const ValueType value = VALUE_TYPE_FLOAT; -}; - -template <> -struct DataType { - static const ValueType value = VALUE_TYPE_DOUBLE; -}; - -template <> -struct DataType { - static const ValueType value = VALUE_TYPE_INT32; -}; - -namespace detail { - -template -struct MatrixT; - -template <> -struct MatrixT { - using type = CpuMatrix; -}; - -template <> -struct MatrixT { - using type = GpuMatrix; -}; - -template <> -struct MatrixT { - using type = void; // Not implemented -}; - -template <> -struct MatrixT { - using type = void; // Not implemented -}; - -template -struct SparseMatrixT; - -template <> -struct SparseMatrixT { - using type = CpuSparseMatrix; -}; - -template <> -struct SparseMatrixT { - using type = GpuSparseMatrix; -}; - -template <> -struct SparseMatrixT { - using type = void; // Not implemented -}; - -template <> -struct SparseMatrixT { - using type = void; // Not implemented -}; - -template -struct VectorT; - -template <> -struct VectorT { - using type = CpuVector; -}; - -template <> -struct VectorT { - using type = GpuVector; -}; - -template <> -struct VectorT { - using type = CpuIVector; -}; - -template <> -struct VectorT { - using type = GpuIVector; -}; - -} // namespace detail - -template -struct Tensor { - typedef typename detail::VectorT::type Vector; - typedef typename detail::MatrixT::type Matrix; - typedef typename detail::SparseMatrixT::type SparseMatrix; -}; - -} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp deleted file mode 100644 index d7ac83da41aaba5cd38b042d0381dea527f9c42d..0000000000000000000000000000000000000000 --- a/paddle/function/neon/NeonDepthwiseConv.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
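TensorType.h above maps a (value type, device type) pair onto concrete matrix and vector classes purely through template specialisation, so a Function body can be written once per device. A self-contained sketch of the same trait pattern; CpuMatrix and GpuMatrix here are empty placeholders, not the real paddle classes.

#include <type_traits>

enum DeviceType { DEVICE_TYPE_CPU = 1, DEVICE_TYPE_GPU = 2 };

struct CpuMatrix {};  // placeholder for paddle::CpuMatrix
struct GpuMatrix {};  // placeholder for paddle::GpuMatrix

template <typename VType, DeviceType Device>
struct MatrixT;
template <typename VType>
struct MatrixT<VType, DEVICE_TYPE_CPU> { using type = CpuMatrix; };
template <typename VType>
struct MatrixT<VType, DEVICE_TYPE_GPU> { using type = GpuMatrix; };

template <typename VType, DeviceType Device>
struct Tensor {
  using Matrix = typename MatrixT<VType, Device>::type;
};

// A device-templated function body can then name the right matrix type:
static_assert(std::is_same<Tensor<float, DEVICE_TYPE_CPU>::Matrix,
                           CpuMatrix>::value,
              "the CPU instantiation resolves to CpuMatrix");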
*/ - -#include "NeonDepthwiseConv.h" -#include "paddle/function/ConvOp.h" - -namespace paddle { - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -template -class NeonDepthwiseConvFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - int batchSize = input[0]; - int inputChannels = input[1]; - int inputHeight = input[2]; - int inputWidth = input[3]; - int filterHeight = getFilterHeight(filter); - int filterWidth = getFilterWidth(filter); - int outputChannels = output[1]; - int outputHeight = output[2]; - int outputWidth = output[3]; - int filterMultiplier = outputChannels / groups_; - CHECK_EQ(static_cast(inputChannels), groups_); - - // only support strideH() == strideW() and filterHeight == filterWidth. - CHECK_EQ(strideH(), strideW()); - CHECK_EQ(filterHeight, filterWidth); - - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - - // padding the input - float* inputPadding = inputData; - int padInputHeight = inputHeight + 2 * paddingH(); - int padInputWidth = inputWidth + 2 * paddingW(); - int newSize = - batchSize * (inputChannels + 1) * padInputHeight * padInputWidth; - - resizeBuffer(newSize); - inputPadding = reinterpret_cast(memory_->getBuf()); - neon::Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); - - std::function - DepthWiseConv; - - if (filterWidth == 3 && strideW() == 1) { - DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; - } else if (filterWidth == 3 && strideW() == 2) { - DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run; - } else if (filterWidth == 4 && strideW() == 1) { - DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; - } else if (filterWidth == 4 && strideW() == 2) { - DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run; - } else { - LOG(FATAL) << "Not supported"; - } - - for (int i = 0; i < batchSize; i++) { - DepthWiseConv(inputPadding, - filterData, - padInputHeight, - padInputWidth, - outputChannels, - outputHeight, - outputWidth, - filterMultiplier, - outputData); - inputPadding += inputChannels * padInputHeight * padInputWidth; - outputData += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifndef PADDLE_TYPE_DOUBLE -REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); -#endif - -#endif - -} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp deleted file mode 100644 index 1fc5daf6078bbd5b4506ff2e0832e2cc3ec48fe3..0000000000000000000000000000000000000000 --- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "NeonDepthwiseConv.h" -#include "paddle/function/ConvOp.h" - -namespace paddle { - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - -template -class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - check(inputs, outputs); - - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - int batchSize = input[0]; - int inputChannels = input[1]; - int inputHeight = input[2]; - int inputWidth = input[3]; - int filterHeight = getFilterHeight(filter); - int filterWidth = getFilterWidth(filter); - int outputChannels = output[1]; - int outputHeight = output[2]; - int outputWidth = output[3]; - int filterMultiplier = outputChannels / groups_; - CHECK_EQ(inputChannels, groups_); - - // only support strideH() == strideW() and filterHeight == filterWidth. 
- CHECK_EQ(strideH(), strideW()); - CHECK_EQ(paddingH(), paddingW()); - CHECK_EQ(filterHeight, filterWidth); - - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - - // padding the input, input -> inputPadding - float* inputPadding = inputData; - int padInputHeight = - (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH(); - int padInputWidth = - (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW(); - - if (padInputHeight > inputHeight || padInputWidth > inputWidth) { - int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; - resizeBuffer(newSize); - inputPadding = reinterpret_cast(memory_->getBuf()); - if (strideH() == 1) { - neon::Padding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); - } else if (strideH() == 2) { - neon::StridePadding::run(inputData, - inputPadding, - batchSize * inputChannels, - inputHeight, - inputWidth, - padInputHeight, - padInputWidth); - } else { - LOG(FATAL) << "Not supported"; - } - } - - std::function - DepthWiseConv; - - if (filterWidth == 3) { - DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; - } else if (filterWidth == 4) { - DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; - } else { - LOG(FATAL) << "Not supported"; - } - - for (int i = 0; i < batchSize; i++) { - DepthWiseConv(inputPadding, - filterData, - padInputHeight, - padInputWidth, - outputChannels, - outputHeight, - outputWidth, - filterMultiplier, - outputData); - inputPadding += inputChannels * padInputHeight * padInputWidth; - outputData += outputChannels * outputHeight * outputWidth; - } - } -}; - -#ifndef PADDLE_TYPE_DOUBLE - -REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose, - CPU, - NeonDepthwiseConvTransposeFunction); - -#endif - -#endif - -} // namespace paddle diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp deleted file mode 100644 index 48c997b50d8c73b25c58801c30e597c9d1f3232a..0000000000000000000000000000000000000000 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
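For the transposed (deconvolution) variant, the input is expanded before the same forward depthwise kernels are run; the padded extent follows the transposed-convolution relation used in calc() above. A small sketch of that arithmetic with one worked value; the function name is illustrative.

#include <cstdio>

// Padded input extent used by the depthwise transposed convolution:
// (in - 1) * stride + 2 * filter - 1 - 2 * pad, applied per spatial axis.
int transposedPaddedSize(int in, int stride, int filter, int pad) {
  return (in - 1) * stride + 2 * filter - 1 - 2 * pad;
}

int main() {
  // e.g. a 7-wide input, stride 2, 3x3 filter, padding 1 -> 15 padded columns.
  std::printf("%d\n", transposedPaddedSize(7, 2, 3, 1));
}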
*/ - -#include "nnpack.h" -#include "paddle/function/ConvOp.h" - -DEFINE_bool(nnpack_allocate_outside, - true, - "Allocate and free workspace memory outside the NNPACK interface."); -DEFINE_int32(nnpack_num_threads, - 0, - "The number of nnpack threads" - "default: 0; 0 to disable threadpool."); - -namespace paddle { - -nnp_convolution_algorithm get_nnp_convolution_algorithm( - const std::string& algorithm) { - if (algorithm == "auto") { - return nnp_convolution_algorithm_auto; - } else if (algorithm == "ft8x8") { - return nnp_convolution_algorithm_ft8x8; - } else if (algorithm == "ft16x16") { - return nnp_convolution_algorithm_ft16x16; - } else if (algorithm == "wt8x8") { - return nnp_convolution_algorithm_wt8x8; - } else if (algorithm == "implicit-gemm") { - return nnp_convolution_algorithm_implicit_gemm; - } else if (algorithm == "direct") { - return nnp_convolution_algorithm_direct; - } else { - return nnp_convolution_algorithm_auto; - } -} - -template -class NNPACKConvFunction : public ConvFunctionBase { - public: - void init(const FuncConfig& config) override { - ConvFunctionBase::init(config); - algorithm_ = get_nnp_convolution_algorithm(config.get("algo")); - transform_strategy_ = nnp_convolution_transform_strategy_compute; - nnp_status status = nnp_initialize(); - CHECK_EQ(status, nnp_status_success); - workspaceBuffer_ = nullptr; - workspaceSize_ = 0; - - create_nnpack_threadpool(); - } - - ~NNPACKConvFunction() { - if (workspaceBuffer_) { - free(workspaceBuffer_); - } - } - - void check(const BufferArgs& inputs, const BufferArgs& outputs) override { - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - checkShape(input, filter, output); - } - - void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(numInputs_, inputs.size()); - CHECK_EQ(numOutputs_, outputs.size()); - CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - check(inputs, outputs); - const TensorShape& input = inputs[0].shape(); - const TensorShape& filter = inputs[1].shape(); - const TensorShape& output = outputs[0].shape(); - - size_t batchSize = input[0]; - size_t inputChannels = input[1]; - size_t inputHeight = input[2]; - size_t inputWidth = input[3]; - size_t filterHeight = getFilterHeight(filter); - size_t filterWidth = getFilterWidth(filter); - size_t outputChannels = output[1]; - size_t outputHeight = output[2]; - size_t outputWidth = output[3]; - - nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; - nnp_padding padding = {.top = (size_t)paddingH(), - .right = (size_t)paddingW(), - .bottom = (size_t)paddingH(), - .left = (size_t)paddingW()}; - nnp_size kernelSize = {.width = filterWidth, .height = filterHeight}; - nnp_size outputSubsampling = {.width = (size_t)strideW(), - .height = (size_t)strideH()}; - - float* inputData = inputs[0].data(); - float* filterData = inputs[1].data(); - float* outputData = outputs[0].data(); - - void* bufferPtr = nullptr; - size_t* sizePtr = nullptr; - size_t needSize; - if (FLAGS_nnpack_allocate_outside) { - if (batchSize == 1) { - nnp_status status = nnp_convolution_inference(algorithm_, - transform_strategy_, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - outputSubsampling, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - &needSize, - nnp_activation_identity, - nullptr, - nullptr, - nullptr); - CHECK_EQ(status, nnp_status_success); - } else { - // only supports stride = 1 - CHECK_EQ(strideH(), 1); - 
CHECK_EQ(strideW(), 1); - nnp_status status = nnp_convolution_output(algorithm_, - batchSize, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - &needSize, - nnp_activation_identity, - nullptr, - nullptr, - nullptr); - CHECK_EQ(status, nnp_status_success); - } - - VLOG(3) << "workspace size is " << needSize; - if (needSize > workspaceSize_) { - workspaceSize_ = needSize; - if (workspaceBuffer_) { - free(workspaceBuffer_); - } else { - posix_memalign(&workspaceBuffer_, 64, needSize); - } - } - - if (needSize) { - bufferPtr = workspaceBuffer_; - sizePtr = &needSize; - } - } - - size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth; - size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth; - size_t filterOffset = filter.getElements() / groups_; - - if (batchSize == 1) { - for (size_t g = 0; g < groups_; g++) { - nnp_status status = - nnp_convolution_inference(algorithm_, - transform_strategy_, - inputChannels / groups_, - outputChannels / groups_, - inputSize, - padding, - kernelSize, - outputSubsampling, - inputData + inputOffset * g, - filterData + filterOffset * g, - nullptr, /* bias */ - outputData + outputOffset * g, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ - nullptr); - CHECK_EQ(status, nnp_status_success); - } - } else { - // only supports stride = 1 - CHECK_EQ(strideH(), 1); - CHECK_EQ(strideW(), 1); - - // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1. - CHECK_EQ(groups_, static_cast(1)); - nnp_status status = nnp_convolution_output(algorithm_, - batchSize, - inputChannels, - outputChannels, - inputSize, - padding, - kernelSize, - inputData, - filterData, - nullptr, /* bias */ - outputData, - bufferPtr, - sizePtr, - nnp_activation_identity, - nullptr, - threadpool_, /* threadpool */ - nullptr); - CHECK_EQ(status, nnp_status_success); - } - } - - static void create_nnpack_threadpool() { - if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { - threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); - VLOG(3) << "Number of threads " - << pthreadpool_get_threads_count(threadpool_); - } - } - - private: - nnp_convolution_algorithm algorithm_; - nnp_convolution_transform_strategy transform_strategy_; - void* workspaceBuffer_; - size_t workspaceSize_; - static pthreadpool_t threadpool_; -}; - -template -pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; - -REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); - -} // namespace paddle diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp deleted file mode 100644 index c80ffb5d5d255465e9a2fa251fb9a6c61f96e7ec..0000000000000000000000000000000000000000 --- a/paddle/function/nnpack/NNPACKConvOpTest.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
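// A standalone sketch of the per-group pointer arithmetic used in the grouped
// path above: each group g convolves a contiguous slice of the input and
// filter and writes a contiguous slice of the output. The struct and function
// names are illustrative, not part of the deleted file.
#include <cstddef>

struct GroupSlices {
  size_t inputOffset;   // elements per group in the input feature map
  size_t outputOffset;  // elements per group in the output feature map
  size_t filterOffset;  // filter elements per group
};

inline GroupSlices makeGroupSlices(size_t inputChannels, size_t outputChannels,
                                   size_t inputHeight, size_t inputWidth,
                                   size_t outputHeight, size_t outputWidth,
                                   size_t filterElements, size_t groups) {
  return {inputChannels / groups * inputHeight * inputWidth,
          outputChannels / groups * outputHeight * outputWidth,
          filterElements / groups};
}

// Group g then reads input + slices.inputOffset * g and
// filter + slices.filterOffset * g, and writes output + slices.outputOffset * g.
//
// Note on the workspace cache above: when needSize grows and a buffer already
// exists, the code as written frees workspaceBuffer_ but does not appear to
// allocate a replacement before reuse; a grow-only cache would normally free
// and then posix_memalign a fresh 64-byte-aligned block in the same branch.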
*/ - -#include -#include "paddle/function/ConvOpTest.h" - -namespace paddle { - -TEST(NNPACK, Forward) { - Convolution( - "GemmConv-CPU", "NNPACKConv-CPU", forward); -} - -TEST(NNPACK, Depthwise) { - DepthwiseConvolution( - "GemmConv-CPU", "NNPACKConv-CPU", forward); -} - -} // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp deleted file mode 100644 index 71c238fbfe9f32f3764601ebb441336931f8ef5f..0000000000000000000000000000000000000000 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ActivationFunction.h" - -#include -#include -#include -#include -#include -#include -#include "paddle/parameter/Argument.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Logging.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "MKLDNNActivation.h" -#endif - -namespace paddle { - -static ClassRegistrar gActivationRegistrar; -/** - * @def ACTIVATION_CLASS_NAME - * @brief Macro for getting derived activation class name - * @note ACTIVATION_CLASS_NAME(softmax) softmax_; - * means softmaxActivation softmax_; - */ -#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation -/** - * @def BEGIN_DEFINE_ACTIVATION - * @brief Macro for defining a devried activation class - */ -#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME) \ - class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \ - private: \ - static const std::string name; \ - \ - public: \ - const std::string& getName() const { return name; } -/** - * @def END_DEFINE_ACTIVATION - * @brief Macro for registering a derived activation class - */ -#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \ - } \ - ; \ - const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \ - #ACTIVATION_NAME; \ - static InitFunction __reg_activation__##ACTIVATION_NAME([] { \ - gActivationRegistrar \ - .registerClass( \ - #ACTIVATION_NAME); \ - }); - -/** - * @brief The IdentityActivation class - * - * Do nothing when forward/backward. 
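// A simplified model of what the registration macros above build: a global
// name -> factory table filled in by static initializers, queried later by
// ActivationFunction::create(type). This sketch uses the standard library
// rather than Paddle's ClassRegistrar/InitFunction, so every name below is
// illustrative only.
#include <functional>
#include <map>
#include <memory>
#include <string>

struct ToyActivation {
  virtual ~ToyActivation() = default;
  virtual const std::string& getName() const = 0;
};

using ToyFactory = std::function<std::unique_ptr<ToyActivation>()>;

inline std::map<std::string, ToyFactory>& toyRegistry() {
  static std::map<std::string, ToyFactory> registry;
  return registry;
}

// Plays the role of END_DEFINE_ACTIVATION's static InitFunction: runs at startup.
struct ToyRegisterer {
  ToyRegisterer(const std::string& name, ToyFactory factory) {
    toyRegistry().emplace(name, std::move(factory));
  }
};

// Plays the role of gActivationRegistrar.createByType(type).
inline std::unique_ptr<ToyActivation> createToyActivation(const std::string& type) {
  auto it = toyRegistry().find(type);
  return it == toyRegistry().end() ? nullptr : it->second();
}

// Usage sketch: a concrete type registers itself once, then is created by name.
struct ToySigmoid : ToyActivation {
  const std::string& getName() const override {
    static const std::string n = "sigmoid";
    return n;
  }
};
static ToyRegisterer reg_sigmoid("sigmoid", [] {
  return std::unique_ptr<ToyActivation>(new ToySigmoid());
});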
- */ -class IdentityActivation : public ActivationFunction { - public: - static const std::string name; - Error __must_check forward(Argument& act) { - (void)act; - return Error(); - } - Error __must_check backward(Argument& act) { - (void)act; - return Error(); - } - const std::string& getName() const { return name; } -}; -const std::string IdentityActivation::name = ""; -static InitFunction __reg_activation__identity([] { - gActivationRegistrar.registerClass(""); - gActivationRegistrar.registerClass("linear"); -}); - -/** - * @brief Sigmoid Activation - * \f[ - * f(z) = \frac{1}{1+exp(-z)} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(sigmoid) -Error __must_check forward(Argument& act) { - act.value->sigmoid(*act.value); - return Error(); -} -Error __must_check backward(Argument& act) { - act.grad->sigmoidDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(sigmoid) - -/** - * @brief Softmax Activation - * \f[ - * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(softmax) -private: -MatrixPtr sftMaxSum_; -MatrixPtr sftMaxDot_; - -public: -Error __must_check forward(Argument& act) { - act.value->softmax(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - MatrixPtr outputV = act.value; - MatrixPtr outputG = act.grad; - - if (outputG->useGpu()) { - outputG->softmaxBackward(*outputV); - } else { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(sftMaxDot_, - outputG->getHeight(), - outputG->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - Matrix::resizeOrCreate(sftMaxSum_, - outputG->getHeight(), - 1, - /* trans */ false, - useGpu(act.deviceId)); - - sftMaxDot_->dotMul(*outputG, *outputV); - sftMaxSum_->colMerge(*sftMaxDot_); - - act.grad->softmaxDerivative(*act.value, *sftMaxSum_); - } - return Error(); -} -END_DEFINE_ACTIVATION(softmax) - -/** - * @brief Sequence_softmax Activation - * @note Softmax on all frames of one sequence. - * Width of frame must be one. - */ -BEGIN_DEFINE_ACTIVATION(sequence_softmax) -private: -ACTIVATION_CLASS_NAME(softmax) softmax_; -Argument argument_; - -public: -Error __must_check forward(Argument& act) { - if (act.value->getWidth() != 1UL) { - return Error( - "Input width for each timestep of sequence softmax should be 1"); - } - - if (!argument_.value) { - argument_.value = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - useGpu(act.deviceId)); - argument_.grad = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - useGpu(act.deviceId)); - } - - auto starts = - act.hasSubseq() - ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId)) - : act.sequenceStartPositions->getVector(useGpu(act.deviceId)); - act.value->sequenceSoftmax(*act.value, *starts); - return Error(); -} - -Error __must_check backward(Argument& act) { - if (act.value->getWidth() != 1UL) { - return Error( - "Input width for each timestep of sequence softmax should be 1"); - } - - size_t numSequences = - act.hasSubseq() ? 
act.getNumSubSequences() : act.getNumSequences(); - const int* starts = act.getCpuStartPositions(); - - for (size_t i = 0; i < numSequences; ++i) { - // TODO(Dangqingqing) optimization for GPU - size_t offset = starts[i]; - size_t size = starts[i + 1] - starts[i]; - argument_.value->setData(act.value->getData() + offset, 1UL, size); - argument_.grad->setData(act.grad->getData() + offset, 1UL, size); - - Error err = softmax_.backward(argument_); - if (!err.isOK()) return err; - } - return Error(); -} -END_DEFINE_ACTIVATION(sequence_softmax) - -/* - * @brief SoftSign Activation. - * \f[ - * f(z) = \frac{z}{1 + |z|} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(softsign) -private: -MatrixPtr denominator_; - -Error __must_check forward(Argument& act) { - size_t height = act.value->getHeight(); - size_t width = act.value->getWidth(); - Matrix::resizeOrCreate( - denominator_, height, width, false, useGpu(act.deviceId)); - denominator_->assign(*act.value); - denominator_->abs2(); - denominator_->add(1.); - - act.value->dotDiv(*act.value, *denominator_); - return Error(); -} - -Error __must_check backward(Argument& act) { - denominator_->square2(); - denominator_->scalarDiv(*denominator_, 1.); - act.grad->dotMul(*act.grad, *denominator_); - return Error(); -} -END_DEFINE_ACTIVATION(softsign) - -/** - * @brief Relu Activation. - * forward. y = max(0, z) - * - * derivative of relu is: - * - * 1 if z > 0 - * - * 0 otherwise. - */ -BEGIN_DEFINE_ACTIVATION(relu) -Error __must_check forward(Argument& act) { - act.value->relu(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->reluDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(relu) - -/** - * @brief BRelu Activation. - * - * forward. y = min(24, max(0, z)) - * - * derivative of brelu is: - * - * 1 if 0 < z < 24 - * - * 0 otherwise. - * - * TODO(yuyang18): Remove magic number 24 or make it configuable. - */ -BEGIN_DEFINE_ACTIVATION(brelu) -Error __must_check forward(Argument& act) { - act.value->brelu(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->breluDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(brelu) - -/** - * @brief Tanh Activation. - * \f[ - * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}} - * \f] - */ -BEGIN_DEFINE_ACTIVATION(tanh) -Error __must_check forward(Argument& act) { - act.value->tanh(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->tanhDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(tanh) - -/** - * @brief Scaled Tanh Activation - * \f[ - * f(z) = 1.7159 * tanh(2/3*z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(stanh) -private: -real a, b; - -public: -ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {} -Error __must_check forward(Argument& act) { - act.value->scaledTanh(*act.value, a, b); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->scaledTanhDerivative(*act.value, a, b); - return Error(); -} -END_DEFINE_ACTIVATION(stanh) - -/** - * @brief Soft Relu Activation. - * \f[ - * f(z) = ln(1+e^z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(softrelu) -Error __must_check forward(Argument& act) { - act.value->softrelu(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->softreluDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(softrelu) - -/** - * @brief Abs Activation. 
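// A reference version of the CPU softmax backward used above: with y the
// softmax output and dy the incoming gradient, the row-wise result is
//   dx = y * (dy - sum(dy * y)),
// which is what the dotMul / colMerge / softmaxDerivative sequence computes.
// A standalone sketch operating on one row stored in std::vector.
#include <vector>

inline std::vector<float> softmaxBackwardRow(const std::vector<float>& y,
                                             const std::vector<float>& dy) {
  float dot = 0.f;  // sum(dy * y), the colMerge step above
  for (size_t i = 0; i < y.size(); ++i) dot += dy[i] * y[i];
  std::vector<float> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) dx[i] = y[i] * (dy[i] - dot);
  return dx;
}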
- * Forward: f(z) = abs(z) - * - * Derivative: - * - * 1 if z>0 - * - * -1 if z<0 - * - * 0 if z=0 - */ -BEGIN_DEFINE_ACTIVATION(abs) -Error __must_check forward(Argument& act) { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, - act.value->getHeight(), - act.value->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - - act.in->copyFrom(*act.value); - act.value->abs2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->absDerivative(*act.in); - return Error(); -} -END_DEFINE_ACTIVATION(abs) - -/** - * @brief Square Activation. - * \f[ - * f(z) = z^2. - * \f] - */ -BEGIN_DEFINE_ACTIVATION(square) -Error __must_check forward(Argument& act) { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, - act.value->getHeight(), - act.value->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - - act.in->copyFrom(*act.value); - act.value->square2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->squareDerivative(*act.in); - return Error(); -} -END_DEFINE_ACTIVATION(square) - -/** - * @brief Exponential Activation. - * \f[ - * f(z) = e^z - * \f] - */ -BEGIN_DEFINE_ACTIVATION(exponential) -Error __must_check forward(Argument& act) { - act.value->exp2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->expDerivative(*act.value); - return Error(); -} -END_DEFINE_ACTIVATION(exponential) - -/** - * @brief Reciprocal Activation. - * \f[ - * f(z) = 1/z - * \f] - */ -BEGIN_DEFINE_ACTIVATION(reciprocal) -Error __must_check forward(Argument& act) { - act.value->reciprocal2(); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->dotMulSquare(*act.value); - act.grad->neg(); - return Error(); -} -END_DEFINE_ACTIVATION(reciprocal) - -/** - * @brief Square Root Activation. - * \f[ - * f(z) = sqrt(z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(sqrt) -Error __must_check forward(Argument& act) { - act.value->sqrt2(); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->dotDiv(*act.grad, *act.value); - act.grad->mulScalar(0.5); - return Error(); -} -END_DEFINE_ACTIVATION(sqrt) - -/** - * @brief Logarithm Activation. 
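// A standalone sketch of the two gradient patterns above. Activations that
// overwrite their input in place must stash a copy when the derivative needs
// the original x (abs and square copy act.value into act.in); others can
// derive the gradient from the output y alone (sqrt uses dx = dy / (2 * y),
// matching dotDiv + mulScalar(0.5) above). The vector-based helpers here are
// illustrative, not the Paddle Matrix API.
#include <cmath>
#include <vector>

struct ToySquare {
  std::vector<float> savedInput;  // plays the role of act.in
  void forward(std::vector<float>& v) {
    savedInput = v;                // keep x before overwriting it
    for (float& x : v) x = x * x;  // in-place y = x^2
  }
  void backward(const std::vector<float>& dy, std::vector<float>& dx) const {
    dx.resize(dy.size());
    for (size_t i = 0; i < dy.size(); ++i) dx[i] = 2.f * savedInput[i] * dy[i];
  }
};

struct ToySqrt {
  void forward(std::vector<float>& v) {
    for (float& x : v) x = std::sqrt(x);  // in-place y = sqrt(x)
  }
  // No saved input needed: dy/dx = 1 / (2 * sqrt(x)) = 1 / (2 * y).
  void backward(const std::vector<float>& y, const std::vector<float>& dy,
                std::vector<float>& dx) const {
    dx.resize(dy.size());
    for (size_t i = 0; i < dy.size(); ++i) dx[i] = dy[i] / (2.f * y[i]);
  }
};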
- * \f[ - * f(z) = log(z) - * \f] - */ -BEGIN_DEFINE_ACTIVATION(log) -Error __must_check forward(Argument& act) { - SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, - act.value->getHeight(), - act.value->getWidth(), - /* trans */ false, - useGpu(act.deviceId)); - - act.in->copyFrom(*act.value); - act.value->log2(*act.value); - return Error(); -} - -Error __must_check backward(Argument& act) { - act.grad->dotDiv(*act.grad, *act.in); - return Error(); -} -END_DEFINE_ACTIVATION(log) - -ActivationFunction* ActivationFunction::create(const std::string& type) { -#ifdef PADDLE_WITH_MKLDNN - if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { - return MKLDNNActivation::create(type); - } -#endif - - return gActivationRegistrar.createByType(type); -} - -std::vector ActivationFunction::getAllRegisteredTypes() { - std::vector types; - gActivationRegistrar.forEachType( - [&](const std::string& type) { types.push_back(type); }); - return types; -} - -} // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h deleted file mode 100644 index 8e2e144769f2e668a9a8f02890d29c4a7fe128a3..0000000000000000000000000000000000000000 --- a/paddle/gserver/activations/ActivationFunction.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/utils/Error.h" - -namespace paddle { - -struct Argument; -/** - * @brief Activation function is a function that transforms a set of input - * signals into an output signals. The purpose of the activation function - * is to introduce non-liearilty into the network. - * - * @note Common activation function are provieded, including linear, - * sigmoid, softmax, sequence_max, relu, brelu, tanh, stanh, - * softrelu, abs, square, exponential. - * - */ -class ActivationFunction { - public: - static ActivationFunction* create(const std::string& type); - static std::vector getAllRegisteredTypes(); - - ActivationFunction() {} - - virtual ~ActivationFunction() {} - - /** - * @brief Foward propagation - * - * act.value <- f(act.value), - * where f is the activation function. - * Suppose that before calling forward(), act.value is x and - * after forward() is called, act.value is y, then y = f(x). - * - * Usually, act is Layer::output_ - */ - virtual Error __must_check forward(Argument& act) = 0; - - /** - * @brief Backward propagaion - * - * x and y are defined in the above comment for forward(). 
- * - Before calling backward(), act.grad = dE / dy, where E is the error/cost - * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx) - */ - virtual Error __must_check backward(Argument& act) = 0; - - virtual const std::string& getName() const = 0; -}; - -} // namespace paddle diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp deleted file mode 100644 index 672444c6561adbeb78c3c453f12ab6aaedeed646..0000000000000000000000000000000000000000 --- a/paddle/gserver/activations/MKLDNNActivation.cpp +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MKLDNNActivation.h" -#include "mkldnn.hpp" -#include "paddle/utils/ClassRegistrar.h" - -namespace paddle { - -static ClassRegistrar gMKLDNNActivationRegistrar; -/** - * @def MKLDNN_ACTIVATION_CLASS_NAME - * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_; - * means mkldnn_reluActivation relu_; - */ -#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation - -/** - * @def BEGIN_MKLDNN_ACTIVATION - */ -#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ - class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS { -/** - * @def END_MKLDNN_ACTIVATION - */ -#define END_MKLDNN_ACTIVATION(ACT_TYPE) \ - private: \ - static const std::string name; \ - \ - public: \ - const std::string& getName() const { return name; } \ - } \ - ; \ - const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \ - "mkldnn_" #ACT_TYPE; \ - static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] { \ - gMKLDNNActivationRegistrar \ - .registerClass( \ - "mkldnn_" #ACT_TYPE); \ - }); - -/** - * @def DEFINE_MKLDNN_ACTIVATION - */ -#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ - BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ - END_MKLDNN_ACTIVATION(ACT_TYPE) - -/** - * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION - */ -#define DEFINE_MKLDNN_ELTWISE_ACTIVATION( \ - ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA) \ - BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ - private: \ - static const float alpha; \ - static const float bwdAlpha; \ - \ - public: \ - float getAlpha() const { return alpha; } \ - float getBwdAlpha() const { return bwdAlpha; } \ - END_MKLDNN_ACTIVATION(ACT_TYPE) \ - const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \ - const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; - -/** - * @brief MKLDNN Relu Activation. - * Actually mkldnn_relu is Leaky Relu. - * f(x) = x (x >= 0) - * f(x) = negative_slope * x (x < 0) - * @note the negative_slope should be -0.f in forward - */ -DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f) - -/** - * @brief MKLDNN Tanh Activation. - */ -DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f) - -/** - * @brief MKLDNN ELU(Exponential Linear Unit) Activation. 
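// A plain reference for the leaky-relu semantics described above: alpha is the
// negative_slope, so alpha == 0 degenerates to ordinary relu, which is why the
// registration passes -0.f as the forward alpha. This is a standalone sketch,
// not the MKL-DNN eltwise primitive itself.
#include <vector>

inline void leakyReluForward(std::vector<float>& v, float alpha) {
  for (float& x : v) x = x >= 0.f ? x : alpha * x;
}

// Gradient w.r.t. the input, gated on the sign of the forward input; for
// alpha == 0 the output sign carries the same information.
inline void leakyReluBackward(const std::vector<float>& x,
                              const std::vector<float>& dy,
                              std::vector<float>& dx, float alpha) {
  dx.resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) dx[i] = x[i] >= 0.f ? dy[i] : alpha * dy[i];
}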
- * f(x) = x (x >= 0) - * f(x) = negative_slope * (exp(x) - 1) (x < 0) - */ -DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f) - -mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const { - const std::map algoMap = { - {"relu", algorithm::eltwise_relu}, - {"tanh", algorithm::eltwise_tanh}, - {"elu", algorithm::eltwise_elu}}; - type.erase(0, 7); // remove mkldnn_ - algorithm algo = (algorithm)0; - mapGet(type, algoMap, &algo); - return algo; -} - -void MKLDNNEltwiseActivation::resetFwd(Argument& act) { - if (cnt_ == act.value->getElementCnt()) { - return; - } - MKLDNNActivation::resetFwd(act); - // note: alpha represents the NegativeSlope when used in relu. - float alpha = getAlpha(); - float beta = getBeta(); - algorithm algo = getAlgo(this->getName()); - auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training, - algo, - val_->getMemoryDesc(), - alpha, - beta); - fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_)); - // use inplace for forward but save input value before submit - inVal_ = val_; - copyInVal_ = nullptr; - if (act.grad && algo == algorithm::eltwise_tanh) { - // tanh need save src input for backward - inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc()); - copyInVal_ = std::make_shared(*val_, *inVal_); - CHECK(copyInVal_) << "should not be emptry"; - pipelineFwd_.push_back(*copyInVal_); - } - fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_)); - pipelineFwd_.push_back(*fwd_); - needResetBwd_ = true; -} - -void MKLDNNEltwiseActivation::resetBwd(Argument& act) { - if (!needResetBwd_) { - return; - } - VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; - needResetBwd_ = false; - algorithm algo = getAlgo(this->getName()); - float alpha = getBwdAlpha(); - float beta = getBeta(); - grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad); - auto eng = CPUEngine::Instance().getEngine(); - auto bwdDesc = eltwise_bwd::desc( - algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta); - auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_); - CHECK(inVal_); - bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_)); - pipelineBwd_.clear(); - pipelineBwd_.push_back(*bwd_); -} - -/** - * @brief MKLDNN Softmax Activation - */ -DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation) - -void MKLDNNSoftmaxActivation::resetFwd(Argument& act) { - if (cnt_ == act.value->getElementCnt()) { - return; - } - MKLDNNActivation::resetFwd(act); - int axis = 1; - auto fwdDesc = softmax_fwd::desc( - mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis); - auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_); - fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_)); - pipelineFwd_.push_back(*fwd_); -} - -Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) { - resetFwd(act); - stream_->submit(pipelineFwd_); - real* v = act.value->getData(); - real threshold = exp(-64); -#pragma omp parallel for - for (size_t i = 0; i < act.value->getElementCnt(); ++i) { - v[i] = v[i] < threshold ? 
threshold : v[i]; - } - return Error(); -} - -Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) { - MatrixPtr outputV = act.value; - MatrixPtr outputG = act.grad; - Matrix::resizeOrCreate(sftMaxDot_, - outputG->getHeight(), - outputG->getWidth(), - /* trans */ false, - /* useGpu */ false); - Matrix::resizeOrCreate(sftMaxSum_, - outputG->getHeight(), - 1, - /* trans */ false, - /* useGpu */ false); - sftMaxDot_->dotMul(*outputG, *outputV); - sftMaxSum_->colMerge(*sftMaxDot_); - act.grad->softmaxDerivative(*act.value, *sftMaxSum_); - return Error(); -} - -ActivationFunction* MKLDNNActivation::create(const std::string& type) { - return gMKLDNNActivationRegistrar.createByType(type); -} - -std::vector MKLDNNActivation::getAllRegisteredTypes() { - std::vector types; - gMKLDNNActivationRegistrar.forEachType( - [&](const std::string& type) { types.push_back(type); }); - return types; -} - -void MKLDNNActivation::resetFwd(Argument& act) { - VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; - cnt_ = act.value->getElementCnt(); - pipelineFwd_.clear(); - stream_.reset(new MKLDNNStream()); - engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); - val_ = std::dynamic_pointer_cast(act.value); - if (val_ == nullptr) { - int bs = act.getBatchSize(); - int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1; - int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1; - int ic = cnt_ / bs / ih / iw; - CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw); - val_ = MKLDNNMatrix::create( - {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value); - CHECK(val_); - val_->downSpatial(); - } -} - -Error __must_check MKLDNNActivation::forward(Argument& act) { - resetFwd(act); - stream_->submit(pipelineFwd_); - return Error(); -} -Error __must_check MKLDNNActivation::backward(Argument& act) { - resetBwd(act); - stream_->submit(pipelineBwd_); - return Error(); -} -} // namespace paddle diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h deleted file mode 100644 index eece1b9c37e72624dffd119804c65f7bd36e20fb..0000000000000000000000000000000000000000 --- a/paddle/gserver/activations/MKLDNNActivation.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "ActivationFunction.h" -#include "mkldnn.hpp" -#include "paddle/gserver/layers/MKLDNNBase.h" -#include "paddle/math/MKLDNNMatrix.h" -#include "paddle/parameter/Argument.h" - -namespace paddle { - -/** - * @brief Base class of MKLDNN Activation. - * Common activation function are provieded, - * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax - */ -class MKLDNNActivation : public ActivationFunction { - protected: - // input value element count - size_t cnt_; - // should not merge the resetBwd into resetFwd, - // because the grad data would be changing before backward. 
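// A compact sketch of the caching policy above: forward state is rebuilt only
// when the number of input elements changes, and backward state is rebuilt
// lazily afterwards because the gradient buffer may be swapped between the
// forward and backward passes. Class and member names are illustrative.
#include <cstddef>

class ToyPrimitiveCache {
 public:
  void forward(std::size_t elementCount) {
    if (elementCount != cachedCount_) {
      cachedCount_ = elementCount;
      rebuildForward();      // analogous to resetFwd(act)
      needResetBwd_ = true;  // defer backward setup until it is needed
    }
    // submit the cached forward pipeline here
  }

  void backward() {
    if (needResetBwd_) {
      rebuildBackward();     // analogous to resetBwd(act)
      needResetBwd_ = false;
    }
    // submit the cached backward pipeline here
  }

 private:
  void rebuildForward() {}
  void rebuildBackward() {}
  std::size_t cachedCount_ = 0;
  bool needResetBwd_ = true;
};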
- bool needResetBwd_; - // mkldnn matrix, primitive, stream and pipeline - MKLDNNMatrixPtr val_; - MKLDNNMatrixPtr grad_; - std::shared_ptr engine_; - std::shared_ptr stream_; - std::shared_ptr fwd_; - std::shared_ptr bwd_; - std::vector pipelineFwd_; - std::vector pipelineBwd_; - - public: - MKLDNNActivation() : cnt_(0), needResetBwd_(true) {} - ~MKLDNNActivation() {} - static ActivationFunction* create(const std::string& type); - static std::vector getAllRegisteredTypes(); - virtual const std::string& getName() const = 0; - /** - * reset the forward primitives - */ - virtual void resetFwd(Argument& act); - /** - * reset the backward primitives, - * can not merge this functions into resetFwd as the grad data - * would be changing before backward. - */ - virtual void resetBwd(Argument& act) {} - virtual Error __must_check forward(Argument& act); - virtual Error __must_check backward(Argument& act); -}; - -/** - * @brief Base class of MKLDNN Eltwise Activation, - * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh. - */ -class MKLDNNEltwiseActivation : public MKLDNNActivation { - typedef mkldnn::eltwise_forward eltwise_fwd; - typedef mkldnn::eltwise_backward eltwise_bwd; - typedef mkldnn::algorithm algorithm; - - protected: - // save the forward primitive desc, which can be used backward - std::shared_ptr fwdPD_; - // eltwise_bwd need src input value - MKLDNNMatrixPtr inVal_; - // use for copy data - std::shared_ptr copyInVal_; - - public: - MKLDNNEltwiseActivation() {} - ~MKLDNNEltwiseActivation() {} - virtual const std::string& getName() const = 0; - - // in common, the alpha of forward and backward should be equal. - // but for relu, to avoid negative value, they should be opposite - virtual float getAlpha() const = 0; - virtual float getBwdAlpha() const = 0; - virtual float getBeta() const { return 0.f; } - virtual algorithm getAlgo(std::string type) const; - void resetFwd(Argument& act) override; - void resetBwd(Argument& act) override; -}; - -/** - * @brief Base class of MKLDNN softmax Activation, - * only have mkldnn forward, use cpu implement for backward. - */ -class MKLDNNSoftmaxActivation : public MKLDNNActivation { - typedef mkldnn::softmax_forward softmax_fwd; - - private: - // for backward - MatrixPtr sftMaxSum_; - MatrixPtr sftMaxDot_; - - public: - MKLDNNSoftmaxActivation() {} - ~MKLDNNSoftmaxActivation() {} - virtual const std::string& getName() const = 0; - void resetFwd(Argument& act) override; - Error __must_check forward(Argument& act) override; - Error __must_check backward(Argument& act) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp deleted file mode 100644 index 580cf821c685b3daf7f015bc137c6d5ea31ef100..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "DataProvider.h" - -#include -#include -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -void BufferBatch::swap(BufferBatch* bufBatch) { - DataBatch* batchData = bufBatch->getDataBatch(); - hl_event_t hlEvent = bufBatch->getCuEvent(); - hl_stream_t hlStream = bufBatch->getCuStream(); - bufBatch->setDataBatch(batchData_); - bufBatch->setCuStream(hlStream_); - bufBatch->setCuEvent(hlEvent_); - - batchData_ = batchData; - hlEvent_ = hlEvent; - hlStream_ = hlStream; -} - -void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) { - if (batchData_ == NULL) { - batchData_ = new DataBatch(); - } - std::vector& destData = batchData_->getStreams(); - int numStreams = srcBatch->getNumStreams(); - destData.resize(numStreams); - batchData_->setSize(srcBatch->getSize()); - if (useGpu) { - createCuEvent(); - } - - for (int i = 0; i < numStreams; i++) { - destData[i].resizeAndCopyFrom(srcBatch->getStream(i), useGpu, hlStream_); - } - if (useGpu) { - hl_stream_record_event(hlStream_, hlEvent_); - } -} - -DoubleBuffer::DoubleBuffer(DataProvider* dataPool, - bool useGpu, - int64_t batchSize) { - batchSize_ = batchSize; - dataPool_ = dataPool; - useGpu_ = useGpu; - dataQueue_ = new BufferBatchQueue(); - bufferQueue_ = new BufferBatchQueue(); - - // insert a empty buffer - bufferQueue_->enqueue(new BufferBatch()); - stopping_ = false; - pending_ = true; -} - -DoubleBuffer::~DoubleBuffer() { - finishAsyncLoad(); - while (dataQueue_->size()) { - BufferBatch* dataBtch = dataQueue_->dequeue(); - delete dataBtch; - dataBtch = NULL; - } - while (bufferQueue_->size()) { - BufferBatch* bufBtch = bufferQueue_->dequeue(); - delete bufBtch; - bufBtch = NULL; - } - delete dataQueue_; - dataQueue_ = NULL; - delete bufferQueue_; - bufferQueue_ = NULL; -} - -void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) { - // get data - BufferBatch* batch = dataQueue_->dequeue(); - batch->syncEvent(); // when use GPU, need synchronized with the cuEvent - *dataBatch = *(batch->getDataBatch()); - - // push anothor buffer - if (*usingBatch_ == nullptr) { - *usingBatch_ = std::make_shared(); - } - - // Mark the using-batch - batch->swap((*usingBatch_).get()); - bufferQueue_->enqueue(batch); - - if (0 == dataBatch->getSize()) { - setPending(true); - } -} - -void DoubleBuffer::insertOneBatch(DataBatch* batch) { - while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out - if (stopping_) return; - } - BufferBatch* bufBatch = bufferQueue_->dequeue(); - // clone and copy the data from an Threadlocal Variable - bufBatch->clone(batch, useGpu_); - dataQueue_->enqueue(bufBatch); -} - -void DoubleBuffer::asyncLoadBatch() { - int64_t actualSize = 0; - if (useGpu_) { - hl_set_device(FLAGS_gpu_id); - } - setPending(false); - - while (true) { - taskReadySem_.wait(); - if (stopping_) break; - - while (batchSize_ == 0 && !stopping_) { - usleep(5); - } - if (stopping_) break; - - do { - DataBatch newBatch; - { - REGISTER_TIMER("getNextBatchInternal"); - actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch); - } - insertOneBatch(&newBatch); - } while (actualSize > 0 && !stopping_); - } -} - -void DoubleBuffer::startAsyncLoad() { - if (asyncLoader_ == nullptr) { - asyncLoader_.reset(new std::thread([this]() { this->asyncLoadBatch(); })); - } - taskReadySem_.post(); -} - -ClassRegistrar - DataProvider::registrar_; - -DataProvider* DataProvider::create(const DataConfig& config, - const ModelConfig& 
modelConfig, - bool useGpu) { - return registrar_.createByType(config.type(), config, modelConfig, useGpu); -} - -REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); -REGISTER_DATA_PROVIDER(dummy, DummyDataProvider); - -int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) { - int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch) - : getNextBatchInternal(size, batch); - - if (!batchSize) return 0; - - if (!config_.constant_slots_size()) return batchSize; - - auto& constantSlots = *constantSlots_; - constantSlots.resize(config_.constant_slots_size()); - - for (int i = 0; i < config_.constant_slots_size(); ++i) { - MemoryHandlePtr handle = - constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr; - Matrix::resizeOrCreate(constantSlots[i], - batchSize, - 1, // = width - false, // = trans - useGpu_); // = useGpu - if (handle != constantSlots[i]->getMemoryHandle()) { - // memory buf was reallocated. We need to initialize the value - constantSlots[i]->assign(config_.constant_slots(i)); - } - batch->appendData(constantSlots[i], - batch->getStream(0).sequenceStartPositions); - } - - return batchSize; -} - -int64_t DataProvider::getNextBatchFromBuffer(int64_t size, DataBatch* batch) { - CHECK(doubleBuffer_ != nullptr); - - if (doubleBuffer_->getBatchSize() != size) { - doubleBuffer_->setBatchSize(size); - } - - doubleBuffer_->removeOneBatch(batch); - return batch->getSize(); -} - -void DataProvider::initAsyncLoader() { - if (doubleBuffer_ == nullptr) { - doubleBuffer_.reset(new DoubleBuffer(this, useGpu_)); - } - useGpu_ = false; // Avoid D2D copy, it will delay the computing performance -} - -SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config, - bool useGpu, - bool withInfo) - : DataProvider(config, useGpu) { - /* initialize the size of a sample, and the buffer */ - sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1); - bufferCapacity_ = config_.buffer_capacity(); - withInfo_ = withInfo; - sampleNumInBuf_ = 0; - nextItemIndex_ = 0; - - /* malloc buffer in cpu */ - hInputDataBuf_ = std::make_shared(bufferCapacity_, sampleDim_); - hInputLabelBuf_ = std::make_shared(bufferCapacity_); - hInputInfoBuf_ = std::make_shared(bufferCapacity_); -} - -void SimpleDataProviderBase::shuffle() { - int i, t; - int len = sampleNumInBuf_; - std::vector temp(sampleDim_); - real* data = hInputDataBuf_->getData(); - int* label = hInputLabelBuf_->getData(); - int* info = hInputInfoBuf_->getData(); - int sampleSz = sizeof(real) * sampleDim_; - for (i = 0; i < len; i++) { - int randNum = rand(); // NOLINT TODO(yuyang18): Use rand_r instead? 
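// A self-contained sketch of the DoubleBuffer idea above: a loader thread keeps
// filling "ready" batches while the trainer consumes them, and consumed slots
// are recycled through a free queue so buffers are allocated once. Types here
// are simplified stand-ins; the real class additionally synchronizes GPU copies
// with hl_event_t / hl_stream_t.
#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <queue>
#include <utility>

template <typename Batch>
class ToyDoubleBuffer {
 public:
  explicit ToyDoubleBuffer(std::size_t slots) {
    for (std::size_t i = 0; i < slots; ++i) free_.push(Batch{});
  }

  // Producer side: grab a free slot, fill it, publish it as ready.
  template <typename Fill>
  void produce(Fill fill) {
    Batch b = pop(free_);
    fill(b);
    push(ready_, std::move(b));
  }

  // Consumer side: take a ready batch, use it, then recycle the slot.
  template <typename Use>
  void consume(Use use) {
    Batch b = pop(ready_);
    use(b);
    push(free_, std::move(b));
  }

 private:
  Batch pop(std::queue<Batch>& q) {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [&] { return !q.empty(); });
    Batch b = std::move(q.front());
    q.pop();
    return b;
  }
  void push(std::queue<Batch>& q, Batch b) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      q.push(std::move(b));
    }
    cv_.notify_all();
  }
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<Batch> ready_, free_;
};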
- t = randNum % (len - i) + i; - // swap - if (i != t) { - // swap data - memcpy(&temp[0], &data[i * sampleDim_], sampleSz); - memcpy(&data[i * sampleDim_], &data[t * sampleDim_], sampleSz); - memcpy(&data[t * sampleDim_], &temp[0], sampleSz); - std::swap(label[i], label[t]); - if (withInfo_) { - std::swap(info[i], info[t]); - } - } - } -} - -int64_t SimpleDataProviderBase::getNextBatchInternal(int64_t size, - DataBatch* batch) { - CHECK(batch != NULL); - batch->clear(); - - int64_t startIndex; - int64_t cpySize; - - std::lock_guard guard(lock_); - if (sampleNumInBuf_ - nextItemIndex_ < size) { - int64_t n = fillBuffer(); - VLOG(1) << "fillBuffer return " << n << " samples.\n"; - } - - startIndex = nextItemIndex_; - cpySize = std::min(size, sampleNumInBuf_ - nextItemIndex_); - nextItemIndex_ += cpySize; - - if (cpySize > 0) { - real* data = hInputDataBuf_->getData() + startIndex * sampleDim_; - int* label = hInputLabelBuf_->getData() + startIndex; - int* info = hInputInfoBuf_->getData() + startIndex; - - MatrixPtr& dataBatch = *dataBatch_; // get the thread local object - IVectorPtr& labelBatch = *labelBatch_; // get the thread local object - IVectorPtr& infoBatch = *infoBatch_; // get the thread local object - if (!dataBatch) { - dataBatch = Matrix::create(cpySize, sampleDim_, false, useGpu_); - labelBatch = IVector::create(cpySize, useGpu_); - if (withInfo_) { - infoBatch = IVector::create(cpySize, 0); - } - } else { - dataBatch->resize(cpySize, sampleDim_); - labelBatch->resize(cpySize); - if (withInfo_) { - infoBatch->resize(cpySize); - } - } - dataBatch->copyFrom(data, cpySize * sampleDim_); - labelBatch->copyFrom(label, cpySize); - batch->appendData(dataBatch); - batch->appendLabel(labelBatch); - if (withInfo_) { - infoBatch->copyFrom(info, cpySize); - batch->appendLabel(infoBatch); - } - } - - batch->setSize(cpySize); - return cpySize; -} - -void SimpleDataProviderBase::reset() { - sampleNumInBuf_ = 0; - nextItemIndex_ = 0; - DataProvider::reset(); -} - -int64_t SimpleDataProviderBase::getSize() { - LOG(FATAL) << "Currently, not implemented"; - return 0; -} - -int64_t SimpleDataProviderBase::fillBuffer() { - int64_t n = sampleNumInBuf_ - nextItemIndex_; - - /* flash the remaining data to the beginning of the buffer */ - if (n > 0) { - hInputDataBuf_->copyFrom( - hInputDataBuf_->getData() + nextItemIndex_ * sampleDim_, - n * sampleDim_); - hInputLabelBuf_->copyFrom(hInputLabelBuf_->getData() + nextItemIndex_, n); - if (withInfo_) { - hInputInfoBuf_->copyFrom(hInputInfoBuf_->getData() + nextItemIndex_, n); - } - } - - sampleNumInBuf_ = - n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_, - hInputLabelBuf_->getData() + n, - hInputInfoBuf_->getData() + n, - bufferCapacity_ - n); - - /* for stachastic gradient training */ - if (!skipShuffle_) { - shuffle(); - } - - nextItemIndex_ = 0; - - return sampleNumInBuf_; -} - -SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu) - : SimpleDataProviderBase(config, useGpu, /* withInfo= */ false), - currentSampleIndex_(0) { - loadData(config_.files()); -} - -SimpleDataProvider::~SimpleDataProvider() {} - -int64_t SimpleDataProvider::fillBufferImp(real* data, - int* label, - int* info, - int64_t size) { - (void)info; - int64_t n = std::min(labels_.size() - currentSampleIndex_, size); - memcpy(data, - &data_[currentSampleIndex_ * sampleDim_], - n * sampleDim_ * sizeof(real)); - memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n); - currentSampleIndex_ += n; - - return n; -} - -void 
SimpleDataProvider::reset() { - currentSampleIndex_ = 0; - SimpleDataProviderBase::reset(); -} - -void SimpleDataProvider::loadData(const std::string& fileName) { - std::ifstream is(fileName); - CHECK(is) << "Fail to open " << fileName; - std::string line; - while (is) { - if (!getline(is, line)) break; - LOG(INFO) << "load data file " << line; - loadDataFile(line); - } - LOG(INFO) << "read done, num of instance=" << labels_.size() - << " data size=" << data_.size(); -} - -void SimpleDataProvider::loadDataFile(const std::string& fileName) { - std::ifstream is(fileName); - std::string line; - std::vector pieces; - while (is) { - if (!getline(is, line)) break; - str::split(line, ' ', &pieces); - CHECK_EQ((uint64_t)(sampleDim_ + 1), pieces.size()) - << " Dimension mismatch, " << pieces.size() - 1 << " in " << fileName - << " " << sampleDim_ << " from config"; - labels_.push_back(atoi(pieces[0].c_str())); - for (int i = 0; i < sampleDim_; ++i) { - data_.push_back(atof(pieces[i + 1].c_str())); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h deleted file mode 100644 index 21822b10c2ebf1d353195794cf8f49e02b64c177..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/DataProvider.h +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "DataConfig.pb.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Argument.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Queue.h" -#include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/Util.h" - -namespace paddle { -/** - * @def REGISTER_DATA_PROVIDER - * @brief Macro for registering a data provider. The class type should contain - * a consturctor with parameter (DataConfig, bool). - */ -#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - DataProvider::registrar_.registerClass( \ - #__type_name, \ - [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ - DataProvider* dp = new __class_name(conf, useGpu); \ - return dp; \ - }); \ - }) - -/** - * @def REGISTER_DATA_PROVIDER_EX - * @brief Macro for registering a data provider, which contains a constructor - * with parameter (DataConfig, ModelConfig, bool). 
- */ -#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([] { \ - DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ - }) - -class DataBatch; -class BufferBatch; -typedef std::shared_ptr DataBatchPtr; -typedef std::shared_ptr BufferBatchPtr; -/** - * @brief Data for batch training a neural network - */ -class DataBatch { - public: - DataBatch() : size_(0) { data_.clear(); } - /** - * @brief Get batch size - * @return batch size - */ - int64_t getSize() const { return size_; } - /** - * @brief Get num of sequences of sequence data - * @return num of sequences - */ - int64_t getNumSequences() const { - if (data_.empty()) return size_; - return data_[0].sequenceStartPositions - ? data_[0].sequenceStartPositions->getSize() - 1 - : size_; - } - /** - * @brief Set batch size - * @param[in] size size - */ - void setSize(int64_t size) { size_ = size; } - /** - * @brief Get size of argument vector - * @return size of argument vector - * @note For usual supervised learning, input data and label is needed, - * then there will be two argument. - */ - int64_t getNumStreams() const { return data_.size(); } - - /** - * @brief Get a argument with index i - * @param[in] i index in argument vector - * @return a argument with index i - */ - const Argument& getStream(int i) const { return data_[i]; } - /** - * @brief Get all argument - * @return an argument vector - */ - std::vector& getStreams() { return data_; } - /** - * @brief Get all argument const - * @return an argument vector - */ - std::vector getStreams() const { return data_; } - /** - * @brief Clear DataBatch - */ - void clear() { - data_.clear(); - size_ = 0; - } - - /** - * @brief Append data to DataBatch - * @param[in] data matrix data - * @note The order in which each data stream is appended must match the order - * specified in stream_names of DataConfig. The stream_names can be obtained - * using DataProvider::getStreamNames(). - */ - void appendData(MatrixPtr data) { - Argument argu; - argu.value = data; - data_.push_back(argu); - } - - /** - * @brief Append sequence data to DataBatch - * @param[in] data matrix data - * @param[in] sequenceStartPositions sequence data - * @note The order in which each data stream is appended must match the order - * specified in stream_names of DataConfig. The stream_names can be obtained - * using DataProvider::getStreamNames(). - */ - void appendData(const MatrixPtr& data, - const ICpuGpuVectorPtr& sequenceStartPositions) { - Argument argu; - argu.value = data; - argu.sequenceStartPositions = sequenceStartPositions; - data_.push_back(argu); - } - /** - * @brief Append label data - * @param[in] label label data - * @param[in] value matrix data, default null - */ - void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) { - Argument argu; - argu.ids = label; - argu.value = value; - data_.push_back(argu); - } - - /* - * @brief Append argument - * @param[in] argus DataBatch.getStreams() - * @param[in] size DataBatch.getSize() - * @param[in] dataId sub dataprovider id (in MultiDataProvider) - */ - void appendArguments(const std::vector& argus, - int size, - int dataId) { - size_ += size; - for (const auto& argu : argus) { - data_.push_back(argu); - data_.back().dataId = dataId; - } - } - - protected: - /** - * @brief batch size - */ - int64_t size_; - /** - * @brief A batch data consist of a Argument vector, - * An argument corresponds to a type of input data. 
- */ - std::vector data_; -}; - -class BufferBatch { - public: - BufferBatch() { - hlStream_ = HPPL_STREAM_DEFAULT; - hlEvent_ = NULL; - batchData_ = NULL; - } - ~BufferBatch() { - if (hlEvent_) { - hl_destroy_event(hlEvent_); - hlEvent_ = NULL; - } - delete batchData_; - batchData_ = NULL; - } - - void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } - DataBatch* getDataBatch() { return batchData_; } - - void setCuStream(hl_stream_t stream) { hlStream_ = stream; } - hl_stream_t getCuStream() const { return hlStream_; } - - void setCuEvent(hl_event_t event) { hlEvent_ = event; } - - hl_event_t getCuEvent() const { return hlEvent_; } - - void createCuEvent() { - if (!hlEvent_) { - hlStream_ = HPPL_STREAM_1; - hl_create_event(&hlEvent_); - } - } - - void syncEvent() { - if (hlEvent_) { - hl_stream_wait_event(hlStream_, hlEvent_); - } - } - - void swap(BufferBatch* bufBatch); - void clone(DataBatch* srcBatch, bool useGpu); - - protected: - DataBatch* batchData_; - hl_stream_t hlStream_; - hl_event_t hlEvent_; -}; - -class DataProvider; -typedef std::shared_ptr DataProviderPtr; - -typedef Queue BufferBatchQueue; - -class DoubleBuffer { - public: - DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); - virtual ~DoubleBuffer(); - void removeOneBatch(DataBatch* dataBatch); - - void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; } - - int64_t getBatchSize() { return batchSize_; } - - void startAsyncLoad(); - void finishAsyncLoad() { - stopping_ = true; - taskReadySem_.post(); - if (asyncLoader_) { - asyncLoader_->join(); - } - } - - void setPending(bool pending) { pending_ = pending; } - - protected: - virtual void asyncLoadBatch(); - void insertOneBatch(DataBatch* batch); - - DataProvider* dataPool_; - bool useGpu_; - int32_t batchSize_; - ThreadLocal usingBatch_; - BufferBatchQueue* dataQueue_; - BufferBatchQueue* bufferQueue_; - std::unique_ptr asyncLoader_; - Semaphore taskReadySem_; - bool stopping_; - bool pending_; -}; - -/** - * @brief Base class for DataProvider, which supplies data for training - * @note It can supplies multiple streams of data. - * For typical supervised training, there are two streams: - * one is for input, one is for label. - */ -class DataProvider { - public: - static ClassRegistrar registrar_; - static DataProvider* create(const DataConfig& config, - const ModelConfig& modelConfig, - bool useGpu = FLAGS_use_gpu); - - /** - * @brief create only used for unittest. 
- */ - inline static DataProvider* create(const DataConfig& config, - bool useGpu = FLAGS_use_gpu) { - return create(config, ModelConfig(), useGpu); - } - - DataProvider(const DataConfig& config, bool useGpu) - : config_(config), - skipShuffle_(false), - usageRatio_(config.usage_ratio()), - useGpu_(useGpu) { - if (config_.async_load_data()) { - initAsyncLoader(); - } - } - virtual ~DataProvider() {} - - const DataConfig& getConfig() const { return config_; } - - void setSkipShuffle() { skipShuffle_ = true; } - - /** - * @brief Get next batch of training samples - * @param[in] size size of training samples to get - * @param[out] batch a batch of training samples - * @return actual size of obtained training samples - */ - int64_t getNextBatch(int64_t size, DataBatch* batch); - - /** - * @brief Shuffle the data set - */ - virtual void shuffle() = 0; - - /** - * @brief reset all the value of index - * @note reset() must be called before any calls to getNextBatch() - * IMPORTANT: subclass reset() should always call the base class reset() - * at the end of the function - */ - virtual void reset() { - if (doubleBuffer_ != nullptr) { - doubleBuffer_->startAsyncLoad(); - } - } - - /** - * @brief Get the size of training samples - * @return the number of training samples in the data set. - * @note return -1 to indicate unlimited number of samples. - */ - virtual int64_t getSize() = 0; - - /** - * @brief Get next batch training samples internally - * @param[in] size size of training samples to get - * @param[out] batch a batch of training samples - * @return actual size of obtained training samples - */ - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; - - protected: - DataConfig config_; - bool skipShuffle_; - float usageRatio_; - bool useGpu_; - std::unique_ptr doubleBuffer_; - ThreadLocal> constantSlots_; - /** - * @@brief Get next batch training samples from buffer - * @param[in] size size of training samples to get - * @param[out] batch a batch of training samples - * @return actual size of obtained training samples - */ - int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch); - - void initAsyncLoader(); -}; - -/** - * A data provider which does nothing. It only serves as providing - * necessary configurations such as stream_names - */ -class DummyDataProvider : public DataProvider { - public: - DummyDataProvider(const DataConfig& config, bool useGpu) - : DataProvider(config, useGpu) {} - virtual void shuffle() {} - virtual void reset() { DataProvider::reset(); } - virtual int64_t getSize() { return 0; } - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) { - (void)size; - (void)batch; - return 0; - } -}; - -/** - * Data provider for one input and one integer label. 
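// A minimal sketch of the consumption contract documented above: reset() must
// be called before the first getNextBatch(), each call returns the number of
// samples actually delivered, a return of 0 marks the end of the pass, and
// getSize() may report -1 when the total number of samples is unlimited. The
// interface below is a simplified stand-in, not the full Paddle class.
#include <cstdint>

struct ToyBatch { int64_t size = 0; };

struct ToyProvider {
  virtual ~ToyProvider() = default;
  virtual void reset() = 0;
  virtual int64_t getSize() = 0;  // -1 means "unlimited"
  virtual int64_t getNextBatch(int64_t want, ToyBatch* out) = 0;
};

inline int64_t runOneEpoch(ToyProvider& provider, int64_t batchSize) {
  provider.reset();  // required before the first batch
  ToyBatch batch;
  int64_t got = 0;
  int64_t seen = 0;
  while ((got = provider.getNextBatch(batchSize, &batch)) > 0) {
    seen += got;     // train on `batch` here
  }
  return seen;
}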
- */ -class SimpleDataProviderBase : public DataProvider { - protected: - /// sample feature dimension - int64_t sampleDim_; - /// the number of samples - int64_t bufferCapacity_; - int64_t sampleNumInBuf_; - /// next item to read in buffer - int64_t nextItemIndex_; - /// some user defined info for validation - bool withInfo_; - - /// data buffer: bufferCapacity_ * nDataDim_ - CpuMatrixPtr hInputDataBuf_; - - /// label buffer:bufferCapacity_ * 1 - CpuIVectorPtr hInputLabelBuf_; - - /// info buffer:bufferCapacity_ * 1 - CpuIVectorPtr hInputInfoBuf_; - - ThreadLocal dataBatch_; - ThreadLocal labelBatch_; - ThreadLocal infoBatch_; - - RWLock lock_; - - public: - SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo); - ~SimpleDataProviderBase() {} - - void shuffle(); - - virtual void reset(); - - virtual int64_t getSize(); - - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); - - /// return the number of samples in the buffer - int64_t fillBuffer(); - - protected: - /** - * @brief Fill at most size samples into data and label. - * - * Each input is stored in contiguous memory locations in data. - * - * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for - * the input of the n-th sample. - * - * label[n] is the label for the n-th sample. - */ - virtual int64_t fillBufferImp(real* data, - int* label, - int* info, - int64_t size) = 0; -}; - -class SimpleDataProvider : public SimpleDataProviderBase { - public: - SimpleDataProvider(const DataConfig& config, bool useGpu); - ~SimpleDataProvider(); - virtual void reset(); - - protected: - void loadData(const std::string& fileName); - void loadDataFile(const std::string& fileName); - virtual int64_t fillBufferImp(real* data, - int* label, - int* info, - int64_t size); - - protected: - size_t currentSampleIndex_; - std::vector labels_; - std::vector data_; -}; - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp deleted file mode 100644 index f71947ef3946284b7ecfb50851100fe43bd78857..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/MultiDataProvider.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "MultiDataProvider.h" -#include -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -using namespace std; - -MultiDataProvider::MultiDataProvider(const DataConfig& config, - const ModelConfig& modelConfig, - bool useGpu) - : DataProvider(config, useGpu) { - bool atLeastOneMainDataFlag = false; - totalDataRatio_ = 0; - LOG(INFO) << "MultiDataProvider: sub data provider size: " - << config.sub_data_configs_size(); - LOG(INFO) << "MultiDataProvider: for_test: " << config.for_test(); - isTestMode_ = config.for_test(); - for (int i = 0; i < config.sub_data_configs_size(); i++) { - LOG(INFO) << "dataRatio of sub(" << i - << ") is: " << config.sub_data_configs(i).data_ratio(); - totalDataRatio_ += config.sub_data_configs(i).data_ratio(); - if (config.sub_data_configs(i).is_main_data()) { - LOG(INFO) << "main data is [" << i << "]"; - atLeastOneMainDataFlag = true; - } - } - CHECK(atLeastOneMainDataFlag) << "all sub dataproviders in MultiData do not" - << " have is_main_data flag"; - LOG(INFO) << "totalDataRatio_=" << totalDataRatio_; - DataConfig subConfig; - int subDataProviderCount = config.sub_data_configs_size(); - if (isTestMode()) { - LOG(INFO) << "construct MultiDataProvider in test mode"; - } else { - LOG(INFO) << "construct MultiDataProvider in train mode"; - } - subDataProviders_.resize(subDataProviderCount); - for (int i = 0; i < subDataProviderCount; i++) { - subConfig = config.sub_data_configs(i); - if (subConfig.async_load_data()) { - LOG(INFO) << "can not use async_load_data in sub dataprovider of " - "MultiDataProvider"; - subConfig.set_async_load_data(false); - } - subDataProviders_[i] = std::unique_ptr( - DataProvider::create(subConfig, modelConfig, useGpu_)); - } -} - -void MultiDataProvider::reset() { - for (auto& elem : subDataProviders_) { - elem->reset(); - } - DataProvider::reset(); -} - -void MultiDataProvider::shuffle() { - for (auto& elem : subDataProviders_) { - elem->shuffle(); - } -} - -int64_t MultiDataProvider::getNextBatchInternal(int64_t size, - DataBatch* batch) { - batch->clear(); - for (size_t i = 0; i < subDataProviders_.size(); ++i) { - // calc size according to data ratio - int64_t subSize = - (int64_t)(1.0 * size * config_.sub_data_configs(i).data_ratio() / - totalDataRatio_); - DataBatch subBatch; - int64_t realSize = - subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch); - if (realSize == 0) { - // current subDataProvider has no data - if (!isTestMode()) { - // in train mode - if (config_.sub_data_configs(i).is_main_data()) { - // is main data provider. 
then return 0 - batch->clear(); - return 0; - } else { - // not main data provider, reset current subDataProvider and try again - subDataProviders_[i]->reset(); - subBatch.clear(); - realSize = - subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch); - CHECK_GT(realSize, 0); - } - } else { - // in test mode, make an empty argument - Argument emptyArgu; - std::vector argus; - argus.push_back(emptyArgu); - batch->appendArguments(argus, 0, -1); - continue; - } - } - batch->appendArguments(subBatch.getStreams(), subBatch.getSize(), i); - } - return batch->getSize(); -} - -REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider); - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp deleted file mode 100644 index dadf1b4cf27f248c7353aaad50dc22d4f6431cca..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/PyDataProvider.cpp +++ /dev/null @@ -1,498 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PyDataProvider.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -#ifndef PADDLE_NO_PYTHON -REGISTER_DATA_PROVIDER(py, PyDataProvider); -#endif - -PyDataProvider::PyDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll) - : DataProvider(config, useGpu), batchSize_(0) { - PyGuard guard; - pyModuleName_ = config_.load_data_module(); - pyClassName_ = config_.load_data_object(); - if (config_.load_data_args() != "") { - pyUserArgs_["load_data_args"] = config_.load_data_args(); - } - - if (loadDataAll) { - std::vector fileList; - if (!config_.files().empty()) { - loadFileList(config_.files(), fileList); - } - loadData(fileList); - } -} - -void PyDataProvider::loadData(const std::vector& fileList) { - VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_; - classInstance_ = - createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_); - CHECK(classInstance_) << "Create class instance failed."; - PyObjectPtr obj(PyObject_CallMethod( - classInstance_.get(), const_cast("getHeader"), NULL)); - CHECK_PY(obj) << "Call function getHeader failed."; - std::string headerInfo = - std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); - parseHeaderData(headerInfo); - feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); -} - -void PyDataProvider::parseHeaderData(const std::string& headerData) { - char* pHeader = const_cast(headerData.c_str()); - char* pHeaderEnd = pHeader + headerData.size(); - slotNum_ = readT(pHeader, pHeaderEnd); - unsigned int useSequenceFlag = readT(pHeader, pHeaderEnd); - isIID_ = useSequenceFlag != 1; - slots_.clear(); - slots_.reserve(slotNum_); - for (size_t i = 0; i < slotNum_; ++i) { - unsigned int slotType = readT(pHeader, pHeaderEnd); - unsigned int slotDim = readT(pHeader, pHeaderEnd); - slots_.emplace_back(); - slots_.back().dim = slotDim; - slots_.back().type = 
static_cast(slotType); - } -} - -void PyDataProvider::resetSlots() { - for (auto& slot : slots_) { - slot.indexData.clear(); - slot.denseData.clear(); - slot.sparseNonValueData.clear(); - slot.sparseFloatValueData.clear(); - slot.indices.clear(); - slot.sequenceStartPositions.clear(); - slot.sampleSequenceIdVec.clear(); - slot.subSequenceStartPositions.clear(); - slot.strData.clear(); - } -} - -void PyDataProvider::fillDenseSlot(ProtoSlot& slot, - char*& data, - const char* dataEnd) { - unsigned int dim = slot.dim; - slot.sampleNum = readT(data, dataEnd); - slot.denseData.resize(slot.sampleNum * dim); -#ifdef PADDLE_TYPE_DOUBLE - CHECK_LE(data + sizeof(real) * dim * slot.sampleNum, dataEnd) - << "std::copy data is out of range"; - // PyDataProvider always provide data in float - float* dat = reinterpret_cast(data); - std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin()); -#else - memcpyWithCheck(slot.denseData.data(), - data, - sizeof(real) * dim * slot.sampleNum, - dataEnd); -#endif - // PyDataProvider always provide data in float - data += sizeof(float) * dim * slot.sampleNum; -} - -void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, - char*& data, - const char* dataEnd) { - slot.sampleNum = readT(data, dataEnd); - unsigned int* indexPtr = (unsigned int*)data; - CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) - << "Vector assign value is out of range"; - slot.indices.assign(indexPtr, indexPtr + slot.sampleNum); - data += sizeof(unsigned int) * slot.sampleNum; - unsigned int length = 0; - length = readT(data, dataEnd); - slot.indices.push_back(length); - slot.sparseNonValueData.resize(length); - memcpyWithCheck(slot.sparseNonValueData.data(), - data, - sizeof(unsigned int) * length, - dataEnd); - data += sizeof(unsigned int) * length; -} - -void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, - char*& data, - const char* dataEnd) { - slot.sampleNum = readT(data, dataEnd); - unsigned int* indexPtr = (unsigned int*)data; - CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) - << "Vector assign value is out of range"; - slot.indices.assign(indexPtr, indexPtr + slot.sampleNum); - data += sizeof(unsigned int) * slot.sampleNum; - unsigned int length = 0; - length = readT(data, dataEnd); - unsigned int* colPtr = reinterpret_cast(data); - CHECK_LE(data + sizeof(unsigned int) * length, dataEnd) - << "Data is out of range"; - data += sizeof(unsigned int) * length; - size_t colLen = readT(data, dataEnd); - CHECK_EQ(colLen, length); - float* valuePtr = reinterpret_cast(data); - CHECK_LE(data + sizeof(real) * length, dataEnd) << "Data is out of range"; - data += sizeof(real) * length; - slot.indices.push_back(length); - slot.sparseFloatValueData.resize(length); - for (unsigned int ii = 0; ii < length; ++ii) { - slot.sparseFloatValueData[ii].col = colPtr[ii]; - slot.sparseFloatValueData[ii].value = valuePtr[ii]; - } -} - -void PyDataProvider::fillIndexSlot(ProtoSlot& slot, - char*& data, - const char* dataEnd) { - slot.sampleNum = readT(data, dataEnd); - CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) - << "Vector assign is out of range"; - slot.indexData.assign(reinterpret_cast(data), - reinterpret_cast(data) + slot.sampleNum); - data += sizeof(unsigned int) * slot.sampleNum; -} - -void PyDataProvider::fillStringSlot(ProtoSlot& slot, - char*& data, - const char* dataEnd) { - slot.sampleNum = readT(data, dataEnd); - for (unsigned int i = 0; i < slot.sampleNum; ++i) { - size_t len = readT(data, dataEnd); - auto str_begin = data; - 
data += len; - CHECK_LE(data, dataEnd) << "Data is out of range"; - slot.strData.emplace_back(str_begin, len); - } -} - -void PyDataProvider::fillSlotsByStr(const std::string& samples) { - char* data = const_cast(samples.c_str()); - char* dataEnd = data + samples.size(); - batchSize_ = readT(data, dataEnd); - if (0 == batchSize_) { - return; - } - - for (size_t j = 0; j < slotNum_; ++j) { - auto& slot = slots_[j]; - CHECK(SlotDef::INDEX >= slot.type || SlotDef::STRING == slot.type) - << " Slot type:" << slot.type << " is out of range."; - CHECK_GE(slot.type, SlotDef::VECTOR_DENSE) << " Slot type:" << slot.type - << " is out of range."; - switch (slot.type) { - case SlotDef::VECTOR_DENSE: - fillDenseSlot(slot, data, dataEnd); - break; - case SlotDef::VECTOR_SPARSE_NON_VALUE: - fillSparseNonValueSlot(slot, data, dataEnd); - break; - case SlotDef::VECTOR_SPARSE_VALUE: - fillSparseValueSlot(slot, data, dataEnd); - break; - case SlotDef::INDEX: - fillIndexSlot(slot, data, dataEnd); - break; - case SlotDef::VAR_MDIM_DENSE: - LOG(FATAL) << "Not implemented"; - break; - case SlotDef::VAR_MDIM_INDEX: - LOG(FATAL) << "Not implemented"; - break; - case SlotDef::STRING: - fillStringSlot(slot, data, dataEnd); - break; - } - } - // read sequenceStartPositions - for (size_t j = 0; j < slotNum_; ++j) { - auto& slot = slots_[j]; - if (!iidData()) { - unsigned int sequenceNum = readT(data, dataEnd); - slot.sequenceNum = sequenceNum; - for (size_t i = 0; i < sequenceNum; ++i) { - slot.sequenceStartPositions.push_back( - readT(data, dataEnd)); - } - for (size_t i = 0; i < sequenceNum; ++i) { - size_t begin = slot.sequenceStartPositions[i]; - size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1] - : slot.sampleNum; - for (size_t ii = begin; ii < end; ++ii) { - slot.sampleSequenceIdVec.push_back(ii); - } - } - } else { - for (size_t i = 0; i < slot.sampleNum; ++i) { - slot.sampleSequenceIdVec.push_back(i); - } - } - } - // read subSequenceStartPositions, not all slots have this infomation. 
- for (size_t j = 0; j < slotNum_; ++j) { - auto& slot = slots_[j]; - if (!iidData() && data != dataEnd) { - unsigned int subSequenceNum = readT(data, dataEnd); - slot.subSequenceNum = subSequenceNum; - for (size_t i = 0; i < subSequenceNum; ++i) { - slot.subSequenceStartPositions.push_back( - readT(data, dataEnd)); - } - } - } -} - -void PyDataProvider::reset() { - { // Invoke PyDataProvider Reset - PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod( - classInstance_.get(), const_cast("reset"), NULL)); - CHECK_PY(obj) << "Call function reset failed."; - } - - if (!skipShuffle_) { - // Invoke PyDataProvider Shuffle - shuffle(); - } - DataProvider::reset(); -} - -void PyDataProvider::shuffle() { - // py shuffle - PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod( - classInstance_.get(), const_cast("shuffle"), NULL)); - CHECK_PY(obj) << "Call function shuffle failed."; -} - -void PyDataProvider::handleDenseSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments) { - unsigned int dim = slot.dim; - Matrix::resizeOrCreate(cpuArguments[slotIndex].value, - slot.sampleNum, - dim, - false, // trans = false - false); // useGpu = false - real* buf = cpuArguments[slotIndex].value->getData(); - for (size_t i = 0; i < slot.sampleNum; ++i) { - memcpyWithCheck(buf + i * dim, - slot.denseData.data() + slot.sampleSequenceIdVec[i] * dim, - sizeof(real) * dim, - slot.denseData.data() + slot.denseData.size()); - } -} - -void PyDataProvider::handleSparseNonValueSlot( - ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { - unsigned int dim = slot.dim; - if (!(cpuArguments[slotIndex].value)) { - cpuArguments[slotIndex].value = - Matrix::createSparseMatrix(slot.sampleNum, - dim, - slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, - NO_VALUE, - SPARSE_CSR, - false, - useGpu_); - } - auto mat = cpuArguments[slotIndex].value; - mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR); - if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - slot.sampleSequenceIdVec.data(), - slot.indices.data(), - slot.sparseNonValueData.data(), - HPPL_STREAM_1); - } else if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - slot.sampleSequenceIdVec.data(), - slot.indices.data(), - slot.sparseNonValueData.data()); - } else { - LOG(FATAL) << "Not Supported"; - } -} - -void PyDataProvider::handleSparseValueSlot( - ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { - unsigned int dim = slot.dim; - if (!(cpuArguments[slotIndex].value)) { - cpuArguments[slotIndex].value = - Matrix::createSparseMatrix(slot.sampleNum, - dim, - slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, - FLOAT_VALUE, - SPARSE_CSR, - false, - useGpu_); - } - auto mat = cpuArguments[slotIndex].value; - mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR); - if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - slot.sampleSequenceIdVec.data(), - slot.indices.data(), - slot.sparseFloatValueData.data(), - HPPL_STREAM_DEFAULT); - } else if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - slot.sampleSequenceIdVec.data(), - slot.indices.data(), - slot.sparseFloatValueData.data()); - } else { - LOG(FATAL) << "Not Supported"; - } -} - -void PyDataProvider::handleIndexSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments) { - IVector::resizeOrCreate(cpuArguments[slotIndex].ids, - slot.sampleNum, - /*useGpu_*/ false); - int* buf = cpuArguments[slotIndex].ids->getData(); - for 
(size_t i = 0; i < slot.sampleNum; ++i) { - buf[i] = slot.indexData[slot.sampleSequenceIdVec[i]]; - } -} - -void PyDataProvider::handleStringSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments) { - if (cpuArguments[slotIndex].strs) { - cpuArguments[slotIndex].strs->resize(slot.sampleNum); - } else { - cpuArguments[slotIndex].strs = - std::make_shared>(slot.sampleNum); - } - for (size_t i = 0; i < slot.sampleNum; ++i) { - (*cpuArguments[slotIndex].strs)[i] = - slot.strData[slot.sampleSequenceIdVec[i]]; - } -} - -int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { - PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("getNextBatch"), - const_cast("i"), - size)); - CHECK_PY(obj) << "Call function getNextBatch failed."; - const std::string& samples = - std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); - resetSlots(); - fillSlotsByStr(samples); - size = batchSize_; - if (size <= 0) return 0; - - DataBatch& cpuBatch = *cpuBatch_; - std::vector& cpuArguments = cpuBatch.getStreams(); - cpuBatch.setSize(size); - cpuArguments.resize(slotNum_); - - if (!iidData()) { - for (size_t j = 0; j < slotNum_; ++j) { - auto& slot = slots_[j]; - ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions, - slot.sequenceNum + 1, - /* useGpu= */ false); - int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false); - std::copy(slot.sequenceStartPositions.begin(), - slot.sequenceStartPositions.end(), - buf); - buf[slot.sequenceStartPositions.size()] = slot.sampleNum; - - if (slot.subSequenceStartPositions.size()) { - ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions, - slot.subSequenceNum + 1, - /* useGpu= */ false); - int* buf = - cpuArguments[j].subSequenceStartPositions->getMutableData(false); - std::copy(slot.subSequenceStartPositions.begin(), - slot.subSequenceStartPositions.end(), - buf); - buf[slot.subSequenceNum] = slot.sampleNum; - // check subSequenceStartPositions and sequenceStartPositions - cpuArguments[j].checkSubset(); - } - } - } - - for (size_t slotIndex = 0; slotIndex < slotNum_; ++slotIndex) { - auto& slot = slots_[slotIndex]; - SlotDef::SlotType slotType = slot.type; - switch (slotType) { - case SlotDef::VECTOR_DENSE: - handleDenseSlot(slot, slotIndex, cpuArguments); - break; - case SlotDef::VECTOR_SPARSE_NON_VALUE: - handleSparseNonValueSlot(slot, slotIndex, cpuArguments); - break; - case SlotDef::VECTOR_SPARSE_VALUE: - handleSparseValueSlot(slot, slotIndex, cpuArguments); - break; - case SlotDef::INDEX: - handleIndexSlot(slot, slotIndex, cpuArguments); - break; - case SlotDef::VAR_MDIM_DENSE: - LOG(FATAL) << "Not implemented"; - break; - case SlotDef::VAR_MDIM_INDEX: - LOG(FATAL) << "Not implemented"; - break; - case SlotDef::STRING: - handleStringSlot(slot, slotIndex, cpuArguments); - break; - } - } - - if (useGpu_) { - std::vector& cpuArguments = cpuBatch.getStreams(); - DataBatch& gpuBatch = *gpuBatch_; - std::vector& gpuArguments = gpuBatch.getStreams(); - gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(size); - for (size_t i = 0; i < slotNum_; ++i) { - SlotDef::SlotType slotType = slots_[i].type; - if (SlotDef::VECTOR_SPARSE_VALUE == slotType || - SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) { - gpuArguments[i] = cpuArguments[i]; - gpuArguments[i].sequenceStartPositions = - cpuArguments[i].sequenceStartPositions; - - if (slots_[i].subSequenceStartPositions.size()) { - gpuArguments[i].subSequenceStartPositions = - 
cpuArguments[i].subSequenceStartPositions; - } - } else { - gpuArguments[i].resizeAndCopyFrom( - cpuArguments[i], useGpu_, HPPL_STREAM_1); - } - } - hl_stream_synchronize(HPPL_STREAM_1); - *batch = gpuBatch; - } else { - *batch = cpuBatch; - } - - return batch->getSize(); -} - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h deleted file mode 100644 index da50dd4e2ebb743ef45af319bc713ed7ac3d3e10..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/PyDataProvider.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "DataFormat.pb.h" -#include "DataProvider.h" - -#include - -namespace paddle { - -class PyDataProvider : public DataProvider { - public: - PyDataProvider(const DataConfig& config, - bool useGpu, - bool loadDataAll = true); - - virtual void reset(); - - // Note this size includes the sequences which are skipped because they - // are longer than the batch size - virtual int64_t getSize() { - LOG(FATAL) << "Not implement yet"; - return -1; - } - virtual void shuffle(); - - virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); - - protected: - struct ProtoSlot; - // return false if each each sample is one sequence, i.e., independent - // of other samples. 
- inline bool iidData() const { return isIID_; } - - void parseHeaderData(const std::string& headerData); - void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd); - void fillSparseNonValueSlot(ProtoSlot& slot, - char*& data, - const char* dataEnd); - void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd); - void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd); - void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd); - void fillSlotsByStr(const std::string& samples); - void handleDenseSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments); - void handleSparseNonValueSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments); - void handleSparseValueSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments); - void handleIndexSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments); - void handleStringSlot(ProtoSlot& slot, - size_t slotIndex, - std::vector& cpuArguments); - void resetSlots(); - void loadData(const std::vector& fileList); - - protected: - struct ProtoSlot { - SlotDef::SlotType type; - int dim; - unsigned int sampleNum; - unsigned int sequenceNum; - unsigned int subSequenceNum; - // Store the data of index type slot - std::vector indexData; - // Store the data of dense type slot - std::vector denseData; - // Store the data of sparseNonValue type slot - std::vector sparseNonValueData; - // Store the data of sparseValue type slot - std::vector sparseFloatValueData; - // Used to store the index of each sample in slot values - std::vector indices; - // The starting position of each sequence in samples - // The last element should be the number of samples - // If empty, each sample is one sequence. - std::vector sequenceStartPositions; - // The index id of sequences in slot - std::vector sampleSequenceIdVec; - // The starting position of each subsequence in samples - // The last element should be the number of subsequence - // If empty, each sequence of sample has no subsequence. - std::vector subSequenceStartPositions; - // Store the data of string type slot - std::vector strData; - }; - std::vector slots_; - - PyObjectPtr classInstance_; - unsigned int batchSize_; - unsigned int slotNum_; - // if use sequence, isIID_ equals false, otherwise it is true. - bool isIID_; - // The name of python module name - std::string pyModuleName_; - // The name of python class name - std::string pyClassName_; - // User args set in config - std::map pyUserArgs_; - - ThreadLocalD cpuBatch_; - ThreadLocalD gpuBatch_; -}; - -} // namespace paddle diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp deleted file mode 100644 index 54ee091e8f257f76b113d4ca6f8a7c3989c0c1df..0000000000000000000000000000000000000000 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ /dev/null @@ -1,1031 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
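Aside: the ProtoSlot comments above fix a convention in which sequenceStartPositions stores the first sample index of each sequence, with the total sample count acting as the implicit final offset. A tiny standalone sketch (invented data, not the deleted code) of how such offsets expand into per-sample sequence membership:

#include <cstdio>
#include <vector>

// Illustration of the sequenceStartPositions convention described above.
int main() {
  std::vector<int> starts = {0, 3, 5};  // first sample index of each sequence
  int totalSamples = 7;                 // implicit final offset
  std::vector<int> seqIdOfSample(totalSamples);
  for (size_t s = 0; s < starts.size(); ++s) {
    int begin = starts[s];
    int end = (s + 1 < starts.size()) ? starts[s + 1] : totalSamples;
    for (int i = begin; i < end; ++i) seqIdOfSample[i] = (int)s;
  }
  for (int i = 0; i < totalSamples; ++i)
    std::printf("sample %d -> sequence %d\n", i, seqIdOfSample[i]);
  return 0;
}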
*/ - -#ifndef PADDLE_NO_PYTHON - -#include -#include -#include -#include -#include -#include -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#include - -#include "DataProvider.h" - -#include "paddle/utils/Locks.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -namespace unittest { - -static std::unique_ptr> - OnPoolFilled; - -namespace pydp2 { - -void setOnPoolFilledHook(const std::function& callback) { - OnPoolFilled.reset(new std::function()); - *OnPoolFilled = callback; -} - -void clearOnPoolFilledHook() { OnPoolFilled.reset(); } - -} // namespace pydp2 -} // namespace unittest - -/** - * Slot type - */ -enum SlotType { - ST_DENSE = 0, - ST_NON_SPARSE_VALUE = 1, - ST_SPARSE_VALUE = 2, - ST_INDEX = 3 -}; - -/** - * Sequence type - */ -enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ }; - -/** - * Cache Type. - */ -enum CacheType { - NO_CACHE = 0, // Each pass will load data from PyDataProvider2. - CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2, - // then cache all data in memory. Load data from - // memory in rest passes. -}; - -struct SlotHeader { // Slot Header will parse from python object's slots field. - size_t dim; - SlotType slotType; - SeqType seqType; -}; - -inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) { - os << "Dim = " << header.dim << " Type = " << header.slotType - << " SeqType = " << header.seqType; - return os; -} - -/** - * FieldScanner Interface. - * - * It will read python object, and fill to argument's each slot. - * There are two steps, prepare and fill. Scanner will alloc memory during - * prepare step, fill data into argument during fill step. - */ -class IFieldScanner { - public: - DISABLE_COPY(IFieldScanner); - /** - * Ctor. - * @param headerPtr slot header that scanner belong to. - */ - explicit IFieldScanner(SlotHeader* headerPtr) : headerPtr_(headerPtr) {} - virtual ~IFieldScanner() {} - - /** - * Start prepare step. - */ - virtual void startPrepare(Argument& argument) {} - - /** - * Prepare step. - * - * @note the obj could be a timestep of sample or whole sample. It depends - * what scanner it is. - */ - virtual void prepare(Argument& argument, PyObject* obj) {} - - /** - * Finish Prepare step. - */ - virtual void finishPrepare(Argument& argument) {} - - /** - * Start fill step. - */ - virtual void startFill(Argument& argument) {} - - /** - * Fill step. - * - * @note the obj could be a timestep of sample or whole sample. It depends - * what scanner it is. - */ - virtual void fill(Argument& argument, PyObject* obj) {} - - /** - * Finish fill step. - */ - virtual void finishFill(Argument& argument) {} - - /** - * Factory method. Create a scanner by header. The final scanner may be - * combine many scanners. - * - * @note Fatal if header is not support. - */ - static IFieldScanner* create(SlotHeader* header); - - protected: - SlotHeader* headerPtr_; -}; - -/** - * Py Data Provider Cache Interface. - */ -class IPyDataProviderCache { - public: - virtual ~IPyDataProviderCache() {} - - /** - * invoke when DataProvider::reset() - * @return true if read data from python. - */ - virtual bool reset() = 0; - - /** - * invoke when these data are used by DataProvider, and need to clear. - * @param [inout] data used data. - * - * @note The implemented class must clear these data array. Or if you want to - * delete the PyObjectPtr later, you should make sure the paddle process only - * have one active thread calling python code (use PyGuard otherwise). 
- */ - virtual void drop(std::deque* data) = 0; - - /** - * Return whole data in cache. - */ - virtual std::deque* load() = 0; - - /** - * Factory method. Convert CacheType to IPyDataProviderCache* - */ - static IPyDataProviderCache* create(CacheType ct); -}; - -/** - * PyDataProvider2. - * - * For usage, please refer python module 'paddle.trainer.PyDataProvider2' - * - * Here, we start a thread to read data. It is totally asynchronous for reading - * data. And it support cache strategies. - */ -class PyDataProvider2 : public DataProvider { - public: - /** - * Ctor - */ - PyDataProvider2(const DataConfig& config, - const ModelConfig& modelConfig, - bool useGpu) - : DataProvider(config, useGpu), callingContextCreated_(2) { - if (PyArray_API == NULL) import_array(); - auto& args = config.load_data_args(); - PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); - if (!args.empty()) { - kwargs = callPythonFuncRetPyObj( - "paddle.trainer.PyDataProvider2", "deserialize_args", {args}); - } - - py::DictHelper kwargsDict(kwargs); - kwargsDict.setBool("is_train", !config.for_test()); - std::vector inputs; - inputs.reserve(modelConfig.input_layer_names().size()); - std::copy(modelConfig.input_layer_names().begin(), - modelConfig.input_layer_names().end(), - std::back_inserter(inputs)); - kwargsDict.setStringList("input_order", inputs); - - // kwargs is keyword arguemts to create object. - this->createPyDataObj(config.load_data_module(), - config.load_data_object(), - config.files(), - std::move(kwargs)); - DBG << "Instance " << instance_.get() << " loaded."; - this->readPyFields(config.for_test()); - DBG << "Py Field Done"; - } - - /** - * Dtor - * @note will stop loading thread when destructing - */ - virtual ~PyDataProvider2() { resetImpl(false); } - - private: - void createPyDataObj(const std::string& model, - const std::string& className, - const std::string& fileListName, - PyObjectPtr&& kwargs // NOLINT - ) { - LOG(INFO) << "loading dataprovider " << model << "::" << className; - - PyObjectPtr module = py::import(model); - PyObjectPtr moduleDict(PyModule_GetDict(module.get())); - CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; - PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str())); - CHECK_PY(cls) << "load class " << className.c_str() << "error"; - - // If there are multiple python instance share same module, the PyObjectPtr - // only for instance will make python reference-count error. - // - // So here, we increase reference count manually. - Py_XINCREF(module.get()); - Py_XINCREF(moduleDict.get()); - Py_XINCREF(cls.get()); - - PyObjectPtr fileListInPy = loadPyFileLists(fileListName); - PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get()); - { - PyGuard guard; - instance_.reset(PyObject_Call(cls.get(), zeroTuple_.get(), kwargs.get())); - } - CHECK_PY(instance_) << "Cannot Create instance"; - } - - void readPyFields(bool testing) { - py::ObjectHelper self(this->instance_); - bool ok; - - this->skipShuffle_ = - !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/); - if (!ok) { - this->skipShuffle_ = testing; // shuffle when is training, skip shuffle - // when is testing. 
- } - DBG << "Provider Skip Shuffle " << this->skipShuffle_; - - this->poolSize_ = self.getIntAttr("pool_size", &ok); - if (!ok) { - this->poolSize_ = -1UL; - } - this->minPoolSize_ = self.getIntAttr("min_pool_size", &ok); - if (!ok) { - this->minPoolSize_ = -1UL; - } - this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_); - - this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size"); - - calcBatchSize_.reset(self.getAttr("calc_batch_size")); - if (this->calcBatchSize_ && !py::isCallable(this->calcBatchSize_)) { - this->calcBatchSize_.reset(); - } - - generator_.reset(self.getAttr("generator")); - CHECK(py::isCallable(generator_)); - - // Reading slots. - PyObjectPtr slotsPtr(self.getAttr("slots")); - py::SequenceHelper slots(slotsPtr); - headers_.reserve(slots.size()); - for (size_t i = 0; i < slots.size(); ++i) { - headers_.emplace_back(); - auto& header = headers_.back(); - PyObject* hdPtr = slots[i]; - CHECK(hdPtr != nullptr); - Py_XINCREF(hdPtr); - PyObjectPtr headerPtrWrap(hdPtr); - py::ObjectHelper hd(headerPtrWrap); - header.dim = hd.getIntAttrWithError("dim"); - header.seqType = (SeqType)hd.getIntAttrWithError("seq_type"); - header.slotType = (SlotType)hd.getIntAttrWithError("type"); - } - - DBG << "Data header size " << headers_.size(); - for (auto& header : headers_) { - DBG << header; - } - cache_.reset(IPyDataProviderCache::create( - (CacheType)self.getIntAttrWithError("cache"))); - } - - PyObjectPtr loadPyFileLists(const std::string& fileListName) { - loadFileList(fileListName, fileLists_); - PyObject* lst = PyList_New(fileLists_.size()); - for (size_t i = 0; i < fileLists_.size(); ++i) { - PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str())); - } - return PyObjectPtr(lst); - } - - void loadThread() { - DBG << "Creating context"; - for (auto& filename : fileLists_) { - PyGuard g; - py::CallableHelper generator(this->generator_); - generator.setArgsSize(2); - generator.getArgs().set(0, instance_); - generator.getArgs().set(1, PyString_FromString(filename.c_str()), true); - callingContexts_.emplace_back(generator()); - CHECK_PY(callingContexts_.back()) << "Generator error."; - CHECK(PyIter_Check(callingContexts_.back())); - } - DBG << "Create context done"; - callingContextCreated_.wait(); - - PositionRandom p(skipShuffle_); - - while (!exit_ && !callingContexts_.empty()) { - PyObject* data = nullptr; - - { // Read data. 
- size_t cid = p(callingContexts_.size()); - bool atEnd; - data = py::iterNext(callingContexts_[cid], &atEnd); - if (atEnd || data == nullptr) { - if (cid != 0) { - std::swap(callingContexts_[cid], callingContexts_[0]); - cid = 0; - } - - PyObjectPtr front; - { - std::unique_lock l(mtx_); - front = pop_get_front(callingContexts_); - } - { - PyGuard g; - front.reset(); - } - this->pullCV_.notify_all(); - continue; - } - } - - size_t additionalBatchSize = 1; - if (calcBatchSize_) { - PyGuard guard; - py::CallableHelper calcBatchSize(this->calcBatchSize_); - calcBatchSize.setArgsSize(1); - calcBatchSize.getArgs().set(0, data); - PyObjectPtr bs(calcBatchSize()); - CHECK_PY(bs); - bool ok; - additionalBatchSize = py::castInt(bs.get(), &ok); - CHECK(ok) << "CalcBatchSize must return int or long"; - } - - if (this->loadThread_) { // wait poolActualSize < poolSize; - std::unique_lock l(mtx_); - pushCV_.wait(l, [this] { return this->poolActualSize_ < poolSize_; }); - } - - { - std::lock_guard guard(mtx_); - poolActualSize_ += additionalBatchSize; - dataPool_.emplace_back(data); - } - pullCV_.notify_all(); - } - DBG << "load thread end"; - } - - inline void resetImpl(bool startNewThread) { - DBG << "Reseting " << startNewThread; - exit_.store(true); - if (loadThread_) { // is loading. - loadThread_->join(); - loadThread_.reset(); - } - { - PyGuard g; - callingContexts_.clear(); - this->pullCV_.notify_one(); - } - - std::lock_guard guard(mutexForReset_); - { - PyGuard g; - dataPool_.clear(); - } - poolActualSize_ = 0; - - if (startNewThread && cache_->reset()) { - DBG << "Start new thread."; - loadThread_.reset(new std::thread([this] { - exit_ = false; - loadThread(); - })); - callingContextCreated_.wait(); - } - DBG << "Reset done"; - exit_ = false; - } - - private: - std::unique_ptr loadThread_; - std::atomic exit_; - std::deque callingContexts_; - std::deque dataPool_; - size_t poolActualSize_; - std::condition_variable pushCV_; - std::condition_variable pullCV_; - std::mutex mtx_; - - std::mutex mutexForReset_; - - ThreadBarrier callingContextCreated_; - std::unique_ptr cache_; - - PyObjectPtr instance_; - size_t poolSize_; - size_t minPoolSize_; - bool canOverBatchSize_; - PyObjectPtr calcBatchSize_; - PyObjectPtr generator_; - std::vector fileLists_; - std::vector headers_; - static PyObjectPtr zeroTuple_; - - class PositionRandom { - public: - inline explicit PositionRandom(bool skipRand) - : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {} - - inline size_t operator()(size_t len) { - if (!skipRand_) { - if (!dist_ || dist_->b() != len - 1) { - dist_.reset(new std::uniform_int_distribution(0, len - 1)); - } - return (*dist_)(eng_); - } else { - return 0; - } - } - - private: - std::default_random_engine& eng_; - std::unique_ptr> dist_; - bool skipRand_; - }; - - // DataProvider interface - public: - /** - * Resetting the PyDataProvider. May start reading thread here. - */ - virtual void reset() { - resetImpl(true); - DataProvider::reset(); - } - - /** - * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random - * select data from datapool. - */ - void shuffle() {} - - /** - * Not limited size. - */ - int64_t getSize() { return -1; } - - /** - * Loading a batch of data. - */ - int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) { - std::lock_guard guard(mutexForReset_); - REGISTER_TIMER("PyDP2.getNextBatchInternal") - CHECK_GE(size_, 0); - size_t size = (size_t)size_; - if (loadThread_) { // loading from thread should wait for data pool ready. 
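Aside: the loadThread()/getNextBatchInternal() pair around this point coordinates through a bounded data pool guarded by two condition variables, one (pushCV_) throttling the producer when the pool is full and one (pullCV_) waking consumers when data arrives. A much simplified, generic sketch of that pattern follows; it is not the deleted implementation and omits shutdown and batch-size accounting.

#include <condition_variable>
#include <deque>
#include <mutex>

// Simplified bounded pool in the spirit of the pushCV_/pullCV_ pattern above.
template <typename T>
class BoundedPool {
 public:
  explicit BoundedPool(size_t capacity) : capacity_(capacity) {}

  void push(T item) {  // producer side
    std::unique_lock<std::mutex> l(mtx_);
    pushCv_.wait(l, [this] { return pool_.size() < capacity_; });
    pool_.push_back(std::move(item));
    pullCv_.notify_all();  // wake waiting consumers
  }

  T pop() {  // consumer side
    std::unique_lock<std::mutex> l(mtx_);
    pullCv_.wait(l, [this] { return !pool_.empty(); });
    T item = std::move(pool_.front());
    pool_.pop_front();
    pushCv_.notify_all();  // let the producer refill
    return item;
  }

 private:
  size_t capacity_;
  std::deque<T> pool_;
  std::mutex mtx_;
  std::condition_variable pushCv_, pullCv_;
};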
- // but, loading from cache, cache object should ensure - // data pool ready. - std::unique_lock l(mtx_); - pullCV_.wait(l, [this, &size] { - return this->poolActualSize_ >= std::max(size, this->minPoolSize_) || - callingContexts_.empty(); - }); - - if (unittest::OnPoolFilled) { - (*unittest::OnPoolFilled)(this->poolActualSize_); - } - } - std::deque data; - size_t bsize = 0; - std::deque* poolPtr = nullptr; - - if (this->loadThread_) { // loading from thread. - poolPtr = &this->dataPool_; - } else { // loading from cache. - poolPtr = this->cache_->load(); - } - if (exit_) { - // PyDataProvider is destructing. - return 0; - } - CHECK(poolPtr != nullptr); - - std::deque& pool = *poolPtr; - - while (bsize < size && !pool.empty()) { - { - // move data from pool to data - std::lock_guard guard(mtx_); - if (skipShuffle_) { - size_t i = 0; - CHECK(pool[i] != nullptr); - data.emplace_back(std::move(pool[i])); - pool.pop_front(); - } else { // when shuffle, use swap to drop only last pool element. - size_t i = ThreadLocalRand::rand() % pool.size(); - CHECK(pool[i] != nullptr); - if (i != 0) { - std::swap(pool[i], pool.front()); - } - data.emplace_back(std::move(pool.front())); - pool.pop_front(); - } - - if (calcBatchSize_) { // custom calc batch size. - PyGuard guard; - Py_INCREF(data.back().get()); - py::CallableHelper calcBatchSize(calcBatchSize_); - calcBatchSize.setArgsSize(1); - calcBatchSize.getArgs().set(0, data.back()); - PyObjectPtr customBatchSize(calcBatchSize()); - bool ok; - size_t tmp = py::castInt(customBatchSize.get(), &ok); - CHECK(ok) << "calc_batch_size must return int"; - - if (bsize + tmp > size && !canOverBatchSize_) { - // Put data back. - pool.push_front(std::move(data.back())); - data.pop_back(); - break; - } else { - bsize += tmp; - } - } else { - bsize += 1; - } - } - } - - if (this->loadThread_) { - { - std::lock_guard g(mtx_); - poolActualSize_ -= bsize; - } - this->pushCV_.notify_all(); - } - - if (bsize == 0) { // end of pass. In data pool, cannot get any data. 
- return 0; - } - - DataBatch cpuBatch; - cpuBatch.setSize(bsize); - auto& inArgs = cpuBatch.getStreams(); - inArgs.resize(headers_.size()); - std::vector> scanners; - scanners.reserve(headers_.size()); - for (auto& header : headers_) { - scanners.emplace_back(IFieldScanner::create(&header)); - } - DBG << "Scanner created."; - for (size_t i = 0; i < headers_.size(); ++i) { - scanners[i]->startPrepare(inArgs[i]); - } - for (auto& d : data) { - py::SequenceHelper s(d); - for (size_t i = 0; i < headers_.size(); ++i) { - scanners[i]->prepare(inArgs[i], s[i]); - } - } - for (size_t i = 0; i < headers_.size(); ++i) { - scanners[i]->finishPrepare(inArgs[i]); - } - for (size_t i = 0; i < headers_.size(); ++i) { - scanners[i]->startFill(inArgs[i]); - } - for (auto& d : data) { - py::SequenceHelper s(d); - for (size_t i = 0; i < headers_.size(); ++i) { - scanners[i]->fill(inArgs[i], s[i]); - } - } - - for (size_t i = 0; i < headers_.size(); ++i) { - scanners[i]->finishFill(inArgs[i]); - } - - { - PyGuard g; - cache_->drop(&data); - } - - DBG << "Reading CPU Batch Done."; - - if (useGpu_) { - std::vector& cpuArguments = cpuBatch.getStreams(); - DataBatch& gpuBatch = *batch; - std::vector& gpuArguments = gpuBatch.getStreams(); - gpuArguments.resize(cpuArguments.size()); - gpuBatch.setSize(bsize); - for (size_t i = 0; i < headers_.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom( - cpuArguments[i], useGpu_, HPPL_STREAM_1); - } - hl_stream_synchronize(HPPL_STREAM_1); - } else { - *batch = cpuBatch; - } - return bsize; - } -}; - -PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); - -REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); - -/** - * Scanner for dense slot. - */ -class DenseScanner : public IFieldScanner { - public: - explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {} - - /** - * Prepare. - * @param argument target argument - * @param obj each timestep of a sample. - */ - virtual void prepare(Argument& argument, PyObject* obj) { ++height_; } - - virtual void finishPrepare(Argument& argument) { - Matrix::resizeOrCreate( - argument.value, height_, headerPtr_->dim, false, false); - height_ = 0; - } - - /** - * Fill argument from obj. - * @param argument - * @param obj - */ - virtual void fill(Argument& argument, PyObject* obj) { - real* dat = argument.value->getData() + height_ * headerPtr_->dim; - if (PyArray_Check(obj)) { - auto dtype = PyArray_DTYPE((PyArrayObject*)obj); - if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { - real* data = (real*)PyArray_DATA((PyArrayObject*)obj); - auto sz = PyArray_SIZE((PyArrayObject*)obj); - std::copy(data, data + sz, dat); - } else { - LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array"; - } - } else { - py::SequenceHelper s(obj); - // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. - for (size_t i = 0; i < headerPtr_->dim; ++i) { - dat[i] = (real)s.getDouble(i); - } - } - ++height_; - } - - private: - size_t height_; -}; - -/** - * Scanner for index slot - */ -class IndexScanner : public IFieldScanner { - public: - explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {} - - /** - * Prepare memory space. - * - * @note obj is a single timestep of sample - */ - virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; } - - virtual void finishPrepare(Argument& argument) { - IVector::resizeOrCreate(argument.ids, cnt_, false); - cnt_ = 0; - } - - /** - * Fill one index to argument. 
- */ - virtual void fill(Argument& argument, PyObject* obj) { - bool ok; - argument.ids->getData()[cnt_++] = py::castInt(obj, &ok); - CHECK(ok) << "Cannot cast int " << py::repr(obj); - } - - private: - size_t cnt_; -}; - -class SparseNonValueScanner : public IFieldScanner { - public: - explicit SparseNonValueScanner(SlotHeader* ptr) - : IFieldScanner(ptr), nnz_(0), height_(0) {} - - /** - * Prepare memory space - * @note obj is a timestep of one sample. - */ - virtual void prepare(Argument& argument, PyObject* obj) { - ++height_; - nnz_ += py::SequenceHelper(obj).size(); - } - - virtual void finishPrepare(Argument& argument) { - Matrix::resizeOrCreateSparseMatrix( - argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE); - } - - virtual void startFill(Argument& argument) { - auto smat = (CpuSparseMatrix*)(argument.value.get()); - smat->getRows()[0] = 0; - nnz_ = 0; - height_ = 1; - } - - /** - * Fill one sparse vector to argument. - * @note obj is a timestep of one sample. - */ - virtual void fill(Argument& argument, PyObject* obj) { - py::SequenceHelper s(obj); - auto sz = s.size(); - auto smat = (CpuSparseMatrix*)(argument.value.get()); - int* row = smat->getRows(); - int* col = smat->getCols(); - real* dat = smat->getData(); - row[height_] = row[height_ - 1] + (int)sz; - - for (decltype(sz) i = 0; i < sz; ++i) { - setData(col + nnz_, dat + nnz_, s[i]); - ++nnz_; - } - ++height_; - } - - protected: - /** - * Set a single sparse index and value. - * @param [out] col sparse index - * @param [out] dat sparse value - * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong. - * For sparse_value is a Tuple (int, float). - */ - virtual void setData(int* col, real* dat, PyObject* obj) { - bool ok; - *col = py::castInt(obj, &ok); - CHECK(ok); - } - - size_t nnz_; - size_t height_; -}; - -class SparseValueScanner : public SparseNonValueScanner { - public: - explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {} - - virtual void finishPrepare(Argument& argument) { - Matrix::resizeOrCreateSparseMatrix( - argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE); - } - - protected: - virtual void setData(int* col, real* dat, PyObject* obj) { - py::SequenceHelper s(obj); - SparseNonValueScanner::setData(col, dat, s[0]); - *dat = (real)s.getDouble(1); - } -}; - -/** - * Sequence Scanner. Scanner for sequence or sub-sequence. - */ -class SequenceScanner : public IFieldScanner { - public: - /** - * Ctor - * @param innerScanner inner scanner for each timestep or sub-sequence. - * @param getSeqStartPos A callback, (Argument) => ICpuGpuVectorPtr. - * return a sequence start position or a sub-sequence - * start position. - */ - SequenceScanner( - std::unique_ptr&& innerScanner, - const std::function& getSeqStartPos) - : IFieldScanner(nullptr), - inner_(std::move(innerScanner)), - cnt_(0), - getSeqStartPos_(getSeqStartPos) {} - - /** - * Start prepare. Invoke inner->startPrepare too. - */ - virtual void startPrepare(Argument& argument) { - inner_->startPrepare(argument); - } - - /** - * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each - * element of sequence obj. - */ - virtual void prepare(Argument& argument, PyObject* obj) { - py::SequenceHelper s(obj); - ++cnt_; - for (size_t i = 0; i < s.size(); ++i) { - inner_->prepare(argument, s[i]); - } - } - - /** - * Finish prepare. invoke inner_->finishPrepare too. 
- */ - virtual void finishPrepare(Argument& argument) { - ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false); - inner_->finishPrepare(argument); - } - - /** - * Start fill. invoke inner->startFill too. - */ - virtual void startFill(Argument& argument) { - getSeqStartPos_(argument)->getMutableData(false)[0] = 0; - cnt_ = 1; - inner_->startFill(argument); - } - - /** - * Fill. Obj is a tuple or list. invoke inner->fill for each element of - * sequence obj. And set seqStartPos at same time. The seqStartPos will be - * calculated by getSeqStartPos callback passed in ctor. - */ - virtual void fill(Argument& argument, PyObject* obj) { - getSeqStartPos_(argument)->getMutableData(false)[cnt_] = - getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] + - (int)getSize(obj); - py::SequenceHelper s(obj); - ++cnt_; - for (size_t i = 0; i < s.size(); ++i) { - inner_->fill(argument, s[i]); - } - } - - /** - * Finish fill. will invoke inner->finishFill too. - */ - virtual void finishFill(Argument& argument) { inner_->finishFill(argument); } - - protected: - size_t getSize(PyObject* obj) { - py::SequenceHelper s(obj); - auto sc = dynamic_cast(inner_.get()); - if (sc) { - size_t sum = 0; - for (size_t i = 0; i < s.size(); ++i) { - sum += sc->getSize(s[i]); - } - return sum; - } else { - return s.size(); - } - } - - private: - std::unique_ptr inner_; - size_t cnt_; - std::function getSeqStartPos_; -}; - -IFieldScanner* IFieldScanner::create(SlotHeader* header) { - IFieldScanner* retv = nullptr; - switch (header->slotType) { - case ST_DENSE: - retv = new DenseScanner(header); - break; - case ST_INDEX: - retv = new IndexScanner(header); - break; - case ST_NON_SPARSE_VALUE: - retv = new SparseNonValueScanner(header); - break; - case ST_SPARSE_VALUE: - retv = new SparseValueScanner(header); - break; - default: - LOG(FATAL) << "Not implemented " << header->slotType; - } - - switch (header->seqType) { - case SQT_NONE: - break; - case SQT_SUBSEQ: - retv = new SequenceScanner(std::unique_ptr(retv), - [](Argument& arg) -> ICpuGpuVectorPtr& { - return arg.subSequenceStartPositions; - }); - // fall through, not break; - case SQT_SEQ: - retv = new SequenceScanner(std::unique_ptr(retv), - [](Argument& arg) -> ICpuGpuVectorPtr& { - return arg.sequenceStartPositions; - }); - break; - default: - LOG(FATAL) << "Not implemented"; - } - - return retv; -} - -/** - * No Cache Strategy. Will destruct old data immediately and load data from - * python every pass. - */ -class NoCacheStrategy : public IPyDataProviderCache { - public: - virtual bool reset() { return true; } - - virtual void drop(std::deque* data) { data->clear(); } - - virtual std::deque* load() { return nullptr; } -}; - -/** - * Cache One Pass In Memory strategy. - * - * In first pass, will load data from python and store them in memory. - * The rest passes, will load data from memory. 
- */ -class CacheOnePassInMemory : public IPyDataProviderCache { - public: - CacheOnePassInMemory() - : objPool_(new std::deque()), - droppedPool_(new std::deque()) {} - - virtual bool reset() { - if (objPool_->empty() && droppedPool_->empty()) { - return true; - } else if (objPool_->empty()) { - std::swap(objPool_, droppedPool_); - return false; - } else { - LOG(FATAL) << "Unexpected branch"; - } - } - - virtual void drop(std::deque* data) { - size_t orgSize = droppedPool_->size(); - droppedPool_->resize(orgSize + data->size()); - for (size_t i = 0; i < data->size(); ++i) { - std::swap((*droppedPool_)[orgSize + i], (*data)[i]); - } - data->clear(); - } - - virtual std::deque* load() { return objPool_.get(); } - - private: - std::unique_ptr> objPool_; - std::unique_ptr> droppedPool_; -}; - -IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) { - switch (ct) { - case NO_CACHE: - return new NoCacheStrategy(); - case CACHE_PASS_IN_MEM: - return new CacheOnePassInMemory(); - default: - LOG(FATAL) << "Not implemented"; - } -} -} // namespace paddle - -#endif diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp deleted file mode 100644 index c6cd41de9a1a22470d8659eb90d1ac2b075b2df9..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Evaluator.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/utils/StringUtil.h" - -namespace paddle { - -/** - * calculate sequence-to-sequence edit distance - */ -class CTCErrorEvaluator : public Evaluator { - private: - MatrixPtr outActivations_; - int numTimes_, numClasses_, numSequences_, blank_; - real deletions_, insertions_, substitutions_; - int seqClassficationError_; - mutable std::unordered_map evalResults_; - - std::vector path2String(const std::vector& path) { - std::vector str; - str.clear(); - int prevLabel = -1; - for (std::vector::const_iterator label = path.begin(); - label != path.end(); - label++) { - if (*label != blank_ && - (str.empty() || *label != str.back() || prevLabel == blank_)) { - str.push_back(*label); - } - prevLabel = *label; - } - return str; - } - - std::vector bestLabelSeq() { - std::vector path; - path.clear(); - real* acts = outActivations_->getData(); - for (int i = 0; i < numTimes_; ++i) { - path.push_back(std::max_element(acts + i * numClasses_, - acts + (i + 1) * numClasses_) - - (acts + i * numClasses_)); - } - return path2String(path); - } - - /* "sp, dp, ip" is the weighting parameter of "substitution, deletion, - * insertion" - * in edit-distance error */ - real stringAlignment(std::vector& gtStr, - std::vector& recogStr, - bool backtrace = true, - real sp = 1.0, - real dp = 1.0, - real ip = 1.0) { - std::vector> matrix; - int substitutions, deletions, insertions; - real distance; - int n = gtStr.size(); - int m = recogStr.size(); - - if (n == 0) { - substitutions = 0; - deletions = 0; - insertions = m; - distance = m; - } else if (m == 0) { - substitutions = 0; - deletions = n; - insertions = 0; - distance = n; - } else { - substitutions = 0; - deletions = 0; - insertions = 0; - distance = 0; - // initialize the matrix - matrix.resize(n + 1); - for (int i = 0; i < n + 1; ++i) { - matrix[i].resize(m + 1); - for (int j = 0; j < m + 1; ++j) { - matrix[i][j] = 0; - } - } - for (int i = 0; i < n + 1; ++i) { - matrix[i][0] = i; - } - for (int j = 0; j < m + 1; ++j) { - matrix[0][j] = j; - } - - // calculate the insertions, substitutions and deletions - for (int i = 1; i < n + 1; ++i) { - int s_i = gtStr[i - 1]; - for (int j = 1; j < m + 1; ++j) { - int t_j = recogStr[j - 1]; - int cost = (s_i == t_j) ? 
0 : 1; - const int above = matrix[i - 1][j]; - const int left = matrix[i][j - 1]; - const int diag = matrix[i - 1][j - 1]; - const int cell = std::min(above + 1, std::min(left + 1, diag + cost)); - matrix[i][j] = cell; - } - } - - if (backtrace) { - size_t i = n; - size_t j = m; - substitutions = 0; - deletions = 0; - insertions = 0; - - while (i != 0 && j != 0) { - if (matrix[i][j] == matrix[i - 1][j - 1]) { - --i; - --j; - } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) { - ++substitutions; - --i; - --j; - } else if (matrix[i][j] == matrix[i - 1][j] + 1) { - ++deletions; - --i; - } else { - ++insertions; - --j; - } - } - while (i != 0) { - ++deletions; - --i; - } - while (j != 0) { - ++insertions; - --j; - } - int diff = substitutions + deletions + insertions; - if (diff != matrix[n][m]) { - LOG(ERROR) << "Found path with distance " << diff - << " but Levenshtein distance is " << matrix[n][m]; - } - - distance = (sp * substitutions) + (dp * deletions) + (ip * insertions); - } else { - distance = (real)matrix[n][m]; - } - } - real maxLen = std::max(m, n); - deletions_ += deletions / maxLen; - insertions_ += insertions / maxLen; - substitutions_ += substitutions / maxLen; - - if (distance != 0) { - seqClassficationError_ += 1; - } - - return distance / maxLen; - } - - real editDistance( - real* output, int numTimes, int numClasses, int* labels, int labelsLen) { - numTimes_ = numTimes; - numClasses_ = numClasses; - blank_ = numClasses_ - 1; - outActivations_ = Matrix::create(output, numTimes, numClasses); - std::vector recogStr, gtStr; - recogStr = bestLabelSeq(); - for (int i = 0; i < labelsLen; ++i) { - gtStr.push_back(labels[i]); - } - - return stringAlignment(gtStr, recogStr); - } - - void storeLocalValues() const { - evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0; - evalResults_["deletion_error"] = - numSequences_ ? deletions_ / numSequences_ : 0; - evalResults_["insertion_error"] = - numSequences_ ? insertions_ / numSequences_ : 0; - evalResults_["substitution_error"] = - numSequences_ ? 
substitutions_ / numSequences_ : 0; - evalResults_["sequence_error"] = - (real)seqClassficationError_ / numSequences_; - } - - public: - CTCErrorEvaluator() - : numTimes_(0), - numClasses_(0), - numSequences_(0), - blank_(0), - deletions_(0), - insertions_(0), - substitutions_(0), - seqClassficationError_(0) {} - - virtual real evalImp(std::vector& arguments) { - CHECK_EQ(arguments.size(), (size_t)2); - Argument output, label; - output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); - label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - CHECK(label.sequenceStartPositions); - CHECK(label.ids); - size_t numSequences = label.sequenceStartPositions->getSize() - 1; - const int* labelStarts = label.sequenceStartPositions->getData(false); - const int* outputStarts = output.sequenceStartPositions->getData(false); - real totalErr = 0; - for (size_t i = 0; i < numSequences; ++i) { - real err = 0; - err = editDistance( - output.value->getData() + output.value->getWidth() * outputStarts[i], - outputStarts[i + 1] - outputStarts[i], - output.value->getWidth(), - label.ids->getData() + labelStarts[i], - labelStarts[i + 1] - labelStarts[i]); - - totalErr += err; - } - - return totalErr; - } - - virtual void eval(const NeuralNetwork& nn) { - Evaluator::eval(nn); - std::vector arguments; - arguments.reserve(config_.input_layers_size()); - for (const std::string& name : config_.input_layers()) { - arguments.push_back(nn.getLayer(name)->getOutput()); - } - } - - virtual void updateSamplesNum(const std::vector& arguments) { - numSequences_ += arguments[1].getNumSequences(); - } - - virtual void start() { - Evaluator::start(); - numSequences_ = 0; - blank_ = 0; - deletions_ = 0; - insertions_ = 0; - substitutions_ = 0; - seqClassficationError_ = 0; - } - - virtual void printStats(std::ostream& os) const { - storeLocalValues(); - os << config_.name() << " error = " << evalResults_["error"]; - os << " deletions error = " << evalResults_["deletion_error"]; - os << " insertions error = " << evalResults_["insertion_error"]; - os << " substitution error = " << evalResults_["substitution_error"]; - os << " sequence error = " << evalResults_["sequence_error"]; - } - - virtual void distributeEval(ParameterClient2* client) { - double buf[6] = {totalScore_, - (double)deletions_, - (double)insertions_, - (double)substitutions_, - (double)seqClassficationError_, - (double)numSequences_}; - client->reduce(buf, buf, 6, FLAGS_trainer_id, 0); - totalScore_ = buf[0]; - deletions_ = (real)buf[1]; - insertions_ = (real)buf[2]; - substitutions_ = (real)buf[3]; - seqClassficationError_ = (int)buf[4]; - numSequences_ = (int)buf[5]; - } - - void getNames(std::vector* names) { - storeLocalValues(); - names->reserve(names->size() + evalResults_.size()); - for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) { - names->push_back(config_.name() + "." 
+ it->first); - } - } - - real getValue(const std::string& name, Error* err) const { - storeLocalValues(); - - std::vector buffers; - paddle::str::split(name, '.', &buffers); - auto it = evalResults_.find(buffers[buffers.size() - 1]); - - if (it == evalResults_.end()) { - *err = Error("Evaluator does not have the key %s", name.c_str()); - return 0.0f; - } - - return it->second; - } - - std::string getType(const std::string& name, Error* err) const { - this->getValue(name, err); - if (!err->isOK()) { - return ""; - } - return "ctc_edit_distance"; - } -}; - -REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator); - -} // namespace paddle diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp deleted file mode 100644 index a2216293b1ab3a32e9cc903b805ca0aca10d58c1..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/math/Vector.h" -#include "paddle/utils/StringUtil.h" - -#include "Evaluator.h" - -namespace paddle { - -/** - * Chunk evaluator is used to evaluate segment labelling accuracy for a - * sequence. It calculates the chunk detection F1 score. - * - * A chunk is correctly detected if its beginning, end and type are correct. - * Other chunk type is ignored. - * For each label in the label sequence, we have - * - * @code - * tagType = label % numTagType - * chunkType = label / numTagType - * otherChunkType = numChunkTypes - * @endcode - * - * The total number of different labels is numTagType*numChunkTypes+1 - * We support 4 labelling scheme - * The tag type for each of the scheme is shown as follows: - * - * @code - * Scheme Begin Inside End Single - * plain 0 - - - - * IOB 0 1 - - - * IOE - 0 1 - - * IOBES 0 1 2 3 - * @endcode - * - * 'plain' means the whole chunk must contain exactly the same chunk label. 
- */ -class ChunkEvaluator : public Evaluator { - int otherChunkType_; - int numChunkTypes_; // number of chunk types besides other chunk type - int numTagTypes_; - int tagBegin_; - int tagInside_; - int tagEnd_; - int tagSingle_; - - int64_t numLabelSegments_; - int64_t numOutputSegments_; - int64_t numCorrect_; - - struct Segment { - int begin; - int end; - int type; - bool operator==(const Segment& y) const { - return begin == y.begin && end == y.end && type == y.type; - } - }; - - std::vector labelSegments_; - std::vector outputSegments_; - std::set excludedChunkTypes_; - mutable std::unordered_map values_; - - public: - virtual void init(const EvaluatorConfig& config) { - Evaluator::init(config); - if (config.chunk_scheme() == "IOB") { - numTagTypes_ = 2; - tagBegin_ = 0; - tagInside_ = 1; - tagEnd_ = -1; - tagSingle_ = -1; - } else if (config.chunk_scheme() == "IOE") { - numTagTypes_ = 2; - tagBegin_ = -1; - tagInside_ = 0; - tagEnd_ = 1; - tagSingle_ = -1; - } else if (config.chunk_scheme() == "IOBES") { - numTagTypes_ = 4; - tagBegin_ = 0; - tagInside_ = 1; - tagEnd_ = 2; - tagSingle_ = 3; - } else if (config.chunk_scheme() == "plain") { - numTagTypes_ = 1; - tagBegin_ = -1; - tagInside_ = -1; - tagEnd_ = -1; - tagSingle_ = -1; - } else { - LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme(); - } - CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config"; - otherChunkType_ = numChunkTypes_ = config.num_chunk_types(); - - // the chunks of types in excludedChunkTypes_ will not be counted - auto& tmp = config.excluded_chunk_types(); - excludedChunkTypes_.insert(tmp.begin(), tmp.end()); - } - - virtual void start() { - Evaluator::start(); - numLabelSegments_ = 0; - numOutputSegments_ = 0; - numCorrect_ = 0; - } - - virtual void printStats(std::ostream& os) const { - storeLocalValues(); - os << config_.name() << "=" << values_["F1-score"] - << " true_chunks=" << numLabelSegments_ - << " result_chunks=" << numOutputSegments_ - << " correct_chunks=" << numCorrect_; - } - - virtual void distributeEval(ParameterClient2* client) { - int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_}; - client->reduce(buf, buf, 3, FLAGS_trainer_id, 0); - numLabelSegments_ = buf[0]; - numOutputSegments_ = buf[1]; - numCorrect_ = buf[2]; - } - - virtual real evalImp(std::vector& arguments) { - CHECK_EQ(arguments.size(), (size_t)2); - IVectorPtr& output = arguments[0].ids; - IVectorPtr& label = arguments[1].ids; - CHECK(!output->useGpu() && !label->useGpu()) << "Not supported"; - auto sequenceStartPositions = - arguments[1].sequenceStartPositions->getVector(false); - CHECK_EQ(output->getSize(), label->getSize()); - CHECK(sequenceStartPositions); - size_t numSequences = sequenceStartPositions->getSize() - 1; - const int* starts = sequenceStartPositions->getData(); - for (size_t i = 0; i < numSequences; ++i) { - eval1(output->getData() + starts[i], - label->getData() + starts[i], - starts[i + 1] - starts[i]); - } - return 0; - } - - void eval1(int* output, int* label, int length) { - getSegments(output, length, outputSegments_); - getSegments(label, length, labelSegments_); - size_t i = 0, j = 0; - while (i < outputSegments_.size() && j < labelSegments_.size()) { - if (outputSegments_[i] == labelSegments_[j] && - excludedChunkTypes_.count(outputSegments_[i].type) != 1) { - ++numCorrect_; - } - if (outputSegments_[i].end < labelSegments_[j].end) { - ++i; - } else if (outputSegments_[i].end > labelSegments_[j].end) { - ++j; - } else { - ++i; - ++j; - } - } - for 
(auto& segment : labelSegments_) { - if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_; - } - for (auto& segment : outputSegments_) { - if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_; - } - } - - void getSegments(int* label, int length, std::vector& segments) { - segments.clear(); - segments.reserve(length); - int chunkStart = 0; - bool inChunk = false; - int tag = -1; - int type = otherChunkType_; - for (int i = 0; i < length; ++i) { - int prevTag = tag; - int prevType = type; - CHECK_LE(label[i], numChunkTypes_ * numTagTypes_); - tag = label[i] % numTagTypes_; - type = label[i] / numTagTypes_; - if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) { - Segment segment{ - chunkStart, // begin - i - 1, // end - prevType, - }; - segments.push_back(segment); - inChunk = false; - } - if (isChunkBegin(prevTag, prevType, tag, type)) { - chunkStart = i; - inChunk = true; - } - } - if (inChunk) { - Segment segment{ - chunkStart, // begin - length - 1, // end - type, - }; - segments.push_back(segment); - } - } - - // whether (prevTag, prevType) is the end of a chunk - bool isChunkEnd(int prevTag, int prevType, int tag, int type) { - if (prevType == otherChunkType_) return false; - if (type == otherChunkType_) return true; - if (type != prevType) return true; - if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_; - if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_; - if (prevTag == tagEnd_) return true; - if (prevTag == tagSingle_) return true; - return false; - } - - // whether (tag, type) is the beginning of a chunk - bool isChunkBegin(int prevTag, int prevType, int tag, int type) { - if (prevType == otherChunkType_) return type != otherChunkType_; - if (type == otherChunkType_) return false; - if (type != prevType) return true; - if (tag == tagBegin_) return true; - if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_; - if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_; - if (tag == tagSingle_) return true; - return false; - } - - // three metrics: precision, recall and F1-score - void getNames(std::vector* names) { - storeLocalValues(); - names->reserve(names->size() + values_.size()); - for (auto it = values_.begin(); it != values_.end(); ++it) { - names->push_back(config_.name() + "." + it->first); - } - } - - // get value by field name - real getValue(const std::string& name, Error* err) const { - storeLocalValues(); - std::vector buffers; - paddle::str::split(name, '.', &buffers); - auto it = values_.find(buffers.back()); - if (it == values_.end()) { // not found - *err = Error("No such key %s", name.c_str()); - return 0.0f; - } - - return it->second; - } - - // get type of evaluator - std::string getType(const std::string& name, Error* err) const { - this->getValue(name, err); - if (!err->isOK()) { - return ""; - } - return "chunk"; - } - - private: - void storeLocalValues() const { - CHECK_GE(numOutputSegments_, 0); - CHECK_GE(numLabelSegments_, 0); - double precision = - !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_; - double recall = - !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_; - values_["precision"] = precision; - values_["recall"] = recall; - values_["F1-score"] = - !numCorrect_ ? 
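As a rough illustration of the label convention this evaluator uses (tag = label % numTagTypes, type = label / numTagTypes, with type == numChunkTypes meaning the "other" label), the sketch below decodes an IOB-tagged sequence into chunks and scores it the same way: a chunk counts as correct only if begin, end and type all match. Names and example labels are illustrative, not taken from the codebase.

    #include <cstdio>
    #include <vector>

    struct Chunk {
      int begin, end, type;  // inclusive token span and chunk type
    };

    // Decode labels under the IOB scheme: tag = label % 2 (0 = B, 1 = I),
    // type = label / 2, and type == numChunkTypes is the "other" (O) label.
    std::vector<Chunk> decodeIOB(const std::vector<int>& labels, int numChunkTypes) {
      std::vector<Chunk> chunks;
      int start = -1, curType = -1;
      for (int i = 0; i < static_cast<int>(labels.size()); ++i) {
        int tag = labels[i] % 2;
        int type = labels[i] / 2;
        bool isOther = (type == numChunkTypes);
        bool startsNew = !isOther && (tag == 0 || type != curType || start < 0);
        if (start >= 0 && (isOther || startsNew)) {
          chunks.push_back({start, i - 1, curType});  // close the running chunk
          start = -1;
        }
        if (startsNew) {
          start = i;
          curType = type;
        } else if (isOther) {
          curType = -1;
        }
      }
      if (start >= 0) {
        chunks.push_back({start, static_cast<int>(labels.size()) - 1, curType});
      }
      return chunks;
    }

    int main() {
      // Tokens:  w0   w1   w2  w3   w4      (numChunkTypes = 2, "O" label = 4)
      // Gold:    B-0  I-0  O   B-1  I-1  -> {0, 1, 4, 2, 3}
      // Pred:    B-0  I-0  O   B-1  B-1  -> {0, 1, 4, 2, 2}
      std::vector<int> gold = {0, 1, 4, 2, 3};
      std::vector<int> pred = {0, 1, 4, 2, 2};
      auto g = decodeIOB(gold, 2);
      auto p = decodeIOB(pred, 2);
      int correct = 0;
      for (const auto& c : p)
        for (const auto& d : g)
          if (c.begin == d.begin && c.end == d.end && c.type == d.type) ++correct;
      double precision = p.empty() ? 0 : static_cast<double>(correct) / p.size();
      double recall = g.empty() ? 0 : static_cast<double>(correct) / g.size();
      double f1 = correct ? 2 * precision * recall / (precision + recall) : 0;
      std::printf("precision=%.2f recall=%.2f F1=%.2f\n", precision, recall, f1);
      return 0;
    }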
0 : 2 * precision * recall / (precision + recall); - } -}; - -REGISTER_EVALUATOR(chunk, ChunkEvaluator); - -} // namespace paddle diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp deleted file mode 100644 index ddb8ebca784db4a83c328ff75f5c50c7aecd7352..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Evaluator.h" -#include "paddle/gserver/layers/DetectionUtil.h" - -using std::map; -using std::vector; -using std::pair; -using std::make_pair; - -namespace paddle { - -/** - * @brief detection map Evaluator - * - * The config file api is detection_map_evaluator. - */ -class DetectionMAPEvaluator : public Evaluator { - public: - DetectionMAPEvaluator() - : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {} - - virtual void start() { - Evaluator::start(); - allTruePos_.clear(); - allFalsePos_.clear(); - numPos_.clear(); - } - - virtual real evalImp(std::vector& arguments) { - overlapThreshold_ = config_.overlap_threshold(); - backgroundId_ = config_.background_id(); - evaluateDifficult_ = config_.evaluate_difficult(); - apType_ = config_.ap_type(); - - MatrixPtr detectTmpValue = arguments[0].value; - Matrix::resizeOrCreate(cpuOutput_, - detectTmpValue->getHeight(), - detectTmpValue->getWidth(), - false, - false); - - MatrixPtr labelTmpValue = arguments[1].value; - Matrix::resizeOrCreate(cpuLabel_, - labelTmpValue->getHeight(), - labelTmpValue->getWidth(), - false, - false); - - cpuOutput_->copyFrom(*detectTmpValue); - cpuLabel_->copyFrom(*labelTmpValue); - - Argument label = arguments[1]; - const int* labelIndex = label.sequenceStartPositions->getData(false); - size_t batchSize = label.getNumSequences(); - - vector>> allGTBBoxes; - vector>>> allDetectBBoxes; - - for (size_t n = 0; n < batchSize; ++n) { - map> bboxes; - for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) { - vector bbox; - getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox); - int c = cpuLabel_->getData()[i * 6]; - bboxes[c].push_back(bbox[0]); - } - allGTBBoxes.push_back(bboxes); - } - - size_t n = 0; - const real* cpuOutputData = cpuOutput_->getData(); - for (size_t imgId = 0; imgId < batchSize; ++imgId) { - map>> bboxes; - size_t curImgId = static_cast((cpuOutputData + n * 7)[0]); - while (curImgId == imgId && n < cpuOutput_->getHeight()) { - vector label; - vector score; - vector bbox; - getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox); - bboxes[label[0]].push_back(make_pair(score[0], bbox[0])); - ++n; - curImgId = static_cast((cpuOutputData + n * 7)[0]); - } - allDetectBBoxes.push_back(bboxes); - } - - for (size_t n = 0; n < batchSize; ++n) { - for (map>::iterator it = - allGTBBoxes[n].begin(); - it != allGTBBoxes[n].end(); - ++it) { - size_t count = 0; - if (evaluateDifficult_) { - count = it->second.size(); - } else { - 
for (size_t i = 0; i < it->second.size(); ++i) - if (!(it->second[i].isDifficult)) ++count; - } - if (numPos_.find(it->first) == numPos_.end() && count != 0) { - numPos_[it->first] = count; - } else { - numPos_[it->first] += count; - } - } - } - - // calcTFPos - calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes); - - return 0; - } - - virtual void printStats(std::ostream& os) const { - real mAP = calcMAP(); - os << "Detection mAP=" << mAP; - } - - virtual void distributeEval(ParameterClient2* client) { - LOG(FATAL) << "Distribute detection evaluation not implemented."; - } - - protected: - void calcTFPos(const size_t batchSize, - const vector>>& allGTBBoxes, - const vector>>>& - allDetectBBoxes) { - for (size_t n = 0; n < allDetectBBoxes.size(); ++n) { - if (allGTBBoxes[n].size() == 0) { - for (map>>::const_iterator - it = allDetectBBoxes[n].begin(); - it != allDetectBBoxes[n].end(); - ++it) { - size_t label = it->first; - for (size_t i = 0; i < it->second.size(); ++i) { - allTruePos_[label].push_back(make_pair(it->second[i].first, 0)); - allFalsePos_[label].push_back(make_pair(it->second[i].first, 1)); - } - } - } else { - for (map>>::const_iterator - it = allDetectBBoxes[n].begin(); - it != allDetectBBoxes[n].end(); - ++it) { - size_t label = it->first; - vector> predBBoxes = it->second; - if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) { - for (size_t i = 0; i < predBBoxes.size(); ++i) { - allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); - allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1)); - } - } else { - vector gtBBoxes = - allGTBBoxes[n].find(label)->second; - vector visited(gtBBoxes.size(), false); - // Sort detections in descend order based on scores - std::sort(predBBoxes.begin(), - predBBoxes.end(), - sortScorePairDescend); - for (size_t i = 0; i < predBBoxes.size(); ++i) { - real maxOverlap = -1.0; - size_t maxIdx = 0; - for (size_t j = 0; j < gtBBoxes.size(); ++j) { - real overlap = - jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]); - if (overlap > maxOverlap) { - maxOverlap = overlap; - maxIdx = j; - } - } - if (maxOverlap > overlapThreshold_) { - if (evaluateDifficult_ || - (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) { - if (!visited[maxIdx]) { - allTruePos_[label].push_back( - make_pair(predBBoxes[i].first, 1)); - allFalsePos_[label].push_back( - make_pair(predBBoxes[i].first, 0)); - visited[maxIdx] = true; - } else { - allTruePos_[label].push_back( - make_pair(predBBoxes[i].first, 0)); - allFalsePos_[label].push_back( - make_pair(predBBoxes[i].first, 1)); - } - } - } else { - allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); - allFalsePos_[label].push_back( - make_pair(predBBoxes[i].first, 1)); - } - } - } - } - } - } - } - - real calcMAP() const { - real mAP = 0.0; - size_t count = 0; - for (map::const_iterator it = numPos_.begin(); - it != numPos_.end(); - ++it) { - size_t label = it->first; - size_t labelNumPos = it->second; - if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end()) - continue; - vector> labelTruePos = allTruePos_.find(label)->second; - vector> labelFalsePos = - allFalsePos_.find(label)->second; - // Compute average precision. - vector tpCumSum; - getAccumulation(labelTruePos, &tpCumSum); - vector fpCumSum; - getAccumulation(labelFalsePos, &fpCumSum); - std::vector precision, recall; - size_t num = tpCumSum.size(); - // Compute Precision. 
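calcTFPos above performs the standard greedy matching: detections are sorted by descending score and each one is matched to the best-overlapping ground-truth box of its class, counting a true positive only when that overlap exceeds overlapThreshold_ and the box has not already been claimed. A condensed sketch of the same matching (Box and iou are assumed helpers here, not the DetectionUtil API):

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct Box { double xmin, ymin, xmax, ymax; };

    // Jaccard (IoU) overlap of two axis-aligned boxes.
    double iou(const Box& a, const Box& b) {
      double w = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
      double h = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
      if (w <= 0 || h <= 0) return 0.0;
      double inter = w * h;
      double areaA = (a.xmax - a.xmin) * (a.ymax - a.ymin);
      double areaB = (b.xmax - b.xmin) * (b.ymax - b.ymin);
      return inter / (areaA + areaB - inter);
    }

    // Greedy TP/FP assignment for one image and one class: returns
    // (score, isTruePositive) pairs whose cumulative sums feed the PR curve.
    std::vector<std::pair<double, bool>> matchDetections(
        std::vector<std::pair<double, Box>> dets,  // (score, predicted box)
        const std::vector<Box>& gts,
        double overlapThreshold) {
      std::sort(dets.begin(), dets.end(),
                [](const std::pair<double, Box>& a, const std::pair<double, Box>& b) {
                  return a.first > b.first;  // highest score first
                });
      std::vector<bool> used(gts.size(), false);
      std::vector<std::pair<double, bool>> result;
      for (const auto& det : dets) {
        double best = -1.0;
        size_t bestIdx = 0;
        for (size_t j = 0; j < gts.size(); ++j) {
          double o = iou(det.second, gts[j]);
          if (o > best) { best = o; bestIdx = j; }
        }
        // True positive only if it clears the threshold and its ground-truth
        // box has not already been matched by a higher-scoring detection.
        bool tp = !gts.empty() && best > overlapThreshold && !used[bestIdx];
        if (tp) used[bestIdx] = true;
        result.push_back({det.first, tp});
      }
      return result;
    }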
- for (size_t i = 0; i < num; ++i) { - CHECK_LE(tpCumSum[i], labelNumPos); - precision.push_back(static_cast(tpCumSum[i]) / - static_cast(tpCumSum[i] + fpCumSum[i])); - recall.push_back(static_cast(tpCumSum[i]) / labelNumPos); - } - // VOC2007 style - if (apType_ == "11point") { - vector maxPrecisions(11, 0.0); - int startIdx = num - 1; - for (int j = 10; j >= 0; --j) - for (int i = startIdx; i >= 0; --i) { - if (recall[i] < j / 10.) { - startIdx = i; - if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j]; - break; - } else { - if (maxPrecisions[j] < precision[i]) - maxPrecisions[j] = precision[i]; - } - } - for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11; - ++count; - } else if (apType_ == "Integral") { - // Nature integral - real averagePrecisions = 0.; - real prevRecall = 0.; - for (size_t i = 0; i < num; ++i) { - if (fabs(recall[i] - prevRecall) > 1e-6) - averagePrecisions += precision[i] * fabs(recall[i] - prevRecall); - prevRecall = recall[i]; - } - mAP += averagePrecisions; - ++count; - } else { - LOG(FATAL) << "Unkown ap version: " << apType_; - } - } - if (count != 0) mAP /= count; - return mAP * 100; - } - - void getAccumulation(vector> inPairs, - vector* accuVec) const { - std::stable_sort( - inPairs.begin(), inPairs.end(), sortScorePairDescend); - accuVec->clear(); - size_t sum = 0; - for (size_t i = 0; i < inPairs.size(); ++i) { - sum += inPairs[i].second; - accuVec->push_back(sum); - } - } - - std::string getTypeImpl() const { return "detection_map"; } - - real getValueImpl() const { return calcMAP(); } - - private: - real overlapThreshold_; // overlap threshold when determining whether matched - bool evaluateDifficult_; // whether evaluate difficult ground truth - size_t backgroundId_; // class index of background - std::string apType_; // how to calculate mAP (Integral or 11point) - - MatrixPtr cpuOutput_; - MatrixPtr cpuLabel_; - - map numPos_; // counts of true objects each classification - map>> - allTruePos_; // true positive prediction - map>> - allFalsePos_; // false positive prediction -}; - -REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator); - -} // namespace paddle diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp deleted file mode 100644 index 941fb8fb539d58cca22ecf563d2effa816243c3b..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ /dev/null @@ -1,1361 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
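The two AP modes above differ only in how they integrate the precision/recall curve: "11point" averages the maximum precision reachable at recall thresholds 0, 0.1, ..., 1.0, while "Integral" sums precision times the recall increment. A small sketch of the 11-point variant (example arrays only):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // VOC2007-style 11-point interpolated average precision.
    double elevenPointAP(const std::vector<double>& precision,
                         const std::vector<double>& recall) {
      double ap = 0.0;
      for (int j = 0; j <= 10; ++j) {
        double threshold = j / 10.0;
        double best = 0.0;  // max precision among points with recall >= threshold
        for (size_t i = 0; i < recall.size(); ++i) {
          if (recall[i] >= threshold) best = std::max(best, precision[i]);
        }
        ap += best / 11.0;
      }
      return ap;
    }

    int main() {
      // Cumulative precision/recall along the score-sorted detections (made up).
      std::vector<double> precision = {1.0, 0.5, 0.67, 0.75};
      std::vector<double> recall = {0.25, 0.25, 0.5, 0.75};
      std::printf("AP = %.3f\n", elevenPointAP(precision, recall));
      return 0;
    }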
*/ - -#include "paddle/gserver/evaluators/Evaluator.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/StringUtil.h" - -DECLARE_int32(trainer_id); - -namespace paddle { - -void Evaluator::eval(const NeuralNetwork& nn) { - std::vector arguments; - arguments.reserve(config_.input_layers_size()); - for (const std::string& name : config_.input_layers()) { - arguments.push_back(nn.getLayer(name)->getOutput()); - } - SetDevice device(arguments[0].deviceId); - real score = evalImp(arguments); - totalScore_ += score; - updateSamplesNum(arguments); -} -/** - * @brief classification error Evaluator - * - * The config file api is classification_error_evaluator. - */ -class ClassificationErrorEvaluator : public Evaluator { - public: - /* - ClassificationErrorEvaluator() : totalScore2_(0) {} - - virtual void start() { - Evaluator::start(); - totalScore2_ = 0; - } */ - - virtual void updateSamplesNum(const std::vector& arguments) { - if (3 == arguments.size()) { - numSamples_ += arguments[2].value->getSum(); - } else { - numSamples_ += arguments[0].getBatchSize(); - } - } - - MatrixPtr calcError(std::vector& arguments) { - CHECK_GE(arguments.size(), (size_t)2); - CHECK_LE(arguments.size(), (size_t)3); - MatrixPtr& output = arguments[0].value; - IVectorPtr& label = arguments[1].ids; - MatrixPtr& multiBinaryLabel = arguments[1].value; // For multi binary label - bool supportWeight = (3 == arguments.size()) ? true : false; - MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; - if (nullptr == output || - (nullptr == label && nullptr == multiBinaryLabel) || - (supportWeight && nullptr == weight)) { - return 0; - } - - if (label != nullptr) { - CHECK_EQ(label->getSize(), output->getHeight()); - } else { - CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight()); - CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth()); - } - if (supportWeight) { - CHECK_EQ(output->getHeight(), weight->getHeight()); - CHECK_EQ((size_t)1, weight->getWidth()); - } - - const MatrixPtr errorMat = Matrix::create(output->getHeight(), - 1, - /* trans= */ false, - useGpu(arguments[0].deviceId)); - - errorMat->zeroMem(); - - if (label != nullptr) { - errorMat->classificationError(*output, *label, config_.top_k()); - } else if (dynamic_cast(multiBinaryLabel.get()) || - dynamic_cast(multiBinaryLabel.get())) { - errorMat->classificationErrorMulti( - *output, *multiBinaryLabel, config_.classification_threshold()); - } else { - errorMat->binaryClassificationError( - 0, *output, *multiBinaryLabel, config_.classification_threshold()); - } - - if (supportWeight) { - errorMat->dotMul(*errorMat, *weight); - } - return errorMat; - } - - void printStats(std::ostream& os) const { - if (config_.top_k() == 1) { - os << config_.name() << "=" - << (numSamples_ ? totalScore_ / numSamples_ : 0); - } else { - os << " top_" << config_.top_k() - << "_error=" << (numSamples_ ? 
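For the single-label case, calcError above reduces to comparing the argmax of each output row with its label; a tiny standalone version with made-up data:

    #include <cstdio>
    #include <vector>

    // Fraction of rows whose argmax disagrees with the label (top-1 error).
    double classificationError(const std::vector<std::vector<double>>& output,
                               const std::vector<int>& labels) {
      int errors = 0;
      for (size_t i = 0; i < output.size(); ++i) {
        size_t maxIdx = 0;
        for (size_t j = 1; j < output[i].size(); ++j)
          if (output[i][j] > output[i][maxIdx]) maxIdx = j;
        if (static_cast<int>(maxIdx) != labels[i]) ++errors;
      }
      return output.empty() ? 0.0 : static_cast<double>(errors) / output.size();
    }

    int main() {
      std::vector<std::vector<double>> out = {{0.1, 0.9}, {0.8, 0.2}, {0.4, 0.6}};
      std::vector<int> labels = {1, 0, 0};  // only the third row is misclassified
      std::printf("top-1 error = %.3f\n", classificationError(out, labels));  // 0.333
      return 0;
    }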
totalScore_ / numSamples_ : 0); - } - } - - virtual real evalImp(std::vector& arguments) { - MatrixPtr errorMat = calcError(arguments); - return errorMat->getSum(); - } - - virtual void distributeEval(ParameterClient2* client) { - mergeResultsOfAllClients(client); - } - - // Evaluator interface - protected: - std::string getTypeImpl() const { return "classification_error"; } -}; - -/** - * @brief sequence classification error Evaluator - * @note sequence level classification error stats, - * if any frame in one sequence has error, the sequence is error - */ -class SequenceClassificationErrorEvaluator - : public ClassificationErrorEvaluator { - public: - virtual void updateSamplesNum(const std::vector& arguments) { - numSamples_ += arguments[0].getNumSequences(); - } - - virtual real evalImp(std::vector& arguments) { - auto sequenceStartPositions = - arguments[0].sequenceStartPositions->getVector(false); - CHECK(sequenceStartPositions != nullptr); - const int* starts = sequenceStartPositions->getData(); - - MatrixPtr errorMat = calcError(arguments); - - int errCounter = 0; - CpuVector errorVec(0, nullptr); - for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) { - errorVec.subVecFrom( - errorMat->getData(), starts[i], starts[i + 1] - starts[i]); - if (errorVec.getSum() > 0) { - errCounter += 1; - } - } - - return static_cast(errCounter); - } - - virtual void distributeEval(ParameterClient2* client) { - mergeResultsOfAllClients(client); - } - - // Evaluator interface - protected: - std::string getTypeImpl() const { return "seq_classification_error"; } -}; -REGISTER_EVALUATOR(seq_classification_error, - SequenceClassificationErrorEvaluator); -/** - * @brief sum Evaluator - * Calculate the sum of output or label - * - * The config file api is sum_evaluator. - */ -class SumEvaluator : public Evaluator { - public: - SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {} - - virtual void updateSamplesNum(const std::vector& arguments) { - if (2 == arguments.size()) { - numSamples_ += arguments[1].value->getSum(); - } else { - numSamples_ += arguments[0].getBatchSize(); - } - } - - virtual real evalImp(std::vector& arguments) { - REGISTER_TIMER("SumEvaluator"); - CHECK_GE(arguments.size(), (size_t)1); - CHECK_LE(arguments.size(), (size_t)2); - bool supportWeight = (2 == arguments.size()) ? true : false; - if (supportWeight) { - if (nullptr == arguments[1].value) { - return 0; - } - CHECK_EQ(arguments[1].value->getWidth(), (size_t)1); - } - - // The sum of output - if (arguments[0].value) { - if (supportWeight) { - CHECK_EQ(arguments[0].value->getHeight(), - arguments[1].value->getHeight()); - MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(), - arguments[0].value->getWidth(), - /* trans= */ false, - arguments[0].value->useGpu()); - tmpMat->copyFrom(*arguments[0].value); - tmpMat->rowScale(0, *tmpMat, *arguments[1].value); - return tmpMat->getSum(); - } else { - return arguments[0].value->getSum(); - } - // The sum of label - } else if (arguments[0].ids) { - size_t insNum = arguments[0].ids->getSize(); - IVectorPtr label = arguments[0].ids; - MatrixPtr weight = supportWeight ? 
arguments[1].value : nullptr; - if (dynamic_cast(label.get())) { - IVector::resizeOrCreate(cpuLabel_, insNum, false); - cpuLabel_->copyFrom(*arguments[0].ids); - - if (supportWeight) { - CHECK_EQ(insNum, arguments[1].value->getHeight()); - Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); - cpuWeight_->copyFrom(*arguments[1].value); - } - - label = cpuLabel_; - weight = cpuWeight_; - } - - if (supportWeight) { - real score = 0.0; - int* labelD = label->getData(); - real* weightD = weight->getData(); - for (size_t i = 0; i < insNum; ++i) { - score += (labelD[i] * weightD[i]); - } - return score; - } else { - return label->getSum(); - } - } else { - return 0; - } - } - - virtual void distributeEval(ParameterClient2* client) { - mergeResultsOfAllClients(client); - } - - private: - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; - - // Evaluator interface - protected: - std::string getTypeImpl() const { return "sum"; } -}; -/** - * @brief column sum Evaluator - * @note column sum for the colIdx-th column * - * - colIdx = 0: the 0-th column. - * - colIdx > 0: the colIdx-th column. - * - colIdx < 0: the last colIdx-th column. - * - * The config file api is column_sum_evaluator. - * - */ -class ColumnSumEvaluator : public Evaluator { - public: - explicit ColumnSumEvaluator(int32_t colIdx) - : colIdx_(colIdx), colNum_(0), sum_(nullptr) {} - - virtual void start() { - Evaluator::start(); - if (nullptr != sum_) { - sum_->zeroMem(); - } - } - - virtual void updateSamplesNum(const std::vector& arguments) { - if (2 == arguments.size()) { - numSamples_ += arguments[1].value->getSum(); - } else { - numSamples_ += arguments[0].getBatchSize(); - } - } - - virtual real evalImp(std::vector& arguments) { - REGISTER_TIMER("ColumnSumEvaluator"); - CHECK_GE(arguments.size(), (size_t)1); - CHECK_LE(arguments.size(), (size_t)2); - bool supportWeight = (2 == arguments.size()) ? 
true : false; - if (nullptr == arguments[0].value || - (supportWeight && nullptr == arguments[1].value)) { - return 0; - } - - size_t insNum = arguments[0].value->getHeight(); - size_t colNum = arguments[0].value->getWidth(); - if (nullptr == sum_) { - sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false); - colNum_ = colNum; - sum_->zeroMem(); - } else { - CHECK_EQ(colNum, sum_->getWidth()); - } - - if (supportWeight) { - CHECK_EQ(insNum, arguments[1].value->getHeight()); - CHECK_EQ((size_t)1, arguments[1].value->getWidth()); - MatrixPtr tmpMat = Matrix::create(insNum, colNum); - if (arguments[0].value->useGpu()) { - tmpMat->copyFrom(*arguments[0].value); - } - if (!arguments[1].value->useGpu()) { - if (!arguments[0].value->useGpu()) { - tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value); - } else { - tmpMat->rowScale(0, *tmpMat, *arguments[1].value); - } - } else { - MatrixPtr tmp2 = Matrix::create(insNum, 1); - tmp2->copyFrom(*arguments[1].value); - if (!arguments[0].value->useGpu()) { - tmpMat->rowScale(0, *arguments[0].value, *tmp2); - } else { - tmpMat->rowScale(0, *tmpMat, *tmp2); - } - } - sum_->accumulateColSum(*tmpMat); - } else { - if (!arguments[0].value->useGpu()) { - sum_->accumulateColSum(*arguments[0].value); - } else { - MatrixPtr tmpMat = Matrix::create(insNum, colNum); - tmpMat->copyFrom(*arguments[0].value); - sum_->accumulateColSum(*tmpMat); - } - } - return 0; - } - - virtual void printStats(std::ostream& os) const { - CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0) - << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", " - << colNum_ << ")"; - size_t colIdx = 0; - if (colIdx_ >= 0) { - colIdx = colIdx_; - } else { - colIdx = colNum_ + colIdx_; - } - os << config_.name() << "=" - << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0); - } - - void distributeEval(ParameterClient2* client) { - client->reduce( - sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0); - client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0); - } - - private: - int32_t colIdx_; - size_t colNum_; - MatrixPtr sum_; /* cpu matrix */ - - // Evaluator interface - protected: - std::string getTypeImpl() const { - if (colIdx_ == -1) - return "last-column-sum"; - else - return "column-sum"; - } -}; - -void AucEvaluator::start() { - Evaluator::start(); - memset(statPos_, 0, sizeof(statPos_)); - memset(statNeg_, 0, sizeof(statNeg_)); -} - -real AucEvaluator::evalImp(std::vector& arguments) { - REGISTER_TIMER("AucEvaluator"); - CHECK_GE(arguments.size(), (size_t)2); - CHECK_LE(arguments.size(), (size_t)3); - MatrixPtr output = arguments[0].value; - IVectorPtr label = arguments[1].ids; - MatrixPtr labelval = arguments[1].value; - bool supportWeight = (3 == arguments.size()) ? true : false; - MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; - - if (nullptr == output || (supportWeight && nullptr == weight)) { - return 0; - } - size_t insNum = output->getHeight(); - size_t outputDim = output->getWidth(); - // Copy label from value to a vector. 
- if (nullptr == label && nullptr != labelval) { - // label width is 1 - CHECK_EQ(1U, labelval->getWidth()); - VectorPtr vec = - Vector::create(labelval->getData(), insNum, output->useGpu()); - label = vec->castToInt(); - } - - CHECK_EQ(insNum, label->getSize()); - if (supportWeight) { - CHECK_EQ(insNum, weight->getHeight()); - CHECK_EQ((size_t)1, weight->getWidth()); - } - - CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0) - << "column index [" << colIdx_ << "] out of range [-" << outputDim << ", " - << outputDim << ")"; - realColumnIdx_ = 0; - if (colIdx_ >= 0) { - realColumnIdx_ = colIdx_; - } else { - realColumnIdx_ = outputDim + colIdx_; - } - - if (dynamic_cast(output.get())) { - Matrix::resizeOrCreate(cpuOutput_, - insNum, - outputDim, - /* trans=*/false, - /* useGpu=*/false); - cpuOutput_->copyFrom(*output); - IVector::resizeOrCreate(cpuLabel_, insNum, false); - cpuLabel_->copyFrom(*label); - - if (supportWeight) { - Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); - cpuWeight_->copyFrom(*weight); - } - - output = cpuOutput_; - label = cpuLabel_; - weight = cpuWeight_; - } - - real* outputD = output->getData(); - int* labelD = label->getData(); - real* weightD = supportWeight ? weight->getData() : nullptr; - size_t pos = realColumnIdx_; - - for (size_t i = 0; i < insNum; ++i) { - real value = outputD[pos]; - uint32_t binIdx = static_cast(value * kBinNum_); - CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx - << "] out of range, predict value[" << value - << "]"; - real w = supportWeight ? weightD[i] : 1.0; - if (labelD[i] == kNegativeLabel_) { - statNeg_[binIdx] += w; - } else { - statPos_[binIdx] += w; - } - pos += outputDim; - } - return 0; -} - -void AucEvaluator::distributeEval(ParameterClient2* client) { - client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0); - client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0); -} - -double AucEvaluator::calcAuc() const { - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - double auc = 0.0; - - int64_t idx = kBinNum_; - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += statPos_[idx]; - totNeg += statNeg_[idx]; - auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - return auc / totPos / totNeg; - } else { - return 0.0; - } -} - -real AucEvaluator::getValueImpl() const { return calcAuc(); } - -std::string AucEvaluator::getTypeImpl() const { - if (colIdx_ == -1) { - return "last-column-auc"; - } else { - return "auc"; - } -} - -// class RankAucEvaluator -REGISTER_EVALUATOR(rankauc, RankAucEvaluator); - -void RankAucEvaluator::start() { Evaluator::start(); } -void RankAucEvaluator::updateSamplesNum( - const std::vector& arguments) { - numSamples_ += arguments[0].getNumSequences(); -} -real RankAucEvaluator::evalImp(std::vector& arguments) { - CHECK_GE(arguments.size(), 2U); - CHECK_LE(arguments.size(), 3U); - double batchAuc = 0.0; - output_ = arguments[0].value; - click_ = arguments[1].value; - size_t batchSize = output_->getHeight(); - CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!"; - - if (arguments.size() == 3U) { - pv_ = arguments[2].value; - } else { - Matrix::resizeOrCreate(pv_, batchSize, 1, false, false); - std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0); - } - - real* outputData = output_->getData(); - real* clickData = click_->getData(); - real* pvData = pv_->getData(); - - 
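calcAuc above integrates the ROC curve from score histograms: predictions are bucketed into kBinNum_ + 1 bins by score, and scanning from the highest bin downwards accumulates (false positive, true positive) mass whose consecutive points are joined by trapezoids. A compact sketch of that integration (the trapezoidArea formula is assumed here, since its body is not part of this hunk):

    #include <vector>

    // Assumed form of trapezoidArea(): area between two adjacent ROC points,
    // where x is cumulative negative weight and y is cumulative positive weight.
    double trapezoidArea(double x1, double x2, double y1, double y2) {
      return (x1 - x2) * (y1 + y2) / 2.0;
    }

    // AUC from per-bin positive/negative weights (bin index ~ predicted score).
    double aucFromBins(const std::vector<double>& statPos,
                       const std::vector<double>& statNeg) {
      double totPos = 0, totNeg = 0, auc = 0;
      for (int idx = static_cast<int>(statPos.size()) - 1; idx >= 0; --idx) {
        double prevPos = totPos, prevNeg = totNeg;
        totPos += statPos[idx];  // scan from the highest-score bin downwards
        totNeg += statNeg[idx];
        auc += trapezoidArea(totNeg, prevNeg, totPos, prevPos);
      }
      return (totPos > 0 && totNeg > 0) ? auc / totPos / totNeg : 0.0;
    }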
auto startPos = arguments[0].sequenceStartPositions->getVector(false); - const int* startPosData = startPos->getData(); - size_t batchNum = startPos->getSize() - 1; - for (size_t i = 0; i < batchNum; ++i) { - int beginPos = startPosData[i]; - int endPos = startPosData[i + 1]; - batchAuc += calcRankAuc(outputData + beginPos, - clickData + beginPos, - pvData + beginPos, - endPos - beginPos); - } - return batchAuc; -} - -double RankAucEvaluator::calcRankAuc(real* outputData, - real* clickData, - real* pvData, - size_t size) { - outputPair_.clear(); - for (size_t i = 0; i < size; ++i) { - outputPair_.push_back(std::make_pair(outputData[i], i)); - } - std::sort(outputPair_.begin(), - outputPair_.end(), - [](const std::pair& a, const std::pair& b) { - return a.first > b.first; - }); - double aucTmp = 0.0; - double clickSum = 0.0; - double oldClickSum = 0.0; - double noClick = 0.0; - double noClickSum = 0.0; - - double lastScore = outputPair_[0].first + 1.0; - for (size_t i = 0; i < size; ++i) { - if (lastScore != outputPair_[i].first) { - aucTmp += (clickSum + oldClickSum) * noClick / 2.0; - oldClickSum = clickSum; - noClick = 0.0; - lastScore = outputPair_[i].first; - } - size_t id = outputPair_[i].second; - noClick += pvData[id] - clickData[id]; - noClickSum += noClick; - clickSum += clickData[id]; - } - aucTmp += (clickSum + oldClickSum) * noClick / 2.0; - return (clickSum * noClickSum) == 0.0 ? 0.0 - : aucTmp / (clickSum * noClickSum); -} - -std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; } - -// class PrecisionRecallEvaluator -REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator); - -void PrecisionRecallEvaluator::start() { - Evaluator::start(); - statsInfo_.clear(); - values_.clear(); -} - -real PrecisionRecallEvaluator::evalImp(std::vector& arguments) { - REGISTER_TIMER("PrecisionRecallEvaluator"); - CHECK_GE(arguments.size(), (size_t)2); - CHECK_LE(arguments.size(), (size_t)3); - MatrixPtr output = arguments[0].value; - IVectorPtr label = arguments[1].ids; - MatrixPtr multiBinaryLabel = arguments[1].value; - bool supportWeight = (3 == arguments.size()) ? true : false; - MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; - if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) || - (supportWeight && nullptr == weight)) { - return 0; - } - - size_t insNum = output->getHeight(); - size_t outputDim = output->getWidth(); - if (label != nullptr) { - CHECK_EQ(insNum, label->getSize()); - } else { - CHECK_EQ(insNum, multiBinaryLabel->getHeight()); - CHECK_EQ(outputDim, multiBinaryLabel->getWidth()); - } - if (supportWeight) { - CHECK_EQ(insNum, weight->getHeight()); - CHECK_EQ((size_t)1, weight->getWidth()); - } - - if (statsInfo_.size() != outputDim) { - statsInfo_.clear(); - statsInfo_.resize(outputDim); - } - - isMultiBinaryLabel_ = (nullptr == label) ? 
true : false; - if (label != nullptr) { - if (dynamic_cast(output.get())) { - Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false); - cpuOutput_->copyFrom(*output); - IVector::resizeOrCreate(cpuLabel_, insNum, false); - cpuLabel_->copyFrom(*label); - if (supportWeight) { - Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); - cpuWeight_->copyFrom(*weight); - } - - output = cpuOutput_; - label = cpuLabel_; - weight = cpuWeight_; - } - calcStatsInfo(output, label, weight); - } else { - // Not support GPU for multi binary labels - CHECK(dynamic_cast(multiBinaryLabel.get())); - calcStatsInfoMulti(output, multiBinaryLabel, weight); - } - return 0; -} - -void PrecisionRecallEvaluator::printStats(std::ostream& os) const { - PrintStatsInfo info; - bool containMacroMicroInfo = getStatsInfo(&info); - os << "positive_label=" << config_.positive_label() - << " precision=" << info.precision << " recall=" << info.recall - << " F1-score=" << info.f1; - if (containMacroMicroInfo) { - os << "macro-average-precision=" << info.macroAvgPrecision - << " macro-average-recall=" << info.macroAvgRecall - << " macro-average-F1-score=" << info.macroAvgF1Score; - if (!isMultiBinaryLabel_) { - // precision and recall are equal in this case - os << " micro-average-precision=" << info.microAvgPrecision; - } else { - os << " micro-average-precision=" << info.microAvgPrecision - << " micro-average-recall=" << info.microAvgRecall - << " micro-average-F1-score=" << info.microAvgF1Score; - } - } -} - -void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output, - const IVectorPtr& label, - const MatrixPtr& weight) { - size_t insNum = output->getHeight(); - size_t dim = output->getWidth(); - real* outputD = output->getData(); - int* labelD = label->getData(); - real* weightD = (weight != nullptr) ? weight->getData() : nullptr; - for (size_t i = 0; i < insNum; ++i) { - CHECK_GE(labelD[i], 0); - CHECK_LT((size_t)labelD[i], dim); - size_t maxIdx = 0; - real maxValue = outputD[i * dim]; - for (size_t j = 1; j < dim; ++j) { - size_t idx = i * dim + j; - if (maxValue < outputD[idx]) { - maxIdx = j; - maxValue = outputD[idx]; - } - } - - real w = (weightD != nullptr) ? weightD[i] : 1.0; - if (maxIdx == (size_t)labelD[i]) { - statsInfo_[maxIdx].TP += w; // true positive for labelD[i] - // true negative for all labels except for labelD[i] - for (size_t j = 0; j < dim; ++j) { - statsInfo_[j].TN += w; - } - statsInfo_[maxIdx].TN -= w; - } else { - statsInfo_[labelD[i]].FN += w; // false negative for labelD[i] - statsInfo_[maxIdx].FP += w; // false positive for maxIdx - // true negatives for all labels except for maxIdx and labelD[i] - for (size_t j = 0; j < dim; ++j) { - statsInfo_[j].TN += w; - } - statsInfo_[maxIdx].TN -= w; - statsInfo_[labelD[i]].TN -= w; - } - } -} - -void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output, - const MatrixPtr& label, - const MatrixPtr& weight) { - size_t insNum = output->getHeight(); - size_t dim = output->getWidth(); - real* outputD = output->getData(); - auto labelD = dynamic_cast(label.get()); - real* weightD = (weight != nullptr) ? weight->getData() : nullptr; - real threshold = config_.classification_threshold(); - for (size_t i = 0; i < insNum; ++i) { - for (size_t j = 0; j < dim; ++j) { - real w = (weightD != nullptr) ? 
weightD[i] : 1.0; - size_t idx = i * dim + j; - if (outputD[idx] < threshold) { - statsInfo_[j].TN += w; // true negative - } else { - statsInfo_[j].FP += w; // false positive - } - } - - const int* cols = labelD->getRowCols(i); - for (size_t j = 0; j < labelD->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - real w = (weightD != nullptr) ? weightD[i] : 1.0; - size_t idx = i * dim + cols[j]; - if (outputD[idx] < threshold) { - statsInfo_[cols[j]].FN += w; // false negative - statsInfo_[cols[j]].TN -= w; // true negative - } else { - statsInfo_[cols[j]].TP += w; // true positive - statsInfo_[cols[j]].FP -= w; // false positive - } - } - } -} - -void PrecisionRecallEvaluator::storeLocalValues() const { - if (this->values_.size() == 0) { - PrintStatsInfo info; - bool containMacroMicroInfo = getStatsInfo(&info); - values_["precision"] = info.precision; - values_["recal"] = info.recall; - values_["F1-score"] = info.f1; - if (containMacroMicroInfo) { - values_["macro-average-precision"] = info.macroAvgPrecision; - values_["macro-average-recall"] = info.macroAvgRecall; - values_["macro-average-F1-score"] = info.macroAvgF1Score; - if (!isMultiBinaryLabel_) { - // precision and recall are equal in this case - values_["micro-average-precision"] = info.microAvgPrecision; - } else { - values_["micro-average-precision"] = info.microAvgPrecision; - values_["micro-average-recall"] = info.microAvgRecall; - values_["micro-average-F1-score"] = info.microAvgF1Score; - } - } - } -} - -void PrecisionRecallEvaluator::getNames(std::vector* names) { - this->storeLocalValues(); - names->reserve(this->values_.size()); - for (auto it = this->values_.begin(); it != this->values_.end(); ++it) { - names->push_back(this->config_.name() + "." + it->first); - } -} - -real PrecisionRecallEvaluator::getValue(const std::string& name, - Error* err) const { - this->storeLocalValues(); - std::vector buffers; - paddle::str::split(name, '.', &buffers); - auto it = this->values_.find(buffers[buffers.size() - 1]); - if (it == this->values_.end()) { // not found - *err = Error("No such key %s", name.c_str()); - return .0f; - } - - return it->second; -} - -std::string PrecisionRecallEvaluator::getType(const std::string& name, - Error* err) const { - this->getValue(name, err); - if (!err->isOK()) { - return ""; - } - return "precision_recall"; -} - -void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) { - size_t size = 4 * statsInfo_.size(); - double* buf = new double[size]; - for (size_t i = 0; i < statsInfo_.size(); ++i) { - buf[4 * i + 0] = statsInfo_[i].TP; - buf[4 * i + 1] = statsInfo_[i].TN; - buf[4 * i + 2] = statsInfo_[i].FP; - buf[4 * i + 3] = statsInfo_[i].FN; - } - client->reduce(buf, buf, size, FLAGS_trainer_id, 0); - for (size_t i = 0; i < statsInfo_.size(); ++i) { - statsInfo_[i].TP = buf[4 * i + 0]; - statsInfo_[i].TN = buf[4 * i + 1]; - statsInfo_[i].FP = buf[4 * i + 2]; - statsInfo_[i].FN = buf[4 * i + 3]; - } - delete[] buf; -} - -bool PrecisionRecallEvaluator::getStatsInfo( - PrecisionRecallEvaluator::PrintStatsInfo* info) const { - int label = config_.positive_label(); - if (label != -1) { - CHECK(label >= 0 && label < (int)statsInfo_.size()) - << "positive_label [" << label << "] should be in range [0, " - << statsInfo_.size() << ")"; - info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP); - info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN); - info->f1 = calcF1Score(info->precision, info->recall); - return false; - } - - // micro average method: 
precision = (TP1+TP2)/(TP1+FP1+TP2+FP2) - // macro average method: precision = (precision1+precision2)/2 - double microTotalTP = 0; - double microTotalFP = 0; - double microTotalFN = 0; - info->macroAvgPrecision = 0; - info->macroAvgRecall = 0; - size_t numLabels = statsInfo_.size(); - for (size_t i = 0; i < numLabels; ++i) { - microTotalTP += statsInfo_[i].TP; - microTotalFP += statsInfo_[i].FP; - microTotalFN += statsInfo_[i].FN; - info->macroAvgPrecision += - calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP); - info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN); - } - info->macroAvgPrecision /= numLabels; - info->macroAvgRecall /= numLabels; - info->macroAvgF1Score = - calcF1Score(info->macroAvgPrecision, info->macroAvgRecall); - - info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP); - info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN); - info->microAvgF1Score = - calcF1Score(info->microAvgPrecision, info->microAvgRecall); - return true; -} - -REGISTER_EVALUATOR(pnpair, PnpairEvaluator); -void PnpairEvaluator::start() { - Evaluator::start(); - memset(pairArray_, 0, sizeof(pairArray_)); - predictArray_.clear(); -} - -real PnpairEvaluator::evalImp(std::vector& arguments) { - CHECK_GE(arguments.size(), 3UL); - CHECK_LE(arguments.size(), 4UL); - MatrixPtr output = arguments[0].value; - IVectorPtr label = arguments[1].ids; - IVectorPtr info = arguments[2].ids; - bool supportWeight = (4 == arguments.size()) ? true : false; - MatrixPtr weight = supportWeight ? arguments[3].value : nullptr; - if (nullptr == output || nullptr == label || - (supportWeight && nullptr == weight)) { - return 0; - } - size_t height = output->getHeight(); - size_t width = output->getWidth(); - CHECK_EQ(height, label->getSize()); - CHECK_EQ(height, info->getSize()); - if (supportWeight) { - CHECK_EQ(height, weight->getHeight()); - CHECK_EQ((size_t)1, weight->getWidth()); - } - - if (dynamic_cast(output.get())) { - Matrix::resizeOrCreate(cpuOutput_, height, width, false, false); - IVector::resizeOrCreate(cpuLabel_, height, false); - IVector::resizeOrCreate(cpuInfo_, height, false); - cpuOutput_->copyFrom(*output); - cpuLabel_->copyFrom(*label); - cpuInfo_->copyFrom(*info); - - output = cpuOutput_; - label = cpuLabel_; - info = cpuInfo_; - - if (supportWeight) { - Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false); - cpuWeight_->copyFrom(*weight); - weight = cpuWeight_; - } - } - - real* outputs = output->getData(); - int* labels = label->getData(); - int* infos = info->getData(); - real* weights = supportWeight ? weight->getData() : nullptr; - for (size_t i = 0; i < output->getHeight(); i++) { - real y1 = outputs[i * width + (width - 1)]; - real w = supportWeight ? 
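The comment above is the whole story: macro averaging averages the per-label ratios, micro averaging pools the raw counts first and takes a single ratio. A toy comparison with invented counts:

    #include <cstdio>

    struct Stats { double TP, FP, FN; };

    int main() {
      Stats s[2] = {{8, 2, 4}, {1, 3, 1}};  // invented per-label counts
      // Macro average: compute the ratio per label, then average the ratios.
      double macroPrecision = 0;
      for (const auto& x : s) macroPrecision += x.TP / (x.TP + x.FP);
      macroPrecision /= 2;
      // Micro average: pool the raw counts first, then take a single ratio.
      double tp = s[0].TP + s[1].TP, fp = s[0].FP + s[1].FP;
      double microPrecision = tp / (tp + fp);
      std::printf("macro=%.3f micro=%.3f\n", macroPrecision, microPrecision);
      // macro = (0.800 + 0.250) / 2 = 0.525, micro = 9 / 13 = 0.692
      return 0;
    }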
weights[i] : 1.0; - predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w)); - } - return 0; -} - -void PnpairEvaluator::stat(size_t start, - size_t end, - PredictionResult* answers, - double& pos, - double& neg, - double& spe) { - for (size_t i = start; i < end; i++) { - for (size_t j = i + 1; j < end; j++) { - CHECK_EQ(answers[i].queryid, answers[j].queryid); - // The pair weight is the mean of the two samples' weight - double weight = (answers[i].weight + answers[j].weight) / 2.0; - if (answers[i].label != answers[j].label) { - if ((answers[i].out > answers[j].out && - answers[i].label > answers[j].label) || - (answers[i].out < answers[j].out && - answers[i].label < answers[j].label)) { - pos += weight; - } else if ((answers[i].out > answers[j].out && - answers[i].label < answers[j].label) || - (answers[i].out < answers[j].out && - answers[i].label > answers[j].label)) { - neg += weight; - } else { - spe += weight; - } - } - } - } -} - -void PnpairEvaluator::calc(std::vector& predictArray) { - std::sort(predictArray.begin(), - predictArray.end(), - [](const PredictionResult& x, const PredictionResult& y) { - return x.queryid < y.queryid; - }); - - double pos = 0; - double neg = 0; - double special = 0; - auto start = predictArray.begin(); - while (start != predictArray.end()) { - auto end = std::find_if( - start + 1, predictArray.end(), [=](const PredictionResult& x) { - return x.queryid != start->queryid; - }); - CHECK(end != start); - stat(start - predictArray.begin(), - end - predictArray.begin(), - predictArray.data(), - pos, - neg, - special); - - start = end; - } - - pairArray_[0] += pos; - pairArray_[1] += neg; - - LOG(INFO) << " calc total pos pair: " << pos - << " calc total neg pair: " << neg - << " calc total special pair: " << special; -} - -std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; } - -ClassRegistrar Evaluator::registrar_; -Evaluator* Evaluator::create(const EvaluatorConfig& config) { - Evaluator* evaluator = registrar_.createByType(config.type()); - evaluator->init(config); - return evaluator; -} - -REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator); -REGISTER_EVALUATOR(sum, SumEvaluator); -static InitFunction __reg_type_auc_sum__([]() { - Evaluator::registrar_.registerClass( - "last-column-sum", [] { return new ColumnSumEvaluator(-1); }); - Evaluator::registrar_.registerClass("last-column-auc", - [] { return new AucEvaluator(-1); }); -}); - -/** - * @brief print value of each layer. - * - * The config file api is value_printer_evaluator. - */ -class ValuePrinter : public NotGetableEvaluator { - public: - virtual void eval(const NeuralNetwork& nn) { - for (const std::string& name : config_.input_layers()) { - nn.getLayer(name)->getOutput().printValueString(LOG(INFO), - "layer=" + name + " "); - } - } - - virtual void updateSamplesNum(const std::vector& arguments) {} - - virtual real evalImp(std::vector& arguments) { return 0; } -}; -REGISTER_EVALUATOR(value_printer, ValuePrinter); - -/** - * @brief print gradient of each layer. - * - * The config file api is gradient_printer_evaluator. 
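PnpairEvaluator::stat above counts, within each query, how many label-discordant pairs the model orders consistently with the labels versus inconsistently, with equal scores falling into the "special" bucket. An abridged, unweighted sketch with hypothetical field names:

    #include <vector>

    struct Prediction { double score; int label; int queryId; };  // hypothetical

    // Count concordant (pos) and discordant (neg) pairs inside one query group.
    void countPairs(const std::vector<Prediction>& group, double* pos, double* neg) {
      for (size_t i = 0; i < group.size(); ++i) {
        for (size_t j = i + 1; j < group.size(); ++j) {
          if (group[i].label == group[j].label) continue;  // equal labels carry no signal
          if (group[i].score == group[j].score) continue;  // the "special" bucket above
          bool sameOrder = (group[i].score > group[j].score) ==
                           (group[i].label > group[j].label);
          if (sameOrder) *pos += 1; else *neg += 1;
        }
      }
    }

These are the totals that the deleted code accumulates into pairArray_[0] and pairArray_[1].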
- */ -class GradientPrinter : public NotGetableEvaluator { - public: - virtual void eval(const NeuralNetwork& nn) { - for (const std::string& name : config_.input_layers()) { - const Argument& argu = nn.getLayer(name)->getOutput(); - if (argu.grad) { - std::ostringstream os; - argu.grad->print(os); - LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str(); - } - } - } - - virtual void updateSamplesNum(const std::vector& arguments) {} - - virtual real evalImp(std::vector& arguments) { return 0; } -}; -REGISTER_EVALUATOR(gradient_printer, GradientPrinter); -/** - * @brief print row max id vctor of each layer - * - * The config file api is maxid_printer_evaluator. - */ -class MaxIdPrinter : public NotGetableEvaluator { - private: - IVectorPtr maxIds_; - MatrixPtr maxValues_; - - public: - MaxIdPrinter() {} - - virtual void eval(const NeuralNetwork& nn) { - for (const std::string& name : config_.input_layers()) { - const Argument& argu = nn.getLayer(name)->getOutput(); - if (argu.value) { - size_t height = argu.value->getHeight(); - size_t width = config_.num_results(); - IVector::resizeOrCreate(maxIds_, height * width, false); - Matrix::resizeOrCreate(maxValues_, height, width, false); - argu.value->rowMax(*maxIds_, *maxValues_); - std::ostringstream os; - int* ids = maxIds_->getData(); - real* values = maxValues_->getData(); - for (size_t i = 0; i < height; ++i) { - for (size_t j = 0; j < width; ++j) { - size_t pos = i * width + j; - os << ids[pos] << " : " << values[pos] << ", "; - } - os << std::endl; - } - LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str(); - } - } - } - - virtual void updateSamplesNum(const std::vector& arguments) {} - - virtual real evalImp(std::vector& arguments) { return 0; } -}; -REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter); -/** - * @brief print sequence max frames of each layer - * - * The config file api is maxframe_printer_evaluator. - */ -class MaxFramePrinter : public NotGetableEvaluator { - private: - IVectorPtr maxIds_; - MatrixPtr maxValues_; - MatrixPtr value_; - - public: - MaxFramePrinter() { - value_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false); - } - - virtual void eval(const NeuralNetwork& nn) { - for (const std::string& name : config_.input_layers()) { - const Argument& argu = nn.getLayer(name)->getOutput(); - - CHECK_EQ(argu.value->getWidth(), 1LU); - size_t numSequences = argu.getNumSequences(); - const int* starts = argu.sequenceStartPositions->getData(false); - - std::ostringstream os; - for (size_t i = 0; i < numSequences; ++i) { - size_t offset = starts[i]; - size_t size = starts[i + 1] - starts[i]; - value_->setData(argu.value->getData() + offset, 1LU, size); - - size_t height = 1LU; - size_t width = std::min((size_t)config_.num_results(), size); - IVector::resizeOrCreate(maxIds_, height * width, false); - Matrix::resizeOrCreate(maxValues_, height, width, false); - - value_->rowMax(*maxIds_, *maxValues_); - - int* ids = maxIds_->getData(); - real* values = maxValues_->getData(); - for (size_t j = 0; j < width; ++j) { - os << ids[j] << " : " << values[j] << ", "; - } - os << "total " << size << " frames" << std::endl; - } - LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str(); - } - } - - virtual void updateSamplesNum(const std::vector& arguments) {} - - virtual real evalImp(std::vector& arguments) { return 0; } -}; -REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter); - -/** - * @brief print text according to index matrix and a dictionary. 
- * - * There can be multiple input to this layer: - * - If there is only one input, the input must be a matrix containing - * the sequence of indices; - * - If there are more than one input, the first input should be ids, - * and are interpreted as sample ids. - * - * The output format will be: - * - * - sequence without sub-sequence, and there is probability. - * - * @code - * id \t prob space_seperated_tokens_from_dictionary_according_to_seq - * @endcode - * - * - sequence without sub-sequence, and there is not probability. - * - * @code - * id \t space_seperated_tokens_from_dictionary_according_to_seq - * @endcode - * - * - sequence with sub-sequence, and there is not probability. - * - * @code - * id \t space_seperated_tokens_from_dictionary_according_to_sub_seq - * \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq - * ... - * @endcode - * - * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup - * with maxid (when generating) as an input. - * - * The config file api is seqtext_printer_evaluator. - * - */ -class SequenceTextPrinter : public NotGetableEvaluator { - private: - /// dict_file, which contains a list of tokens - std::vector dict_; - /// result_file, which is the output file - std::ofstream os_; - /// True/False, to indicate whether to use space to separate output tokens. - /// Default is True. No space is added if set to False. - bool delimited_; - /// store the cpu version of argument.ids - std::vector cpuIds_; - /// store the probability associated with each sequence - std::vector cpuIn_; - - public: - SequenceTextPrinter() {} - - virtual void init(const EvaluatorConfig& config) { - Evaluator::init(config); - if (!config.dict_file().empty()) { - loadFileList(config.dict_file(), dict_); - } - - os_.open(config.result_file(), std::ofstream::trunc); - CHECK(os_.is_open()) << "Failed to open file " << config.result_file(); - delimited_ = config.delimited(); - } - - virtual void updateSamplesNum(const std::vector& arguments) {} - - virtual real evalImp(std::vector& arguments) { - CHECK_GE(arguments.size(), 1LU); - bool hasId = arguments.size() > 1; - size_t numSequences = arguments[0].getNumSequences(); - if (hasId) { - CHECK_EQ(arguments[0].ids->getSize(), numSequences) - << "first input must be sample id."; - } - for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) { - CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences); - } - - auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) { - if (src && src->useGpu()) { - IVector::resizeOrCreate(dest, src->getSize(), false); - dest->copyFrom(*src); - } else { - dest = src; - } - }; - - auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) { - if (src && src->useGpu()) { - Matrix::resizeOrCreate( - dest, src->getHeight(), src->getWidth(), false, false); - dest->copyFrom(*src); - } else { - dest = src; - } - }; - - cpuIds_.resize(arguments.size()); - cpuIn_.resize(arguments.size()); - for (size_t i = 0; i < arguments.size(); ++i) { - resizeVector(cpuIds_[i], arguments[i].ids); - resizeMatrix(cpuIn_[i], arguments[i].in); - } - - int* sampleIds = nullptr; - if (hasId) { - sampleIds = cpuIds_[0]->getData(); - } - - for (size_t i = 0; i < numSequences; ++i) { - os_ << (hasId ? sampleIds[i] : i); - for (size_t j = hasId ? 
1 : 0; j < arguments.size(); ++j) { - int* output = cpuIds_[j]->getData(); - const int* starts = arguments[j].sequenceStartPositions->getData(false); - - auto seqPrint = [&](int start, int end) { - os_ << "\t"; - for (int k = start; k < end; k++) { - int id = output[k]; - os_ << (delimited_ ? " " : ""); - if (!dict_.empty()) { - CHECK_LT((size_t)id, dict_.size()); - os_ << dict_[id]; - } else { - os_ << id; - } - } - }; - - if (arguments[j].hasSubseq()) { - // print sequence with sub-sequence - const int* subStarts = - arguments[j].subSequenceStartPositions->getData(false); - int subSeqId_start = 0; - int subSeqId_end = 0; - for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1; - ++k) { - if (starts[i] == subStarts[k]) subSeqId_start = k; - if (starts[i + 1] == subStarts[k]) subSeqId_end = k; - } - for (int k = subSeqId_start; k < subSeqId_end; k++) { - seqPrint(subStarts[k], subStarts[k + 1]); - os_ << std::endl; - } - - } else { - // print sequence without sub-sequence - if (arguments[j].in) { // beam print - real* probs = cpuIn_[j]->rowBuf(i); - os_ << std::endl; - int start = starts[i]; - int seqEnd = starts[i + 1]; - for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) { - if (start == seqEnd) { - break; - } - int end = start + output[start] + 2; - CHECK_LE(end, seqEnd); - CHECK_EQ(output[end - 1], -1); - os_ << k << "\t" << probs[k]; - seqPrint(start + 1, end - 1); - os_ << std::endl; - start = end; - } - } else { - seqPrint(starts[i], starts[i + 1]); - } - } - } - os_ << std::endl; - } - return 0; - } -}; -REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter); -/** - * @brief print classification error. - * - * The config file api is classification_error_printer_evaluator. - */ -class ClassificationErrorPrinter : public ClassificationErrorEvaluator { - public: - virtual void updateSamplesNum(const std::vector& arguments) {} - - virtual real evalImp(std::vector& arguments) { - MatrixPtr errorMat = calcError(arguments); - - std::ostringstream os; - errorMat->print(os); - LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n" - << os.str(); - - if (auto startPos = arguments[0].sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n" - << os.str(); - } - return 0; - } -}; -REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter); - -std::string DummyEvaluator::getTypeImpl() const { return "dummy"; } - -} // namespace paddle diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h deleted file mode 100644 index 42948f1097d9a12600f4b11646a47e45b9bf4e96..0000000000000000000000000000000000000000 --- a/paddle/gserver/evaluators/Evaluator.h +++ /dev/null @@ -1,510 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "ModelConfig.pb.h" -#include "paddle/parameter/Argument.h" -#include "paddle/pserver/ParameterClient2.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Error.h" - -namespace paddle { - -class NeuralNetwork; -/** - * @def REGISTER_EVALUATOR - * @brief Macro for registering evaluator class - */ - -#define REGISTER_EVALUATOR(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - Evaluator::registrar_.registerClass<__class_name>(#__type_name); \ - }) -/** - * @brief Base class for Evaluator - * Evaluating the performance of a model is very important. - * It indicates how successful the scores(predictions) of a datasets - * has been by a trained model. - */ -class Evaluator { - public: - static Evaluator* create(const EvaluatorConfig& config); - - Evaluator() : numSamples_(0), totalScore_(0) {} - - virtual ~Evaluator() {} - - virtual void init(const EvaluatorConfig& config) { config_ = config; } - - /** - * @brief start to evaluate some data - */ - virtual void start() { - numSamples_ = 0; - totalScore_ = 0; - } - - /** - * @brief Process a batch of data. - */ - virtual void eval(const NeuralNetwork& nn); - - /** - * @brief Process a batch of data. - * @return the score for the batch if it make sense to sum the score across - * batches. - * @note Otherwise evaluator should return 0 and override finish() and - * printStats() to do the right calculation. - */ - virtual real evalImp(std::vector& arguments) = 0; - - /** - * @brief Update the number of processed samples - */ - virtual void updateSamplesNum(const std::vector& arguments) { - numSamples_ += arguments[0].getBatchSize(); - } - - /// finish() should be called before distributeEval - virtual void distributeEval(ParameterClient2* client) { - LOG(FATAL) << "Not implemeted"; - } - - void mergeResultsOfAllClients(ParameterClient2* client) { - double data[2] = {totalScore_, numSamples_}; - client->reduce(data, data, 2, FLAGS_trainer_id, 0); - totalScore_ = data[0]; - numSamples_ = data[1]; - } - - /** - * @brief finish the evaluation. - */ - virtual void finish() {} - - /** - * @brief print the statistics of evaluate result - * @note finish() should be called before printStats - */ - virtual void printStats(std::ostream& os) const { - os << config_.name() << "=" - << (numSamples_ ? totalScore_ / numSamples_ : 0); - } - - friend std::ostream& operator<<(std::ostream& os, - const Evaluator& evaluator) { - evaluator.printStats(os); - return os; - } - - friend std::ostream&& operator<<(std::ostream&& os, // NOLINT - const Evaluator& evaluator) { - evaluator.printStats(os); - return std::move(os); - } - - static ClassRegistrar registrar_; - - /** - * @brief getNames will return all field names of current evaluator. - * - * The format of name is `evaluator_name.evaluator_fields`. If the evaluator - * has multiple field, the name could be `evaluator_name.field1`. For example - * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get - * names will return `precision_recall_evaluator.precision`, - * `precision_recall_evaluator.recal`, etc. - * - * Also, if current Evaluator is a combined evaluator. getNames will return - * all names of all evaluators inside the combined evaluator. - * - * @param names [out]: the field names of current evaluator. - * @note Never clear the names parameter inside getNames. 
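The accumulation contract described above (start() clears numSamples_ and totalScore_, each batch adds a score through evalImp() and updateSamplesNum(), and printStats() reports totalScore_ / numSamples_) can be illustrated with a small self-contained sketch. The ToyAccuracyEvaluator below is hypothetical and only mirrors that lifecycle; it is not the real Evaluator interface.

@code
// Minimal standalone sketch (not the real Evaluator API): each batch
// contributes a score and a sample count, and the reported metric is
// totalScore / numSamples.
#include <iostream>
#include <vector>

struct ToyAccuracyEvaluator {
  double numSamples = 0;
  double totalScore = 0;

  void start() { numSamples = 0; totalScore = 0; }

  // evalImp analogue: returns the batch score (number of correct predictions).
  double evalBatch(const std::vector<int>& predictions,
                   const std::vector<int>& labels) {
    double correct = 0;
    for (size_t i = 0; i < predictions.size(); ++i) {
      if (predictions[i] == labels[i]) ++correct;
    }
    numSamples += predictions.size();  // updateSamplesNum analogue
    totalScore += correct;
    return correct;
  }

  void printStats(std::ostream& os) const {
    os << "accuracy=" << (numSamples ? totalScore / numSamples : 0);
  }
};

int main() {
  ToyAccuracyEvaluator eval;
  eval.start();
  eval.evalBatch({1, 0, 1}, {1, 1, 1});
  eval.evalBatch({0, 0}, {0, 1});
  eval.printStats(std::cout);  // accuracy=0.6
  std::cout << "\n";
}
@endcode

The distributed case follows the same idea: mergeResultsOfAllClients() simply sums the {totalScore, numSamples} pair across trainers before the ratio is taken.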
- */ - virtual void getNames(std::vector* names) { - names->push_back(config_.name()); - } - - /** - * @brief getValue will return the current evaluate value of one field. - * - * @param name: The field name of current evaluator. - * @param err [out]: The error state. - * - * @return The evaluate value(metric). - */ - virtual real getValue(const std::string& name, Error* err) const { - if (name != config_.name()) { - *err = Error("no such name of evaluator %s", name.c_str()); - return .0f; - } - return this->getValueImpl(); - } - - /** - * @brief getType will return the evaluator type by field name. - * - * Evaluate Type is the current type of evaluator in string. Such as 'auc', - * 'precision_recall'. In combined evaluator, different name may get different - * evaluate type because it could be evaluated by different evaluator inside. - * - * @param name: The field name of current Evaluator. - * @param err: The error state. nullptr means don't care. - * @return the evaluator type string. - */ - virtual std::string getType(const std::string& name, Error* err) const { - if (name != config_.name()) { - *err = Error("no such name of evaluator %s", name.c_str()); - return std::string(); - } - return this->getTypeImpl(); - } - - protected: - /** - * @brief getValueImpl The simplest way to define getValue result. If this - * evaluator doesn't contain multiple fields, and do not throw any error, just - * implemented this method to get the evaluate result(metric). - * @return Evaluate result(metric). - */ - virtual real getValueImpl() const { - return numSamples_ != .0 ? totalScore_ / numSamples_ : .0; - } - - /** - * @brief getTypeImpl The simplest way to define getType result. If this - * evaluator doesn't combine many evaluators, the get type should only return - * itself type. - * @return Evaluator type. - */ - virtual std::string getTypeImpl() const { return "base"; } - - protected: - EvaluatorConfig config_; - double numSamples_; - double totalScore_; -}; - -/** - * @brief The NotGetableEvaluator class is the base class of evaluator that - * cannot get value in runtime. The most NotGetableEvaluator is Printer - * Evaluator, which is only used to debug network configuration. - */ -class NotGetableEvaluator : public Evaluator { - // Evaluator interface - public: - void getNames(std::vector* names) {} - - real getValue(const std::string& name, Error* err) const { - *err = Error("Not implemented"); - return .0f; - } - - std::string getType(const std::string& name, Error* err) const { - *err = Error("Not implemented"); - return ""; - } -}; - -class DummyEvaluator : public Evaluator { - public: - DummyEvaluator() {} - virtual void init(const EvaluatorConfig&) {} - virtual void start() {} - virtual void eval(const NeuralNetwork&) {} - virtual real evalImp(std::vector& arguments) { - (void)arguments; - return -1; - } - virtual void finish() {} - virtual void printStats(std::ostream&) const {} - - // Evaluator interface - protected: - std::string getTypeImpl() const; -}; -/** - * @brief evaluate AUC using colIdx-th column as prediction. - * The AUC(Area Under the Curve) is a common evaluation metric - * for binary classification problems. It computes the area under - * the receiver operating characteristic(ROC) curve. - * - * @note colIdx-th column - * - * - colIdx = 0: the 0-th column. - * - colIdx > 0: the colIdx-th column. - * - colIdx < 0: the last colIdx-th column. - * - * The config file api is auc_evaluator. 
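For illustration, one standard way to compute the area under the ROC curve from score histograms is the trapezoid rule, which is the flavour of computation the AucEvaluator below sets up with its statPos_/statNeg_ bins and trapezoidArea() helper. The bin counts in this self-contained sketch are invented, and it is not the actual calcAuc() implementation (that lives in the corresponding .cpp file).

@code
// AUC from positive/negative score histograms via the trapezoid rule.
#include <cstdio>
#include <vector>

static double trapezoidArea(double x1, double x2, double y1, double y2) {
  return (x1 > x2 ? x1 - x2 : x2 - x1) * (y1 + y2) / 2.0;
}

int main() {
  // statPos[b] / statNeg[b]: number of positive / negative samples whose
  // predicted score fell into bin b (higher bin index = higher score).
  std::vector<double> statPos = {1, 2, 5, 12};
  std::vector<double> statNeg = {10, 6, 3, 1};

  double totPos = 0, totNeg = 0;
  for (size_t b = 0; b < statPos.size(); ++b) {
    totPos += statPos[b];
    totNeg += statNeg[b];
  }

  // Walk thresholds from high to low; (x, y) = cumulative (FP, TP) counts.
  double x = 0, y = 0, area = 0;
  for (size_t b = statPos.size(); b-- > 0;) {
    double newX = x + statNeg[b];  // false positives at this threshold
    double newY = y + statPos[b];  // true positives at this threshold
    area += trapezoidArea(newX, x, newY, y);
    x = newX;
    y = newY;
  }
  std::printf("AUC = %.4f\n", area / (totPos * totNeg));  // about 0.88 here
}
@endcode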
- * - */ -class AucEvaluator : public Evaluator { - public: - AucEvaluator(int32_t colIdx) - : colIdx_(colIdx), - realColumnIdx_(0), - cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuWeight_(nullptr) {} - - virtual void start(); - - virtual real evalImp(std::vector& arguments); - - virtual void printStats(std::ostream& os) const { - os << config_.name() << "=" << calcAuc(); - } - - virtual void distributeEval(ParameterClient2* client); - - private: - static const uint32_t kBinNum_ = (1 << 24) - 1; - static const int kNegativeLabel_ = 0; - double statPos_[kBinNum_ + 1]; - double statNeg_[kBinNum_ + 1]; - int32_t colIdx_; - uint32_t realColumnIdx_; - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; - - AucEvaluator() {} - - inline static double trapezoidArea(double X1, - double X2, - double Y1, - double Y2) { - return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - double calcAuc() const; - - // Evaluator interface - protected: - real getValueImpl() const; - std::string getTypeImpl() const; -}; - -/** - * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles - * under the same query), and averages them. Each list should be organized - * as a sequence. The inputs of this evaluator are [output, click, pv]. If pv - * is not provided, it will be set to 1. The types of click and pv are - * dense values. - */ -class RankAucEvaluator : public Evaluator { - public: - // evaluate ranking AUC - virtual void start(); - - virtual void updateSamplesNum(const std::vector& arguments); - - virtual real evalImp(std::vector& arguments); - - virtual void distributeEval(ParameterClient2* client) { - mergeResultsOfAllClients(client); - } - - private: - MatrixPtr output_; - MatrixPtr click_; - MatrixPtr pv_; - std::vector> outputPair_; - - double calcRankAuc(real* outputData, - real* clickData, - real* pvData, - size_t size); - - // Evaluator interface - protected: - std::string getTypeImpl() const; -}; - -/** - * @brief precision, recall and f1 score Evaluator - * \f[ - * precision = \frac{tp}{tp+fp} \\ - * recall=\frac{tp}{tp+fn} \\ - * f1=2*\frac{precision*recall}{precision+recall} - * \f] - * - * The config file api is precision_recall_evaluator.
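As a quick sanity check on the formulas above, the sketch below evaluates them for made-up counts, using the same degenerate-case conventions as the calcPrecision/calcRecall/calcF1Score helpers declared in the class that follows (precision and recall fall back to 1.0 when their denominator would be zero, F1 to 0).

@code
// Self-contained precision / recall / F1 computation on hypothetical counts.
#include <cstdio>

static double precision(double TP, double FP) {
  return (TP > 0.0 || FP > 0.0) ? TP / (TP + FP) : 1.0;
}
static double recall(double TP, double FN) {
  return (TP > 0.0 || FN > 0.0) ? TP / (TP + FN) : 1.0;
}
static double f1(double p, double r) {
  return (p > 0.0 || r > 0.0) ? 2.0 * p * r / (p + r) : 0.0;
}

int main() {
  // Hypothetical counts: 8 true positives, 2 false positives, 4 false negatives.
  double p = precision(8, 2);  // 0.8
  double r = recall(8, 4);     // about 0.667
  std::printf("precision=%.3f recall=%.3f f1=%.3f\n", p, r, f1(p, r));  // f1 about 0.727
}
@endcode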
- */ -class PrecisionRecallEvaluator : public Evaluator { - public: - // Evaluate precision, recall and F1 score - PrecisionRecallEvaluator() - : isMultiBinaryLabel_(false), - cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuWeight_(nullptr) {} - - virtual void start(); - - virtual real evalImp(std::vector& arguments); - - virtual void printStats(std::ostream& os) const; - - virtual void distributeEval(ParameterClient2* client); - - void getNames(std::vector* names); - - real getValue(const std::string& name, Error* err) const; - - std::string getType(const std::string& name, Error* err) const; - - struct StatsInfo { - /// numbers of true positives - double TP; - /// numbers of true negatives - double TN; - /// numbers of false positives - double FP; - /// numbers of false negatives - double FN; - - StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {} - }; - - private: - bool isMultiBinaryLabel_; - std::vector statsInfo_; - - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; - - struct PrintStatsInfo { - double precision; - double recall; - double f1; - double macroAvgPrecision; - double macroAvgRecall; - double macroAvgF1Score; - double microAvgPrecision; - double microAvgRecall; - double microAvgF1Score; - }; - - bool getStatsInfo(PrintStatsInfo* info) const; - - void calcStatsInfo(const MatrixPtr& output, - const IVectorPtr& label, - const MatrixPtr& weight); - - void calcStatsInfoMulti(const MatrixPtr& output, - const MatrixPtr& label, - const MatrixPtr& weight); - - inline static double calcPrecision(double TP, double FP) { - if (TP > 0.0 || FP > 0.0) { - return TP / (TP + FP); - } else { - return 1.0; - } - } - - inline static double calcRecall(double TP, double FN) { - if (TP > 0.0 || FN > 0.0) { - return TP / (TP + FN); - } else { - return 1.0; - } - } - - inline static double calcF1Score(double precision, double recall) { - if (precision > 0.0 || recall > 0.0) { - return 2 * precision * recall / (precision + recall); - } else { - return 0; - } - } - - mutable std::unordered_map values_; - - void storeLocalValues() const; -}; - -/* - * @brief positive-negative pair rate Evaluator - * - * The config file api is pnpair_evaluator. 
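The PnpairEvaluator below reports a positive-to-negative pair ratio (see getValueImpl() at the end of the class). Exactly how pairs are grouped by query id and weighted is defined by its implementation, but the basic pair-counting idea can be sketched as follows; the data, the Sample struct and the single-query, unweighted simplification are all hypothetical.

@code
// Simplified pair counting: for every pair with different labels, the pair
// counts as "positive" when the higher-labelled sample also got the higher
// score, otherwise as "negative".
#include <cstdio>
#include <vector>

struct Sample { double score; int label; };

int main() {
  std::vector<Sample> samples = {{0.9, 1}, {0.7, 0}, {0.6, 1}, {0.2, 0}};

  double pos = 0, neg = 0;
  for (size_t i = 0; i < samples.size(); ++i) {
    for (size_t j = i + 1; j < samples.size(); ++j) {
      if (samples[i].label == samples[j].label) continue;
      const Sample& hi = samples[i].label > samples[j].label ? samples[i] : samples[j];
      const Sample& lo = samples[i].label > samples[j].label ? samples[j] : samples[i];
      (hi.score > lo.score ? pos : neg) += 1.0;
    }
  }
  std::printf("pos=%.0f neg=%.0f pos/neg=%.2f\n", pos, neg, neg > 0 ? pos / neg : pos);
}
@endcode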
- */ -class PnpairEvaluator : public Evaluator { - public: - PnpairEvaluator() - : cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuInfo_(nullptr), - cpuWeight_(nullptr) {} - - virtual void start(); - virtual real evalImp(std::vector& arguments); - - struct PredictionResult { - PredictionResult(real __out, int __label, int __queryid, real __weight) - : out(__out), label(__label), queryid(__queryid), weight(__weight) {} - real out; - int label; - int queryid; - real weight; - }; - std::vector predictArray_; - void printPredictResults() { - std::ofstream fs(FLAGS_predict_file); - CHECK(fs) << "Fail to open " << FLAGS_predict_file; - for (auto& res : predictArray_) { - fs << res.out << " " << res.label << " " << res.queryid << std::endl; - } - } - - void stat(size_t start, - size_t end, - PredictionResult* answers, - double& pos, - double& neg, - double& spe); - void calc(std::vector& predictArray); - - virtual void finish() { calc(predictArray_); } - - virtual void printStats(std::ostream& os) const { - os << " pos/neg=" << this->getValueImpl(); - } - - virtual void distributeEval(ParameterClient2* client) { - client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0); - LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0] - << " calc total neg pair: " << pairArray_[1]; - } - - private: - static const uint32_t kPairArrayNum_ = 2; - double pairArray_[kPairArrayNum_]; - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - IVectorPtr cpuInfo_; - MatrixPtr cpuWeight_; - - // Evaluator interface - protected: - real getValueImpl() const { - return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]); - } - std::string getTypeImpl() const; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp deleted file mode 100644 index 654024e8a47c1e538f25823da78dce6a7a093975..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/GradientMachine.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "GradientMachine.h" - -#include -#include "paddle/utils/Logging.h" - -#include "NeuralNetwork.h" -#include "hl_gpu.h" - -#ifndef PADDLE_MOBILE_INFERENCE -#include "GradientMachineMode.h" -#include "MultiGradientMachine.h" -#include "MultiNetwork.h" -#include "ParallelNeuralNetwork.h" -#endif - -namespace paddle { - -GradientMachine* GradientMachine::create( - const ModelConfig& config, - int mode, - const std::vector& parameterTypes) { -#ifndef PADDLE_MOBILE_INFERENCE - if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) { - return gm; - } - if (FLAGS_trainer_count > 1) { - return new MultiGradientMachine(config, FLAGS_use_gpu); - } -#endif - if (FLAGS_trainer_count == 1) { // single -#ifndef PADDLE_MOBILE_INFERENCE - NeuralNetwork* nn; - if (config.type() == "multi_nn") { - /* multi submodel calculate, thread(s) will be initialized inside */ - nn = new MultiNetwork("root"); - } else if (FLAGS_parallel_nn) { - /* multi threads calculate */ - nn = new ParallelNeuralNetwork(); - } else { - /* single thread calculate */ - nn = NeuralNetwork::create(config); - } -#else - NeuralNetwork* nn = NeuralNetwork::create(config); -#endif - ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) { - para->enableType(PARAMETER_VALUE); - }; - nn->init( - config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes); - return nn; - } - LOG(FATAL) << "Unknown model type: " << config.type(); - return nullptr; -} - -void GradientMachine::saveParameters(const std::string& dir) const { - LOG(INFO) << "Saving parameters to " << dir; - - for (auto& para : parameters_) { - std::string filename = dir + "/" + para->getName(); - if (para->isFullSize()) { - para->save(filename); - } - } -} - -void GradientMachine::loadParameters(const std::string& dir) { - LOG(INFO) << "Loading parameters from " << dir; - - for (auto& para : parameters_) { - std::string filename = dir + "/" + para->getName(); - if (para->isFullSize()) { - para->load(filename); - } - } -} - -void GradientMachine::randParameters() { - LOG(INFO) << "Initing parameters.."; - - for (auto& para : parameters_) { - if (para->isFullSize()) { - para->randomize(); - } - } - LOG(INFO) << "Init parameters done."; -} - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h deleted file mode 100644 index 22cf5d265f429ecbcea1808a54c85d7e89f8bc99..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "ModelConfig.pb.h" -#include "TrainerConfig.pb.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/ParameterUpdaterBase.h" -#include "paddle/utils/Thread.h" - -#ifndef PADDLE_MOBILE_INFERENCE -#include "paddle/gserver/evaluators/Evaluator.h" -#endif - -namespace paddle { -/** - * @brief A gradient machine is capable of calculating some outputs given - * some inputs and performing gradient calculation based on the - * derivative from the outputs. - * - * A gradient machine can be either a full neural network or part of a neural - * network. - * - * Usage for training: - * - * 1. Prepare inArgs. Put your input data into inArgs[i].value. - * - * 2. Call forward(inArgs, &outArgs) - * - * 3. Calculate gradient with respect to outArgs[i]->value - * and fill them into outArgs[i]->grad. - * This step can be skipped if your the outputs are from cost layers. - * - * 4. Call backward(). After backward, gradient of each parameter is - * accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT) - * - * 5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using - * gradients. - * - * 6. Clear gradients to zero. - * - * Usage for prediction: - * - * 1. Prepare inArgs. Put your input data into inArgs[i].value. - * - * 2. Call forward(inArgs, &outArgs) - * - * 3. Obtain the prediction result from outArgs[i] - */ - -typedef std::vector MachineState; - -class GradientMachine; - -typedef std::shared_ptr GradientMachinePtr; - -class GradientMachine { - public: - enum CreateMode { - kNormal = 0, - kSgdSparseCpuTraining = 3, - kTesting = 4, - kCustom = 10 - }; - - /** - * Create a gradient machine from ModelConfig - * Parameter will have parameterTypes - */ - static GradientMachine* create( - const ModelConfig& config, - int mode = kNormal, - const std::vector& parameterTypes = - std::vector{ - PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}); - - virtual ~GradientMachine() {} - - /** - * Prefetch row ids of sparse parameter. - */ - virtual void prefetch(const std::vector& inArgs) { (void)inArgs; } - - /** - * @brief Forward propagation. - * - * Calculate outputs (outArgs) based the inputs (inArgs) - * - * @note: if passType==PASS_TEST, then backward() should not be called - */ - virtual void forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) = 0; - - /** - * @brief Backward propagation. - * - * Calculate the gradient of inArgs and parameter. - * - * This function should only be called after a corresponding forward() call. - * The caller is responsible for filling the correct grad for the outArgs - * obtained using forward(). - * - * It may also change the grad field for the inArgs supplied at forward() - */ - virtual void backward(const UpdateCallback& callback = nullptr) = 0; - - /** - * Combine forward() and backward(). For multithread training, this - * may be faster. - * - * @note: passType PASS_TEST is not allowed for forwardBackward(). 
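The numbered training workflow above can be made concrete with a deliberately tiny, self-contained sketch. Everything in it (the one-parameter model, learning rate and data) is invented for illustration, and it does not use the GradientMachine interface; it only mirrors the ordering forward, fill output gradient, backward, update value, clear gradient.

@code
// Toy one-parameter model (y = w * x, squared-error cost) trained with the
// six-step loop described above.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  double w = 0.0;      // parameter value    (PARAMETER_VALUE analogue)
  double wGrad = 0.0;  // parameter gradient (PARAMETER_GRADIENT analogue)
  const double lr = 0.1;
  std::vector<std::pair<double, double>> data = {{1.0, 2.0}, {2.0, 4.0}};

  for (int pass = 0; pass < 50; ++pass) {
    for (auto& sample : data) {
      double x = sample.first, target = sample.second;
      double out = w * x;                     // 1-2. prepare inArgs, forward()
      double outGrad = 2.0 * (out - target);  // 3. gradient w.r.t. the output
      wGrad += outGrad * x;                   // 4. backward() accumulates grads
      w -= lr * wGrad;                        // 5. update the parameter value
      wGrad = 0.0;                            // 6. clear the gradient
    }
  }
  std::printf("learned w = %.3f\n", w);  // close to 2.0
}
@endcode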
- */ - virtual void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback = nullptr) { - forward(inArgs, outArgs, passType); - backward(callback); - } - - virtual Argument getLayerOutput(const std::string& layerName) = 0; - - // see comment in Layer.h for the function with the same name - virtual void resetState() {} - - // set machine state - virtual void setState(const MachineState& machineState) {} - - // save machine state - virtual void getState(MachineState& machineState) {} - - virtual void onPassEnd() = 0; - -#ifndef PADDLE_MOBILE_INFERENCE - /** - * Create an evaluator which can be used for eval() - */ - virtual Evaluator* makeEvaluator() const = 0; - - /** - * evaluate using the given evaluator - */ - virtual void eval(Evaluator* evaluator) const = 0; -#endif - - std::vector& getParameters() { return parameters_; } - - std::vector& getNonStaticParameters() { - if (nonStaticParameters_.empty()) { - for (auto para : parameters_) { - if (!para->isStatic()) { - nonStaticParameters_.push_back(para); - } - } - } - return nonStaticParameters_; - } - - inline bool hasStaticParameters() { - return parameters_.size() != getNonStaticParameters().size(); - } - - /** - * @brief Used before formal training, start work-threads and set - * trainer Parameters; - * - * @note This function will only been implemented and used in a - * multithreaded environment. - */ - virtual void start() {} - - /** - * @brief check each work-thread whether is failed/error/finish, - * if not, return ture, and yes return false. - * - * @note This function will only been implemented and used in a - * multithreaded environment. - */ - virtual void finish() {} - - /** - * @brief set the training status a "finished" value, the sub_work_threads - * will option the change, and then exit. - * - * @note This function will only been implemented and used in a - * multithreaded environment. - */ - virtual bool trainIsOn() { return true; } - - /** - * @brief when all or some of the sub-workThreads are suspended to waiting - * controller's instructions, and after some processing done in the - * controller, it will call this function to wake up all the pending - * thread. - * - * @note This function will only been implemented and used in a - * multithreaded environment. - */ - virtual void restart() {} - - /// Set the gradient of the output from outside. - virtual void setOutputGrad(const std::vector& args) { - LOG(FATAL) << "Not implemented!"; - } - - void saveParameters(const std::string& dir) const; - - void loadParameters(const std::string& dir); - - void randParameters(); - - virtual void getStats(real& cost, int64_t& numProcessed) { - (void)cost; - (void)numProcessed; - } - - /** - * @brief Release the middle layer's output memory. - * - * @note This function is used for memory optimization in inference. - */ - virtual void releaseOutput() {} - - protected: - virtual void onLoadParameter() {} - - std::vector parameters_; - std::vector nonStaticParameters_; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp deleted file mode 100644 index b8d4d28f0f309a5f7348605e8d35e160e7fd5552..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ /dev/null @@ -1,894 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MultiGradientMachine.h" - -#include "paddle/utils/Logging.h" - -#include "paddle/utils/Stat.h" - -#include "NeuralNetwork.h" -#include "ParallelNeuralNetwork.h" - -DEFINE_bool(allow_only_one_model_on_one_gpu, - true, - "If true, do not allow multiple models on one GPU device"); - -namespace paddle { - -// get types of the parameters which need to be merged after backward() -static void fillMergeTypes(PassType passType, - std::vector* mergeTypes) { - mergeTypes->clear(); - if (passType != PASS_TEST) { - mergeTypes->push_back(PARAMETER_GRADIENT); - } -} - -MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, - bool useGpu) - : useGpu_(useGpu), - trainerBarrier_(FLAGS_trainer_count), - allBarrier_(FLAGS_trainer_count + 1), - inArgsCopied_(false) { - isPassGrad_ = false; - numThreads_ = FLAGS_trainer_count; - if (useGpu) { - //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu, - //! the hl_get_device_count will get an error result. It seems should return - //! 0 when hppl is not compiled as gpu version. - numDevices_ = hl_get_device_count(); - } else { - numDevices_ = 0; - } - ParamInitCallback mainParamInitCb = [](int paramId, Parameter* para) { - // only create buf for CPU parameters - // GPU parameters will be created in each thread - if (para->useGpu()) return; - - if (para->isSparseRemoteUpdate()) { - para->enableType(PARAMETER_VALUE, - FLAGS_loadsave_parameters_in_pserver - ? 
Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); - para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); - } else if (para->isGradSparseUpdate()) { - para->enableType(PARAMETER_VALUE); - para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS); - SparseRowIdsCpuMatrix* mat = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - mat->setNumOfThreads(FLAGS_trainer_count); - } else if (para->isValueShared()) { - para->enableType(PARAMETER_VALUE, Parameter::MAT_VALUE_SHARED); - if (!para->isStatic()) { - para->enableType(PARAMETER_GRADIENT); - } - } else { - para->enableType(PARAMETER_VALUE); - if (!para->isStatic()) { - para->enableType(PARAMETER_GRADIENT); - } - } - }; - - NeuralNetwork* nn = NeuralNetwork::create(config); - nn->init(config, mainParamInitCb); - gradientMachine_.reset(nn); - parameters_ = gradientMachine_->getParameters(); - - numLogicalDevices_ = 0; - if (useGpu_) { - numLogicalDevices_ = 1; - - for (size_t pid = 0; pid < parameters_.size(); pid++) { - if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) { - numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1; - } - } - LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_ - << " numThreads=" << numThreads_ << " numDevices=" << numDevices_; - - if (numLogicalDevices_ * numThreads_ > numDevices_ && - FLAGS_allow_only_one_model_on_one_gpu) { - LOG(FATAL) << "trainer_count * num_devices_in_model " - << "(" << numThreads_ << "*" << numLogicalDevices_ << ")" - << "=" << numThreads_ * numLogicalDevices_ - << " exceeds number of GPU devices(" << numDevices_ << ")"; - } - numLogicalDevices_ = std::min(numLogicalDevices_, numDevices_); - - /* Enables direct access to memory allocations on a peer device */ - for (int i = 0; i < numThreads_; i++) { - for (int d = 0; d < numLogicalDevices_; ++d) { - enablePeerAccess(logicalDeviceId2RealDeviceId(d, i), - logicalDeviceId2RealDeviceId(d, i + 1)); - enablePeerAccess(logicalDeviceId2RealDeviceId(d, i), - logicalDeviceId2RealDeviceId(d, i - 1)); - } - } - } - - for (int i = 0; i < numThreads_; ++i) { - threads_.emplace_back(new TrainerThread(config, i, this)); - } - - bufferSizes_.resize(numLogicalDevices_, 0); - paraMainThread_.reserve(parameters_.size()); - int pid = 0; - for (auto& para : parameters_) { - if (para->isStatic() || !para->useGpu()) { - paraMainThread_.push_back(0); - } else { - int end = pid++ % numThreads_; - paraMainThread_.push_back(end); - int paraDeviceId = para->getDeviceId(); - if (paraDeviceId == -1) paraDeviceId = 0; - paraDeviceId = paraDeviceId % numLogicalDevices_; - if (para->getSize() > bufferSizes_[paraDeviceId]) { - bufferSizes_[paraDeviceId] = para->getSize(); - VLOG(1) << "bufferSize[" << paraDeviceId << "]" << para->getSize(); - } - } - } - - // TODO(xuwei06) Instead of using maximal buffer size, we may use a smaller - // fixed buffer size and use pipeline to dispatch parameter value and merge - // parameter gradient, which may be faster. 
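The constructor above assigns every GPU parameter a main thread in round-robin order and sizes each logical device's gradient buffer to the largest parameter placed on it. The self-contained sketch below replays that bookkeeping on made-up parameters; the ParamInfo struct and the numbers are hypothetical, and static parameters (which the real code also pins to thread 0) are not modelled.

@code
// Round-robin main-thread assignment and per-device buffer sizing.
#include <cstdio>
#include <vector>

struct ParamInfo {
  bool onGpu;
  int deviceId;  // -1 means "unspecified", treated as device 0
  size_t size;
};

int main() {
  const int numThreads = 4, numLogicalDevices = 2;
  std::vector<ParamInfo> params = {
      {true, 0, 1000}, {true, 1, 250}, {false, -1, 400}, {true, -1, 3000}};

  std::vector<int> paraMainThread;
  std::vector<size_t> bufferSizes(numLogicalDevices, 0);
  int pid = 0;
  for (auto& p : params) {
    if (!p.onGpu) {
      paraMainThread.push_back(0);  // CPU parameters stay with thread 0
      continue;
    }
    paraMainThread.push_back(pid++ % numThreads);  // round-robin for GPU params
    int dev = (p.deviceId == -1 ? 0 : p.deviceId) % numLogicalDevices;
    if (p.size > bufferSizes[dev]) bufferSizes[dev] = p.size;
  }
  for (size_t i = 0; i < paraMainThread.size(); ++i)
    std::printf("param %zu -> main thread %d\n", i, paraMainThread[i]);
  std::printf("bufferSizes: dev0=%zu dev1=%zu\n", bufferSizes[0], bufferSizes[1]);
}
@endcode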
- - // combination of all trainers mainPara into GradientMachine parameters - hasNonstaticCpuParamters_ = false; - for (size_t pid = 0; pid < parameters_.size(); pid++) { - if (parameters_[pid]->useGpu()) { - parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid]; - } else if (!parameters_[pid]->isStatic()) { - hasNonstaticCpuParamters_ = true; - } - } - - gradBufs_.resize(numThreads_); - for (int i = 0; i < numThreads_; ++i) { - gradBufs_[i].resize(numLogicalDevices_); - for (int d = 0; d < numLogicalDevices_; ++d) { - gradBufs_[i][d].sem.post(); - } - } - - outArgStream_ = HPPL_STREAM_1; - - start(); -} - -void MultiGradientMachine::start() { - for (auto& thread : threads_) { - thread->start(); - } -} - -void MultiGradientMachine::finish() { - for (auto& thread : threads_) { - thread->stop(); - } -} - -std::vector*> -MultiGradientMachine::getSlaveParameters() { - std::vector*> vec; - vec.reserve(threads_.size()); - for (auto& thread : threads_) { - vec.push_back(&thread->getParameters()); - } - return vec; -} - -void MultiGradientMachine::notifyGradientTransfer(int paramId) { - gradQueue_.enqueue(paramId); -} - -void MultiGradientMachine::allocGradBufs() { - if (numLogicalDevices_ == 0) return; - if (gradBufs_[0][0].bufs.size() >= mergeTypes_.size()) return; - - for (int i = 0; i < numThreads_; i++) { - for (int d = 0; d < numLogicalDevices_; ++d) { - if (bufferSizes_[d] == 0) continue; - SetDevice device(logicalDeviceId2RealDeviceId(d, i)); - for (size_t j = 0; j < mergeTypes_.size(); j++) { - gradBufs_[i][d].bufs.push_back( - Vector::create(bufferSizes_[d], /* useGpu= */ true)); - } - } - } -} - -void MultiGradientMachine::prefetch(const std::vector& inArgs) { - // Each gradient machine in threads needs to do prefetch on its own - // part of inArgs. 
So we need to first divide inArgs to each thread - inArgs_ = inArgs; - startTask(TASK_COPY_IN_ARGS); - - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); - mat->clearIndices(); - } - } - - waitForCopyInArgs(); - - // Because SparsePrefetchRowCpuMatrix can only be changed by ONE thread - // at one time, we need to do prefetch sequentially - for (auto& thread : threads_) { - thread->prefetch(); - } - - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); - mat->setupIndices(); - auto matGrad = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - matGrad->reserveStore(); - } - } -} - -void MultiGradientMachine::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - forwardImp(inArgs, outArgs, passType, TASK_FORWARD); -} - -void MultiGradientMachine::forwardImp(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - TaskType taskType) { - updateThreadParameters(); - passType_ = passType; - - if (!inArgsCopied_) { - inArgs_ = inArgs; - inArgsCopied_ = false; - } - - fillMergeTypes(passType, &mergeTypes_); - allocGradBufs(); - startTask(taskType); - - getOutArgs(outArgs, passType); -} - -void MultiGradientMachine::backward(const UpdateCallback& callback) { - backwardCallback_ = callback; - startTask(TASK_BACKWARD); - backwardImp(callback); -} - -void MultiGradientMachine::forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { - backwardCallback_ = callback; - forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD); - backwardImp(callback); -} - -Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) { - std::vector args; - args.reserve(threads_.size()); - - for (auto& thread : threads_) { - args.push_back(thread->getGradientMachine()->getLayerOutput(layerName)); - } - outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_); - - return outLayerArgs_; -} - -void MultiGradientMachine::backwardImp(const UpdateCallback& callback) { - for (size_t i = 0; i < parameters_.size(); i++) { - if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue; - REGISTER_TIMER("controller_dequeue"); - gradQueue_.dequeue(); - } - if (hasNonstaticCpuParamters()) { - waitAfterMerge(); - if (backwardCallback_) { - for (auto& para : parameters_) { - if (!para->useGpu() && !para->isStatic()) { - backwardCallback_(para.get()); - } - } - } - } -} - -void MultiGradientMachine::updateThreadParameters() { - for (size_t pid = 0; pid < parameters_.size(); ++pid) { - if (!parameters_[pid]->useGpu()) continue; - if (!parameters_[pid]->isValueUpdated()) continue; - parameters_[pid]->clearValueUpdated(); - for (int i = 0; i < (int)threads_.size(); i++) { - threads_[i]->incUpdateCounter(); - } - // NotifyValueReady should happen after that all threads' incUpdateCounter() - // are called so that the counters are correct when notifyValueReady() - // is called. 
- threads_[paraMainThread_[pid]]->notifyValueReady(pid); - } -} - -void MultiGradientMachine::onPassEnd() { - for (auto& thread : threads_) { - thread->onPassEnd(); - } -} - -Evaluator* MultiGradientMachine::makeEvaluator() const { - return threads_[0]->getGradientMachine()->makeEvaluator(); -} - -void MultiGradientMachine::eval(Evaluator* evaluator) const { - for (auto& thread : threads_) { - SetDevice device(thread->getDeviceId()); - if (thread->hasInputData()) { - thread->getGradientMachine()->eval(evaluator); - } - } -} - -void MultiGradientMachine::getOutArgs(std::vector* outArgs, - PassType passType) { - for (auto& thread : threads_) { - REGISTER_TIMER("waitOutArgs"); - thread->waitOutArgsReady(); - } - - outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size()); - - REGISTER_TIMER("copyOutArgs"); - for (size_t i = 0; i < outArgs_.size(); ++i) { - std::vector args; - args.reserve(threads_.size()); - for (auto& thread : threads_) { - // If the thread input is empty, then the output is empty. - auto tmp = thread->getOutArgs(); - if (tmp.size() > 0) { - args.push_back(tmp[i]); - } - } - outArgs_[i].concat(args, useGpu_, outArgStream_, passType); - } - - if (useGpu_) { - hl_stream_synchronize(outArgStream_); - } - - *outArgs = outArgs_; -} - -void MultiGradientMachine::setOutputGrad(const std::vector& args) { - CHECK_EQ(args.size(), outArgs_.size()); - for (size_t i = 0; i < args.size(); i++) { - outArgs_[i].grad = args[i].grad; - } -} - -void MultiGradientMachine::startTask(TaskType taskType) { - taskType_ = taskType; - for (auto& thread : threads_) { - thread->notifyTaskReady(); - } -} - -TrainerThread::TrainerThread(const ModelConfig& config, - int threadId, - MultiGradientMachine* multiMachine) - : multiMachine_(multiMachine), - config_(config), - threadId_(threadId), - inArgsCopied_(false) { - int numThreads = multiMachine->getNumThreads(); - - auto& mainParas = multiMachine->getParameters(); - - using std::placeholders::_1; - using std::placeholders::_2; - - partnerId_ = mod(threadId_ - 1, numThreads); - - deviceId_ = !multiMachine_->useGpu() - ? -1 - : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_); - SetDevice gpuDevice(deviceId_); - - NeuralNetwork* nn = nullptr; - if (!multiMachine->useGpu() || !FLAGS_parallel_nn) { - nn = NeuralNetwork::create(config); - } else { - nn = new ParallelNeuralNetwork(); - for (auto& paraConfig : *config_.mutable_parameters()) { - if (paraConfig.device() != -1) { - paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( - paraConfig.device(), threadId_)); - } - } - for (auto& layerConfig : *config_.mutable_layers()) { - if (layerConfig.device() != -1) { - layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( - layerConfig.device(), threadId_)); - } - } - } - // Only GPU do not share parameter values with main paramters. 
- ParamInitCallback slaveParamInitCb = - std::bind(parameterInitNN, _1, _2, &mainParas); - nn->init(config_, slaveParamInitCb); - gradientMachine_.reset(nn); - parameters_ = gradientMachine_->getParameters(); - if (!FLAGS_parallel_nn) { - for (auto& para : parameters_) { - para->setDevice(deviceId_); - } - } - - backwardCallback_ = - std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1); - - gradStream_ = HPPL_STREAM_2; - valueStream_ = HPPL_STREAM_3; - stopping_ = true; - updateCounter_ = 0; - parameterUpdated_ = false; -} - -TrainerThread::~TrainerThread() { stop(); } - -void TrainerThread::start() { - if (!stopping_) return; - - stopping_ = false; - - gradientMachine_->start(); - - computeThread_.reset(new std::thread([this]() { computeThread(); })); - - if (multiMachine_->useGpu()) { - gradCollectThread_.reset( - new std::thread([this]() { gradCollectThread(); })); - - valueDispatchThread_.reset( - new std::thread([this]() { valueDispatchThread(); })); - - copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); })); - } -} - -void TrainerThread::stop() { - if (stopping_) return; - - stopping_ = true; - - if (computeThread_) { - taskReadySem_.post(); - computeThread_->join(); - } - if (gradCollectThread_) { - gradQueue_.enqueue(0); - gradCollectThread_->join(); - } - if (copyThread_) { - gradBufQueue_.enqueue(0); - copyThread_->join(); - } - if (valueDispatchThread_) { - valueReadyQueue_.enqueue(0); - valueDispatchThread_->join(); - } -} - -void TrainerThread::computeThread() { - VLOG(1) << "gradComputeThread " << threadId_; - - if (deviceId_ >= 0) { - hl_init(deviceId_); - } - - while (true) { - { - REGISTER_TIMER("taskSem_wait"); - taskReadySem_.wait(); - } - - if (stopping_) break; - - switch (multiMachine_->getTaskType()) { - case MultiGradientMachine::TASK_FORWARD_BACKWARD: - forward(); - backward(); - break; - case MultiGradientMachine::TASK_FORWARD: - forward(); - break; - case MultiGradientMachine::TASK_BACKWARD: - backward(); - break; - case MultiGradientMachine::TASK_COPY_IN_ARGS: - batchSize_ = copyInArgs(); - inArgsCopied_ = true; - multiMachine_->waitForCopyInArgs(); - break; - } - } -} - -void TrainerThread::prefetch() { - SetDevice setDevice(deviceId_); - gradientMachine_->prefetch(inArgs_); -} - -void TrainerThread::forward() { - if (!inArgsCopied_) { - REGISTER_TIMER("copyInArgs"); - batchSize_ = copyInArgs(); - } else { - inArgsCopied_ = false; - } - - if (multiMachine_->getPassType() != PASS_TEST) { - REGISTER_TIMER("clearGradient"); - // For main parameter, the user of MultiGpuSyncMachine is responsible - // for setting the gradient to zero - for (size_t i = 0; i < parameters_.size(); i++) { - if (parameters_[i]->useGpu()) { - if (multiMachine_->paraMainThread(i) != threadId_) { - SetDevice device(parameters_[i]->getDeviceId()); - parameters_[i]->clearGradient(); - } - } else { - parameters_[i]->clearGradient(); - } - } - } - - { - REGISTER_TIMER("wait_value"); - valueReadyCond_.wait([this]() { return !parameterUpdated_; }); - } - - { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); } - - { - REGISTER_TIMER("thread_forward"); - if (batchSize_ > 0) { - gradientMachine_->forward( - inArgs_, &outArgs_, multiMachine_->getPassType()); - } else { - outArgs_.clear(); - } - } - outArgsReadySem_.post(); -} - -void TrainerThread::backward() { - REGISTER_TIMER("thread_backward"); - if (multiMachine_->isPassGrad()) { - copyOutputGrad(); - } - if (batchSize_ > 0) { - gradientMachine_->backward(backwardCallback_); - } else { - for 
(size_t i = parameters_.size(); i > 0; i--) { - backwardCallback(parameters_[i - 1].get()); - } - } - if (multiMachine_->hasNonstaticCpuParamters()) { - mergeCpuGradients(); - } -} - -void TrainerThread::backwardCallback(Parameter* para) { - // CPU parameters are merged in the end - if (!para->useGpu() || para->isStatic()) return; - - int paramId = para->getID(); - if (multiMachine_->getNumThreads() == 1) { - // no need to do merge if there is only one thread - doCallback(paramId); - } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1, - multiMachine_->getNumThreads())) { - notifyCopyGradToBuffer(paramId); - } else { - notifyGradientCollect(paramId); - } -} - -void TrainerThread::copyGradToBufferThread() { - VLOG(1) << "copyGradToBufferThread " << threadId_; - - if (deviceId_ >= 0) { - hl_init(deviceId_); - } - auto& partnerThread = multiMachine_->getThread(partnerId_); - auto& gradBufs = multiMachine_->getGradBuf(partnerId_); - - while (true) { - int pid = gradBufQueue_.dequeue(); - if (stopping_) break; - - int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId( - parameters_[pid]->getDeviceId(), threadId_); - - auto& gradBuf = gradBufs[pdeviceId]; - - { - REGISTER_TIMER("waitBufferReady"); - gradBuf.sem.wait(); - } - - { - REGISTER_TIMER("copyGradToBuffer"); - SetDevice setDevice(parameters_[pid]->getDeviceId()); - for (size_t i = 0; i < mergeTypes_.size(); ++i) { - gradBuf.bufs[i]->resize( - parameters_[pid]->getBuf(mergeTypes_[i])->getSize()); - gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]), - gradStream_); - } - hl_stream_synchronize(gradStream_); - } - partnerThread->notifyGradientCollect(pid); - } -} - -void TrainerThread::gradCollectThread() { - VLOG(1) << "gradCollectThread " << threadId_; - - if (deviceId_ >= 0) { - hl_init(deviceId_); - } - - std::vector gradReadyCount(parameters_.size(), 0); - - auto& gradBufs = multiMachine_->getGradBuf(threadId_); - - while (true) { - int pid = gradQueue_.dequeue(); - if (stopping_) break; - - if (++gradReadyCount[pid] < 2) continue; - gradReadyCount[pid] = 0; - int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId( - parameters_[pid]->getDeviceId(), threadId_); - - auto& gradBuf = gradBufs[pdeviceId]; - - { - REGISTER_TIMER("mergeGrad"); - for (size_t i = 0; i < mergeTypes_.size(); ++i) { - ParameterType type = mergeTypes_[i]; - const VectorPtr& localGrad = parameters_[pid]->getBuf(type); - SetDevice setDevice(parameters_[pid]->getDeviceId()); - localGrad->add(*gradBuf.bufs[i]); - } - } - - gradBuf.sem.post(); - - if (multiMachine_->paraMainThread(pid) == threadId_) { - doCallback(pid); - } else { - notifyCopyGradToBuffer(pid); - } - } -} - -void TrainerThread::doCallback(int pid) { - REGISTER_TIMER("callback"); - auto& gpuThreads = multiMachine_->getAllThreads(); - if (multiMachine_->getBackwardCallback()) { - // The callback supplied by the user of MultiGradientMachine may handle - // the parameter update using the gradient. 
- multiMachine_->getBackwardCallback()(parameters_[pid].get()); - if (parameters_[pid]->isValueUpdated()) { - parameters_[pid]->clearValueUpdated(); - for (auto& thread : gpuThreads) { - thread->incUpdateCounter(); - } - notifyValueReady(pid); - } - } - multiMachine_->notifyGradientTransfer(pid); -} - -void TrainerThread::valueDispatchThread() { - VLOG(1) << "valueDispatchThread " << threadId_; - - if (deviceId_ >= 0) { - hl_init(deviceId_); - } - - auto& thread = multiMachine_->getThread(partnerId_); - - while (true) { - int pid; - { - REGISTER_TIMER("value_dequeue"); - pid = valueReadyQueue_.dequeue(); - } - if (stopping_) break; - - if (multiMachine_->paraMainThread(pid) == partnerId_) continue; - - { - REGISTER_TIMER("copyValue"); - SetDevice setDevice(parameters_[pid]->getDeviceId()); - thread->getValueBuf(pid)->copyFrom(*getValueBuf(pid), valueStream_); - hl_stream_synchronize(valueStream_); - } - - thread->notifyValueReady(pid); - } -} - -void TrainerThread::notifyValueReady(int paramId) { - if (--updateCounter_ == 0) { - valueReadyCond_.notify_all([this] { parameterUpdated_ = false; }); - } - - notifyValueDispatch(paramId); -} - -int TrainerThread::copyInArgs() { - const std::vector& fullInArgs = multiMachine_->getInArgs(); - int numThreads = multiMachine_->getAllThreads().size(); - int32_t numSequences = fullInArgs[0].getNumSequences(); - int32_t startSeq = numSequences * threadId_ / numThreads; - int32_t endSeq = numSequences * (threadId_ + 1) / numThreads; - int32_t copySize = endSeq - startSeq; - - /** - * For the first copy, need to allocate space here - */ - if (inArgs_.size() == 0) { - inArgs_.resize(fullInArgs.size()); - } - - if (copySize == 0) { - return 0; - } - - for (size_t i = 0; i < fullInArgs.size(); i++) { - inArgs_[i].resizeAndCopyFrom( - fullInArgs[i], - startSeq, - copySize, - FLAGS_parallel_nn ? false : multiMachine_->useGpu()); - } - return copySize; -} - -void TrainerThread::mergeCpuGradients() { - CHECK_EQ(mergeTypes_.size(), 1UL); - CHECK_EQ(mergeTypes_[0], PARAMETER_GRADIENT); - - { - REGISTER_TIMER("waitbeforeMerge"); - multiMachine_->waitBeforeMerge(); - } - std::vector*> slaveParameters = - multiMachine_->getSlaveParameters(); - - CHECK(slaveParameters.size()); - for (auto& para : multiMachine_->getNonStaticParameters()) { - if (para->useGpu()) continue; - if (para->isSparseRemoteUpdate()) { - REGISTER_TIMER("mergeRemoteGradSparse"); - mergeGradSparseRemote(para.get(), slaveParameters); - } else if (para->isGradSparseUpdate()) { - REGISTER_TIMER("mergeGradSparse"); - mergeGradSparse(para.get(), slaveParameters); - } else { - REGISTER_TIMER("mergeGradDense"); - mergeGradDense(para.get(), slaveParameters); - } - } - { - REGISTER_TIMER("waitbeforeMerge"); - multiMachine_->waitAfterMerge(); - } -} - -void TrainerThread::mergeGradSparse( - Parameter* para, - std::vector*>& slaveParameters) { - size_t pid = para->getID(); - SparseRowIdsCpuMatrix* mainMat = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - std::vector& ids = mainMat->getIds(threadId_); - - for (auto slaveParams : slaveParameters) { - SparseRowCpuMatrix* mat = dynamic_cast( - (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get()); - mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads()); - // we use a sample hash method(%) instead of range partition, - // because range partition has balance issue sometimes, - // when feature ids are not generated from hashcode. 
- } - uniqueIds(ids); -} - -void TrainerThread::mergeGradSparseRemote( - Parameter* para, - std::vector*>& slaveParameters) { - size_t pid = para->getID(); - SparseRowCpuMatrix* mainMat = - dynamic_cast(para->getMat(PARAMETER_GRADIENT).get()); - - mainMat->checkIndices(); - mainMat->zeroMemThread(threadId_, multiMachine_->getNumThreads()); - - for (auto slaveParams : slaveParameters) { - SparseRowCpuMatrix* mat = dynamic_cast( - (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get()); - mat->addTo(*mainMat, threadId_, multiMachine_->getNumThreads()); - } -} - -void TrainerThread::mergeGradDense( - Parameter* para, - std::vector*>& slaveParameters) { - size_t pid = para->getID(); - auto interval = calcSplitArrayInterval(para->getSize(), - (size_t)threadId_, - multiMachine_->getNumThreads(), - 8LU /*for avx*/); - size_t startSeq = interval.first; - size_t copySize = interval.second - interval.first; - - // setup sub bufs - CpuVector destGrad(0, nullptr); - destGrad.subVecFrom(*para->getBuf(PARAMETER_GRADIENT), startSeq, copySize); - - // merge - CpuVector slaveGradSub(0, nullptr); - for (auto slaveParams : slaveParameters) { - slaveGradSub.subVecFrom( - *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize); - destGrad.add(slaveGradSub); - } -} - -void TrainerThread::copyOutputGrad() { - const std::vector& outputGradArgs = multiMachine_->outArgs_; - int numThreads = multiMachine_->getAllThreads().size(); - int32_t numSequences = outputGradArgs[0].getNumSequences(); - int32_t startSeq = numSequences * threadId_ / numThreads; - int32_t endSeq = numSequences * (threadId_ + 1) / numThreads; - int32_t copySize = endSeq - startSeq; - outArgs_.resize(outputGradArgs.size()); - for (size_t i = 0; i < outputGradArgs.size(); i++) { - outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], - startSeq, - copySize, - multiMachine_->useGpu(), - HPPL_STREAM_DEFAULT); - } - if (multiMachine_->useGpu()) { - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } - gradientMachine_->setOutputGrad(outArgs_); -} -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h deleted file mode 100644 index eff7d5284c6dd4898344203b50acc94ae61b4d59..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.h +++ /dev/null @@ -1,478 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "GradientMachine.h" - -#include "hl_gpu.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Queue.h" - -namespace paddle { - -class TrainerThread; - -typedef Queue PidQueue; -typedef std::unique_ptr TrainerThreadPtr; - -struct GradBuffer { - /// GradBuffer is used for gathering gradient for GPU parameters - int paramId; - - /// sem is used to notify that the local gradient merge of the current thread - /// finished for the current thread. 
- Semaphore sem; - - // bufs[mergeIndex] - std::vector bufs; -}; - -/** - * A MultiGradientMachine is a synchronous GradientMachine which devides - * one data batch into several smaller batches and assign each one small batch - * to one computint thread for computation. After each thread finishes - * computation, it merges result (including output Argument and gradient during - * backward()). It basically is the same as single thread gradient machine, - * except that it uses multi-thread to do the computation. - * - * It handles GPU and Cpu parameters differently. In GPU, one computing thread - * generally corresponds to one GPU device. Thus, each thread keeps a separate - * copy of the parameter in its own device's memory. In CPU, we only need to - keep - * one copy of the parameters in the main memory. After, each computing thread - * computes its own parameter gradient, the update process needs to accumulate - * the parameter gradients from all the computing threads, and update the - * accumulated parameter gradient to the corresponding parameter value. - * - * Each GPU parameter is assigned to a thread called its main thread. For each - * parameter, the accumulation of its gradients and the update of its value - * happens in its main thread. The main thread first gather the parameter - * gradients from all the computing thread. Then, it performs parameter update. - * After a gradient is updated by the main thread, it is scattered to all the - * computing thread so that the parameters in all the computing threads are - * synchronized. The scatter and gather process are implemented by ring-style - * communication. Assume we have N computing threads, its thread ids will be - * 0, 1, ..., N-1. For each parameter, the id of the main thread is specified - in - * paraMainThread_[pid], where pid is the id of the parameter. Each thread i - only - * sends data to its partner thread (i - 1) % N. For example, for a parameter - * gradient that is computed in thread 4, and its main thread is 2. Its - * traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the - gradient - * buffer is added to the local gradient, and the local gradient is then copied - * to the gradient buffer of the next thread. At last, its main thread 2 will - * get the accumulated parameter gradient. For the same parameter, after its - * value is updated, the value's traveling process would be 2, 1, 0, N-1, ... - 3. - * At the end, all the computing threads would have the updated parameter - value. - * - * A computing thread (TrainerThread) uses 4 threads to do different jobs: - * - * 1. computeThread(): performing forward(), backward(), prefetch(). - * - * 2. valueDispatchThread(): copying parameter values to partner thread. - * - * 3. copyGradToBufferThread(): copying parameter gradient to partner thread. - * - * 4. gradCollectThread(): merging the gradient from step 3 with local gradient - * and call the callback supplied by the user to update parameter value. - * - * CPU parameter value has only one copy. And their gradients are merged at the - * end of backward(). - * - * * Handling of sparse update - * Currently, sparse update is only supported for CPU parameters. - - * Sparse updates refers to gradient caculation where the gradient is sparse. - For - * example, if the input argument to a 'fc' layer is sparse, the gradient of - the - * weight matrix of this layer will be sparse. It is usually more efficient to - * treat the gradient explicitly as sparse vector during the parameter update. 
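To make the efficiency argument above concrete, here is a tiny self-contained sketch of a sparse row update: only the rows that actually received gradient are touched. The sizes and values are made up, and this is not the SparseRowCpuMatrix machinery described below, just the idea behind it.

@code
// Sparse update of a large weight matrix: touch only the rows with gradient.
#include <cstdio>
#include <vector>

int main() {
  const size_t numRows = 1000000, width = 8;
  std::vector<float> weight(numRows * width, 0.0f);

  // Sparse gradient: row ids plus one dense gradient row per id.
  std::vector<size_t> rowIds = {3, 42, 999999};
  std::vector<std::vector<float>> rowGrads(rowIds.size(),
                                           std::vector<float>(width, 0.5f));

  const float lr = 0.01f;
  for (size_t k = 0; k < rowIds.size(); ++k) {
    float* row = &weight[rowIds[k] * width];
    for (size_t c = 0; c < width; ++c) {
      row[c] -= lr * rowGrads[k][c];  // only 3 rows touched, not 1,000,000
    }
  }
  std::printf("weight[42][0] = %.4f\n", weight[42 * width]);  // -0.0050
}
@endcode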
- - * There are two types of sparse updates called local sparse update and remote - * sparse update. - - * For both types of sparse updates, there is one copy of parameter value and - * gradient called main parameter value and gradient, and there is a copy of - * parameter value and gradient for each computing thread called slave - parameter - * value and gradient. The slave parameter values are always shared with the - * corresponding main parameter value. The slave parameter grad is a sparse row - * matrix. The sparse pattern for slave parameter grads are different, because - * the small batches for each computing thread might have different sparsity - * pattern. - - * 1. Local sparse update - * - * Main parameter value type is MAT_NORMAL. It is a dense matrix. - * - * Main parameter grad type is MAT_SPARSE_ROW_IDS (SparseRowIdsCpuMatrix) - * It is also a dense matrix, but the updated values are specified by IDS. - * - * Slave parameter value shares with main parameter value. - * - * Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW - * (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix. - * - * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will - * gather all the non-zero gradient. And After backward(), they will be - merged - * into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating - * which rows have nonzero gradient. - * - * 2. Remote sparse update - * - * Main parameter value type is MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE) - * (SparsePrefetchRowCpuMatrix). MAT_SPARSE_ROW_PREFETCH is a sparse matrix. - * MAT_SPARSE_ROW_PREFETCH_FULL_SIZE is a dense matrix. However, only the - * parameter values that are prefetched is up-to-date. - * - * Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix). - * And it shares sparse pattern with value by sharing indexDictHandle_, - which - * is an internal data structure used by SparseRowCpuMatrixto specify the - * sparsity pattern of Slave parameter value shares with main parameter - value. - * - * Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW - * (SparsePrefetchRowCpuMatrix). It is a sparse row matrix - * - * During prefetch(), all the layers will indicates which rows of each - * parameter are needed. Then the framework will retrieve those rows from - * parameter server. - * - * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will - * gather all the non-zero gradient. And After backward(), they will be - merged - * into main parameter grad (SparseRowCpuMatrix). And the framework will - send - * the merged gradient to parameter server. 
- */ -class MultiGradientMachine : public GradientMachine { - public: - enum TaskType { - TASK_FORWARD_BACKWARD = 0, - TASK_FORWARD = 1, - TASK_BACKWARD = 2, - TASK_COPY_IN_ARGS = 3, - }; - - explicit MultiGradientMachine(const ModelConfig& config, bool useGpu); - - virtual void start(); - - virtual void finish(); - - virtual void prefetch(const std::vector& inArgs); - - virtual void forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType); - - virtual void backward(const UpdateCallback& callback = nullptr); - - void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback); - - virtual Argument getLayerOutput(const std::string& layerName); - - virtual void onPassEnd(); - - virtual Evaluator* makeEvaluator() const; - - virtual void eval(Evaluator* evaluator) const; - - bool useGpu() const { return useGpu_; } - - /// @return whether to pass the gradients in outArgs_ to each threads. - bool isPassGrad() { return isPassGrad_; } - - /// @brief set whether to pass the gradient in outArgs_ to each threads. - void setPassGrad(bool isPass) { isPassGrad_ = isPass; } - - /// Set the gradients of the outputs. - /// The gradietns will be copied to each thread in the computing threads. - virtual void setOutputGrad(const std::vector& args); - - protected: - friend class TrainerThread; - - std::vector& getAllThreads() { return threads_; } - /// Calculate the real device id based on the logical device id and the - /// thread id. - int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const { - if (logicalId == -1) { - logicalId = 0; - } - return mod(logicalId + FLAGS_gpu_id + threadId * numLogicalDevices_, - numDevices_); - } - - /// Calculate the logical device id based on the real device id and the - /// thread id. - int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const { - if (realId == -1) { - return 0; - } else { - return mod(realId - FLAGS_gpu_id - threadId * numLogicalDevices_, - numDevices_); - } - } - - std::vector*> getSlaveParameters(); - - bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; } - - /// Called TrainerThread to wait before merging CPU parameter gradients. - void waitBeforeMerge() { trainerBarrier_.wait(); } - - /// called by MultiGradientMachine and TrainerThread to wait after merging - /// CPU parameter graidents. 
- void waitAfterMerge() { allBarrier_.wait(); } - - /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs() - /// finishing - void waitForCopyInArgs() { allBarrier_.wait(); } - - TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; } - - std::vector& getGradBuf(int threadId) { - return gradBufs_[threadId]; - } - - PassType getPassType() const { return passType_; } - - /// Called by TrainerThread to notify MultiGradientMachine that the gradient - /// for paramId is ready - void notifyGradientTransfer(int paramId); - - const std::vector& getInArgs() { return inArgs_; } - - TaskType getTaskType() const { return taskType_; } - - const UpdateCallback& getBackwardCallback() const { - return backwardCallback_; - } - - int getNumDevices() const { return numDevices_; } - - int getNumLogicalDevices() const { return numLogicalDevices_; } - - int getNumThreads() const { return numThreads_; } - - int paraMainThread(int pid) const { return paraMainThread_[pid]; } - - protected: - virtual void forwardImp(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - TaskType taskType); - - virtual void backwardImp(const UpdateCallback& callback = NULL); - - /// update all parameters - void updateThreadParameters(); - - void startTask(TaskType taskType); - - void getOutArgs(std::vector* outArgs, PassType passType); - - void allocGradBufs(); - - protected: - bool useGpu_; - - bool hasNonstaticCpuParamters_; - - /// store main parameter only - std::unique_ptr gradientMachine_; - - std::vector threads_; - std::vector paraMainThread_; - std::vector> gradBufs_; // [threadId][deviceId] - std::vector bufferSizes_; - - PassType passType_; - TaskType taskType_; - PidQueue gradQueue_; - std::vector inArgs_; - std::vector outArgs_; - hl_stream_t outArgStream_; - - Argument outLayerArgs_; - - /// ParameterType which needs to be merged from each GPU - std::vector mergeTypes_; - int numDevices_; /* number of gpu devices */ - int numLogicalDevices_; // number of GPU used by one NN - int numThreads_; /* number of train threads */ - - UpdateCallback backwardCallback_; - - /// barrrier for threads_ - ThreadBarrier trainerBarrier_; - - /// barrier for both MultiGradientMachine and threds_ - ThreadBarrier allBarrier_; - - /// indicate whether inArgs is copied before forward() - bool inArgsCopied_; - - /// Whether to copy the gradient back from an external input. 
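// trainerBarrier_ / allBarrier_ above coordinate when the TrainerThreads may
// merge CPU parameter gradients: every thread finishes backward(), everyone
// waits, then the merge happens. A minimal standalone sketch of that pattern
// follows; SimpleBarrier is a hypothetical stand-in for Paddle's
// ThreadBarrier, and the merge-by-thread-0 policy is a simplification.
#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

class SimpleBarrier {
 public:
  explicit SimpleBarrier(int count) : threshold_(count), count_(count) {}
  void wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    int gen = generation_;
    if (--count_ == 0) {          // last arrival releases everyone
      ++generation_;
      count_ = threshold_;
      cv_.notify_all();
    } else {
      cv_.wait(lock, [&] { return gen != generation_; });
    }
  }
 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  const int threshold_;
  int count_;
  int generation_ = 0;
};

int main() {
  const int numThreads = 4, paramSize = 8;
  std::vector<std::vector<float>> threadGrads(
      numThreads, std::vector<float>(paramSize, 0.0f));
  std::vector<float> mainGrad(paramSize, 0.0f);
  SimpleBarrier beforeMerge(numThreads);

  std::vector<std::thread> workers;
  for (int t = 0; t < numThreads; ++t) {
    workers.emplace_back([&, t] {
      // Stand-in for backward(): each thread fills its own gradient buffer.
      for (int j = 0; j < paramSize; ++j) threadGrads[t][j] = 0.5f * (t + 1);
      beforeMerge.wait();          // analogous to waitBeforeMerge()
      if (t == 0) {                // one thread merges all CPU gradients
        for (int i = 0; i < numThreads; ++i)
          for (int j = 0; j < paramSize; ++j) mainGrad[j] += threadGrads[i][j];
      }
    });
  }
  for (auto& w : workers) w.join();
  return 0;
}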
- bool isPassGrad_; -}; - -class TrainerThread { - public: - TrainerThread(const ModelConfig& config, - int threadId, - MultiGradientMachine* multiMachine); - - ~TrainerThread(); - - void start(); - - void onPassEnd() { gradientMachine_->onPassEnd(); } - - void waitOutArgsReady() { outArgsReadySem_.wait(); } - - void notifyTaskReady() { taskReadySem_.post(); } - - int getDeviceId() const { return deviceId_; } - - GradientMachine* getGradientMachine() { return gradientMachine_.get(); } - - const std::vector& getParameters() { return parameters_; } - - void stop(); - - void notifyValueReady(int paramId); - - const VectorPtr& getValueBuf(int paramId) { - return parameters_[paramId]->getBuf(PARAMETER_VALUE); - } - - const std::vector& getOutArgs() { return outArgs_; } - - void incUpdateCounter(int n = 1) { - updateCounter_ += n; - parameterUpdated_ = true; - } - - void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); } - - void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); } - - void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); } - - void prefetch(); - - /// copy the output gradient from the main GradientMachine. - void copyOutputGrad(); - - /// Whether the thread has input data. - bool hasInputData() { return batchSize_ != 0; } - - protected: - void mergeCpuGradients(); - - void mergeGradSparse( - Parameter* para, - std::vector*>& slaveParameters); - - void mergeGradSparseRemote( - Parameter* para, - std::vector*>& slaveParameters); - - void mergeGradDense( - Parameter* para, - std::vector*>& slaveParameters); - - void computeThread(); - void valueDispatchThread(); - void copyGradToBufferThread(); - void gradCollectThread(); - - int copyInArgs(); - void forward(); - void backward(); - void backwardCallback(Parameter* para); - - /// call the actuall callback supplied by the caller of - /// GradientMachine::backward - void doCallback(int pid); - - protected: - MultiGradientMachine* multiMachine_; - ModelConfig config_; - /// whether the thread should stop - bool stopping_; - /// the threads form which to collect gradient - int partnerId_; - /// from 0 to threads-1 - int threadId_; - int deviceId_; - std::unique_ptr gradientMachine_; - std::vector parameters_; - - /// ParameterType which needs to be merged from each GPU - std::vector mergeTypes_; - - /// compute thread - std::unique_ptr computeThread_; - std::vector inArgs_; - std::vector outArgs_; - Semaphore taskReadySem_; - Semaphore outArgsReadySem_; - - /// copy thread - std::unique_ptr copyThread_; - /// queue of gradient needs to be copied to partner - PidQueue gradBufQueue_; - hl_stream_t gradStream_; - - /// grad merge thread - std::unique_ptr gradCollectThread_; - /// queue of gradient needs to be merged with gradient coopied by - /// copyGradToBufferThread - PidQueue gradQueue_; - UpdateCallback backwardCallback_; - - /// value dispatch thread - std::unique_ptr valueDispatchThread_; - /// queue of the parameter whose the vale are ready for copy - PidQueue valueReadyQueue_; - - /// used to notify all the parameter values are ready - LockedCondition valueReadyCond_; - - hl_stream_t valueStream_; - /// how many parameters are updated - std::atomic updateCounter_; - bool parameterUpdated_; - - /// indicate whether inArgs is copied before forward() - bool inArgsCopied_; - int batchSize_; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp deleted file mode 100644 index 
5f3d09dda26772850828e6d44e8cc65635b314dc..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/MultiNetwork.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -#include "MultiNetwork.h" - -#include "NeuralNetwork.h" -#include "ParallelNeuralNetwork.h" - -namespace paddle { - -void MultiNetwork::init(const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu) { - CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1"; - // check submodel[0] is root - CHECK_EQ("root", config.sub_models(0).name()) - << "sub_models(0) should be root"; - // ignore root - subNetworks_.resize(config.sub_models_size() - 1); - // base class - NeuralNetwork::init(config, callback, parameterTypes, useGpu); - // sub networks - for (int i = 1; i < config.sub_models_size(); ++i) { - std::string subModelName = config.sub_models(i).name(); - if (FLAGS_parallel_nn) { - subNetworks_[i - 1] = std::unique_ptr( - new ParallelNeuralNetwork(subModelName, this)); - } else { - subNetworks_[i - 1] = std::unique_ptr( - NeuralNetwork::newNeuralNetwork(subModelName, this)); - } - subNetworks_[i - 1]->init(config); - } -} - -void MultiNetwork::prefetch(const std::vector& inArgs) { - std::vector> argumentGroups; - Argument::splitByDataId(inArgs, &argumentGroups); - // check group size is equal to sub network size - CHECK_EQ(argumentGroups.size(), subNetworks_.size()); - for (size_t i = 0; i < subNetworks_.size(); i++) { - if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) { - // check input args: if dataId is -1, then skip this sub network - continue; - } - subNetworks_[i]->prefetch(argumentGroups[i]); - } -} - -void MultiNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - // split inArgs to several vectors - std::vector> argumentGroups; - Argument::splitByDataId(inArgs, &argumentGroups); - - // check group size is equal to sub network size - CHECK_EQ(argumentGroups.size(), subNetworks_.size()); - std::vector tempOutArgs; - outArgs->clear(); - - for (size_t i = 0; i < subNetworks_.size(); i++) { - tempOutArgs.clear(); - if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) { - // check input args: if dataId is -1, then skip this sub network - continue; - } - subNetworks_[i]->forward(argumentGroups[i], &tempOutArgs, passType); - for (const auto& elem : tempOutArgs) { - outArgs->push_back(elem); - outArgs->back().dataId = i; - } - } -} - -void MultiNetwork::backward(const UpdateCallback& callback) { - for (size_t i = 0; i < subNetworks_.size(); i++) { - subNetworks_[i]->backward(callback); - } -} - -void MultiNetwork::forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { - forward(inArgs, outArgs, passType); - backward(callback); -} - -void 
MultiNetwork::onPassEnd() { - for (size_t i = 0; i < subNetworks_.size(); i++) { - subNetworks_[i]->onPassEnd(); - } -} - -void MultiNetwork::start() { - for (auto& subNetwork : subNetworks_) { - subNetwork->start(); - } -} - -void MultiNetwork::finish() { - for (size_t i = 0; i < subNetworks_.size(); i++) { - subNetworks_[i]->finish(); - } -} - -class MultiCombinedEvaluator : public Evaluator { - public: - MultiCombinedEvaluator() {} - void addEvaluator(std::unique_ptr&& evaluator) { - evaluators_.emplace_back(std::move(evaluator)); - } - virtual void start() { - for (auto& evaluator : evaluators_) { - evaluator->start(); - } - } - - virtual void finish() { - for (auto& evaluator : evaluators_) { - evaluator->finish(); - } - } - - virtual void eval(const NeuralNetwork& nn) { - const MultiNetwork& multiNetwork = dynamic_cast(nn); - CHECK_EQ(evaluators_.size(), multiNetwork.getSubNetworks().size()); - int size = evaluators_.size(); - for (int i = 0; i < size; i++) { - // one evaluator for one subNetwork - evaluators_[i]->eval(*multiNetwork.getSubNetworks()[i]); - } - } - - virtual real evalImp(std::vector& arguments) { - (void)arguments; - return -1; - } - - virtual void printStats(std::ostream& os) const { - for (auto& evaluator : evaluators_) { - evaluator->printStats(os); - os << ' '; - } - } - - virtual void distributeEval(ParameterClient2* client) { - for (auto& evaluator : evaluators_) { - evaluator->distributeEval(client); - } - } - - protected: - std::vector> evaluators_; -}; - -Evaluator* MultiNetwork::makeEvaluator() const { - MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator(); - for (size_t i = 0; i < subNetworks_.size(); i++) { - std::unique_ptr evaluator(subNetworks_[i]->makeEvaluator()); - multiCombinedEvaluator->addEvaluator(std::move(evaluator)); - } - return multiCombinedEvaluator; -} - -void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); } - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h deleted file mode 100644 index 495d5592017b5fb937fb8243bf12a5f2f30d67e7..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/MultiNetwork.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "GradientMachine.h" -#include "NeuralNetwork.h" - -#include "paddle/utils/Locks.h" - -namespace paddle { - -class MultiNetwork : public NeuralNetwork { - public: - explicit MultiNetwork(std::string subModelName = "") - : NeuralNetwork(subModelName) {} - - virtual void init(const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu); - - virtual void prefetch(const std::vector& inArgs); - - virtual void forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType); - - virtual void backward(const UpdateCallback& callback = nullptr); - - void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback); - - virtual void onPassEnd(); - - virtual Evaluator* makeEvaluator() const; - - virtual void eval(Evaluator* evaluator) const; - - const std::vector>& getSubNetworks() const { - return subNetworks_; - } - - virtual void start(); - - virtual void finish(); - - protected: - std::vector> subNetworks_; -}; -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp deleted file mode 100644 index ac60a3a3408d37b66cb712d893c6b93a1750f448..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ /dev/null @@ -1,548 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Util.h" - -#include "NeuralNetwork.h" -#include "hl_gpu.h" -#include "paddle/utils/CustomStackTrace.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/gserver/layers/MKLDNNLayer.h" -#endif - -#ifndef PADDLE_MOBILE_INFERENCE -#include "MultiNetwork.h" -#include "RecurrentGradientMachine.h" -#include "paddle/gserver/layers/AgentLayer.h" -#endif - -namespace paddle { -void parameterInitNN(int paramId, - Parameter* para, - std::vector* sharedParams) { - // Create parameters values. - if (!para->useGpu() && sharedParams) { - para->enableSharedType(PARAMETER_VALUE, - (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE), - (*sharedParams)[paramId]->getMat(PARAMETER_VALUE)); - } else { - if (para->isSparseRemoteUpdate()) { - para->enableType(PARAMETER_VALUE, - FLAGS_loadsave_parameters_in_pserver - ? Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); - } else { - para->enableType(PARAMETER_VALUE); - } - } - // Create parameter gradients. 
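// The branching in parameterInitNN picks a different storage format for a
// parameter's value and gradient depending on whether the parameter is
// static, locally sparse, or remotely (pserver) sparse. The sketch below
// restates that decision as a small standalone function; the enum names echo
// the MAT_* kinds mentioned in the comments but are hypothetical, and the
// shared-CPU-parameter case is omitted for brevity.
#include <cstdio>

enum class ValueKind { Dense, SparsePrefetch, SparsePrefetchFullSize };
enum class GradKind { None, Dense, SparseRow, SparseRowAutoGrow };

struct ParamTraits {
  bool isStatic;
  bool sparseRemoteUpdate;   // rows live on the parameter server
  bool gradSparseUpdate;     // local sparse update
  bool loadInPserver;        // analogue of FLAGS_loadsave_parameters_in_pserver
};

ValueKind chooseValueKind(const ParamTraits& p) {
  if (!p.sparseRemoteUpdate) return ValueKind::Dense;
  return p.loadInPserver ? ValueKind::SparsePrefetch
                         : ValueKind::SparsePrefetchFullSize;
}

GradKind chooseGradKind(const ParamTraits& p) {
  if (p.sparseRemoteUpdate) return GradKind::SparseRow;
  if (p.gradSparseUpdate)   return GradKind::SparseRowAutoGrow;
  if (!p.isStatic)          return GradKind::Dense;
  return GradKind::None;    // static parameters get no gradient buffer
}

int main() {
  ParamTraits embedding{false, true, false, false};  // a remotely sparse table
  std::printf("value=%d grad=%d\n",
              static_cast<int>(chooseValueKind(embedding)),
              static_cast<int>(chooseGradKind(embedding)));
  return 0;
}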
- if (para->isSparseRemoteUpdate() && !sharedParams) { - para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); - } else if (para->isGradSparseUpdate()) { - para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW); - } else if (!para->isStatic()) { - para->enableType(PARAMETER_GRADIENT); - } -} - -NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) { -#ifndef PADDLE_MOBILE_INFERENCE - if (config.type() == "recurrent_nn") { - return newNeuralNetwork("root"); - } else if (config.type() == "multi_nn") { - return new MultiNetwork("root"); - } else { - return newNeuralNetwork(); - } -#else - return new NeuralNetwork(); -#endif -} - -std::map NeuralNetwork::dllInitMap; - -void NeuralNetwork::init(const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu) { - using std::placeholders::_1; - using std::placeholders::_2; - ParamInitCallback paramCallback = nullptr; - if (callback != nullptr) { - paramSelfInited_ = false; - paramCallback = callback; - } else { - paramSelfInited_ = true; - paramCallback = std::bind(parameterInitNN, _1, _2, nullptr); - } - config_ = config; - - if (rootNetwork_ != nullptr) { - // direct use parameters_ and parameterMap_ from base network - CHECK_EQ((size_t)config.parameters_size(), - rootNetwork_->getParameters().size()); - parameters_ = rootNetwork_->getParameters(); - parameterMap_ = *(rootNetwork_->getParameterMap()); - } else { - parameters_.reserve(config.parameters_size()); - for (const auto& para_config : config.parameters()) { - auto parameter = std::make_shared(para_config, - useGpu, - /*initialize=*/false); - paramCallback(parameters_.size(), parameter.get()); - if (!callback) { - for (ParameterType type : - (parameter->isStatic() - ? std::vector{PARAMETER_VALUE} - : parameterTypes)) { - if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) { - parameter->enableType(type); - } - } - } - parameter->setID(parameters_.size()); - parameters_.push_back(parameter); - CHECK(!parameterMap_.count(parameter->getName())); - parameterMap_[parameter->getName()] = parameter; - } - } - - auto layerCreate = [&](const LayerConfig& layer_config) { - auto layer = Layer::create(layer_config); - CHECK(layer) << "Create layer failed. 
Layer name:" << layer->getName(); - layers_.push_back(layer); - CHECK(!layerMap_.count(layer->getName())); - layerMap_[layer->getName()] = layer; - }; - - auto subModelConfig = std::find_if(config.sub_models().begin(), - config.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); - bool useSubModel = (subModelConfig != config.sub_models().end()); - CHECK_EQ(useSubModel, !subModelName_.empty()); - if (useSubModel) { - layers_.reserve(subModelConfig->layer_names_size()); - for (const auto& layer_name : subModelConfig->layer_names()) { - auto layer_config = - std::find_if(config.layers().begin(), - config.layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.layers().end()); - layerCreate(*layer_config); - } - } else { - layers_.reserve(config.layers_size()); - for (const auto& layer_config : config.layers()) { - bool useLayer = true; - if (config.has_external_config()) { - useLayer = true; - for (const auto& name : config.external_config().layer_names()) { - if (layer_config.name() == name) { - useLayer = false; - break; - } - } - } - if (useLayer) { - layerCreate(layer_config); - } - } - } - - for (const auto& layer : layers_) { - layer->init(layerMap_, parameterMap_); - layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu); - } - - for (const auto& layer_name : - (useSubModel ? subModelConfig->input_layer_names() - : config.input_layer_names())) { - auto it = layerMap_.find(layer_name); - CHECK(it != layerMap_.end()); - dataLayers_.push_back(std::dynamic_pointer_cast(it->second)); - } - - for (const auto& layer_name : - (useSubModel ? subModelConfig->output_layer_names() - : config.output_layer_names())) { - auto it = layerMap_.find(layer_name); - CHECK(it != layerMap_.end()); - outputLayers_.push_back(it->second); - } - - for (const auto& layer : layers_) { - const auto& name = layer->getName(); - bool isMiddleLayer = true; - - // if data layer - for (const auto& dataLayer : dataLayers_) { - if (name == dataLayer->getName()) { - isMiddleLayer = false; - break; - } - } - - // if output layer - for (const auto& dataLayer : outputLayers_) { - if (name == dataLayer->getName()) { - isMiddleLayer = false; - break; - } - } - - if (isMiddleLayer) { - middleLayers_.push_back(layer); - } - } -} - -void NeuralNetwork::connect(LayerPtr agentLayer, - LayerPtr realLayer, - int height) { -#ifndef PADDLE_MOBILE_INFERENCE - AgentLayer* agent = dynamic_cast(agentLayer.get()); - CHECK_NOTNULL(agent); - agent->setRealLayer(realLayer, height); -#endif -} - -void NeuralNetwork::connect(std::string agentLayerName, - NeuralNetwork* srcNN, - std::string realLayerName) { - connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName)); -} - -void NeuralNetwork::prefetch(const std::vector& inArgs) { - CHECK_EQ(inArgs.size(), dataLayers_.size()); - - if (paramSelfInited_) { - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); - para->clearGradient(); - if (mat) mat->clearIndices(); - } - } - } - - for (size_t i = 0; i != dataLayers_.size(); ++i) { - if (FLAGS_parallel_nn) { - const_cast(inArgs[i]).deviceId = -1; - } - dataLayers_[i]->setData(inArgs[i]); - } - - for (auto& layer : layers_) { - layer->prefetch(); - } - - if (paramSelfInited_) { - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - auto mat = dynamic_cast( - 
para->getMat(PARAMETER_VALUE).get()); - mat->setupIndices(); - auto matGrad = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - matGrad->reserveStore(); - } - } - } -} - -void NeuralNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - CHECK_EQ(inArgs.size(), dataLayers_.size()); - outArgs->resize(outputLayers_.size()); - for (size_t i = 0; i != dataLayers_.size(); ++i) { - dataLayers_[i]->setData(inArgs[i]); - } - - gLayerStackTrace.set_stage(true); - - { - for (auto& layer : layers_) { - REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); - gLayerStackTrace.push(layer->getName()); - layer->forward(passType); - gLayerStackTrace.pop(layer->getName()); - } - } - - outArgs->clear(); - outArgs->reserve(outputLayers_.size()); - for (auto& layer : outputLayers_) { - outArgs->push_back(layer->getOutput()); - } -} - -void NeuralNetwork::resetState() { - for (auto& layer : layers_) { - layer->resetState(); - } -} - -void NeuralNetwork::setState(const MachineState& machineState) { - for (size_t i = 0; i < layers_.size(); i++) { - if (machineState[i] != nullptr) { - layers_[i]->setState(machineState[i]); - } - } -} - -void NeuralNetwork::getState(MachineState& machineState) { - machineState.clear(); - machineState.reserve(layers_.size()); - for (auto& layer : layers_) { - LayerStatePtr p = layer->getState(); - machineState.push_back(p); - } -} - -void NeuralNetwork::backward(const UpdateCallback& callback) { - gLayerStackTrace.set_stage(false); - FOR_EACH_R(layer, layers_) { - REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); - gLayerStackTrace.push((*layer)->getName()); - if ((*layer)->needGradient()) { - (*layer)->backward(callback); - } - gLayerStackTrace.pop((*layer)->getName()); - } -} - -void NeuralNetwork::finish() { -#ifdef PADDLE_WITH_MKLDNN - FOR_EACH_R(layer, layers_) { - MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); - if (dnnLayer) { - dnnLayer->convertWeightsToPaddle(); - } - } -#endif -} - -Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { - return getLayer(layerName)->getOutput(); -} - -void NeuralNetwork::onPassEnd() { - for (auto& layer : layers_) { - layer->onPassEnd(); - } -} - -void NeuralNetwork::releaseOutput() { - for (auto& layer : middleLayers_) { - Argument& arg = layer->getOutput(); - arg.value.reset(); - } -} - -#ifndef PADDLE_MOBILE_INFERENCE - -class CombinedEvaluator : public Evaluator { - public: - void addEvaluator(std::unique_ptr&& evaluator) { - evaluators_.emplace_back(std::move(evaluator)); - } - void start() override { - for (auto& evaluator : evaluators_) { - evaluator->start(); - } - } - - void finish() override { - for (auto& evaluator : evaluators_) { - evaluator->finish(); - } - } - - void eval(const NeuralNetwork& nn) override { - for (auto& evaluator : evaluators_) { - evaluator->eval(nn); - } - } - real evalImp(std::vector& arguments) override { - (void)arguments; - return -1; - } - void printStats(std::ostream& os) const override { - for (auto& evaluator : evaluators_) { - evaluator->printStats(os); - os << ' '; - } - } - - void distributeEval(ParameterClient2* client) override { - for (auto& evaluator : evaluators_) { - evaluator->distributeEval(client); - } - } - - protected: - std::vector> evaluators_; - - // Evaluator interface - public: - /** - * @brief getNames will return all inside evaluators' names. - * @param names [out]: return names. 
- */ - void getNames(std::vector* names) override { - for (auto& eval : evaluators_) { - eval->getNames(names); - } - } - - /** - * @brief getValue could get all inside evaluators' value. - */ - real getValue(const std::string& name, Error* err) const override { - return this->getMethodHelper( - name, err, [&name, err](const std::unique_ptr& eval) { - return eval->getValue(name, err); - }); - } - - /** - * @brief getType could get all inside evaluators' type. - */ - std::string getType(const std::string& name, Error* err) const override { - return this->getMethodHelper( - name, err, [&name, err](const std::unique_ptr& eval) { - return eval->getType(name, err); - }); - } - - private: - template - T getMethodHelper(const std::string& name, - Error* err, - const std::function&)>& - callback) const { - for (auto& eval : evaluators_) { - std::vector names; - eval->getNames(&names); - if (std::find(names.begin(), names.end(), name) != names.end()) { - return callback(eval); - } - } - *err = Error("No such key %s", name.c_str()); - return T(); - } -}; - -class SubnetEvaluator : public CombinedEvaluator { - public: - SubnetEvaluator(const std::string& layerName, - std::unique_ptr&& evaluator) - : layerName_(layerName) { - addEvaluator(std::move(evaluator)); - } - void eval(const NeuralNetwork& nn) override { - const LayerPtr& layer = nn.getLayer(layerName_); - CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " - << nn.getName(); - bool accessed = false; - layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) { - subnet.eval(evaluators_[0].get()); - accessed = true; - }); - CHECK(accessed) << "There is no subnetwork for layer " << layerName_ - << " in submodel " << nn.getName(); - } - - protected: - std::string layerName_; -}; - -Evaluator* NeuralNetwork::makeEvaluator() const { - CombinedEvaluator* combinedEvaluator = new CombinedEvaluator(); - auto subModelConfig = std::find_if(config_.sub_models().begin(), - config_.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); - bool useSubModel = (subModelConfig != config_.sub_models().end()); - CHECK_EQ(useSubModel, !subModelName_.empty()); - if (useSubModel) { - // create the evaluators that belong to CURRENT submodel - for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) { - // find evaluator by name - auto thisEvalConfig = std::find_if( - config_.evaluators().begin(), - config_.evaluators().end(), - [=](const EvaluatorConfig& ecfg) { - return ecfg.name() == subModelConfig->evaluator_names(i); - }); - bool validConfig = (thisEvalConfig != config_.evaluators().end()); - if (validConfig) { - std::unique_ptr evaluator( - Evaluator::create(*thisEvalConfig)); - combinedEvaluator->addEvaluator(std::move(evaluator)); - } - } - for (auto& layer : layers_) { - layer->accessSubNetwork( - [layer, combinedEvaluator](NeuralNetwork& subnet) { - std::unique_ptr subEvaluator(new SubnetEvaluator( - layer->getName(), - std::unique_ptr(subnet.makeEvaluator()))); - combinedEvaluator->addEvaluator(std::move(subEvaluator)); - }); - } - } else { - for (const EvaluatorConfig& evalConfig : config_.evaluators()) { - std::unique_ptr evaluator(Evaluator::create(evalConfig)); - combinedEvaluator->addEvaluator(std::move(evaluator)); - } - } - return combinedEvaluator; -} - -void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); } - -#endif - -void NeuralNetwork::setOutputGrad(const std::vector& args) { - CHECK_GE(outputLayers_.size(), args.size()); - for 
(size_t i = 0; i < args.size(); ++i) { - outputLayers_[i]->getOutput().grad = args[i].grad; - } -} - -extern NeuralNetwork* newCustomNerualNetwork(const std::string& name, - NeuralNetwork* network) - __attribute__((weak)); - -NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name, - NeuralNetwork* rootNetwork) { - if (newCustomNerualNetwork) { - return newCustomNerualNetwork(name, rootNetwork); - } else { - return new NeuralNetwork(name, rootNetwork); - } -} - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h deleted file mode 100644 index 3e5615c8f0b30ab1283d41e025496051869289dc..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "ModelConfig.pb.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" -#include "paddle/gserver/layers/CostLayer.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/ClassRegistrar.h" - -namespace paddle { -/* - * @brief Init function for the parameters. - * @param paramId: the id of the parameter to init. - * @param para: the pointer to the parameter to init. - * @param sharedParams: the pointer to an array of the parameter to be shared. - * If it is null, no parameter sharing is used. - * Only CPU paramters can be shared. - * It handles CPU, CPU sparse, CPU sparse remote, - * and GPU parameters differently. If the type - * of a parameter is NORMAL. Basically nothing need to be done. - * CPU value: NORMAL. - * CPU param: NORMAL. - * - * CPU sparse value: NORMAL. - * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW. - * - * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE). - * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams) - * MAT_SPARSE_ROW_AUTO_GROW (sharedParams) - * - * GPU value: NORMAL - * GPU param: NORMAL - */ -void parameterInitNN(int paramId, - Parameter* para, - std::vector* sharedParams); - -class NeuralNetwork : public GradientMachine { - public: - virtual void init(const ModelConfig& config, - ParamInitCallback callback = nullptr, - const std::vector& parameterTypes = - std::vector{PARAMETER_VALUE, - PARAMETER_GRADIENT, - PARAMETER_MOMENTUM}, - bool useGpu = FLAGS_use_gpu); - - /** - * Connect two submodels and - * down-submodel's output become up-submodel's input. - * By default, connection is one by one, - * If the agent height is smaller than real layer, *height* has to be filled. - * - * @param realLayer The down-submodel's output layer. - * @param agentLayer The up-submodel's input agent layer. 
- */ - static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0); - void connect(std::string agentLayerName, - NeuralNetwork* srcNN, - std::string realLayerName); - - virtual void prefetch(const std::vector& inArgs); - - virtual void forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType); - - virtual void backward(const UpdateCallback& callback = nullptr); - - virtual Argument getLayerOutput(const std::string& layerName); - - const LayerPtr& getLayer(const std::string& layerName) const { - auto it = layerMap_.find(layerName); - CHECK(it != layerMap_.end()) << "Unknown layer " << layerName; - return it->second; - } - - virtual void onPassEnd(); - -#ifndef PADDLE_MOBILE_INFERENCE - virtual Evaluator* makeEvaluator() const; - - virtual void eval(Evaluator* evaluator) const; -#endif - - virtual void resetState(); - virtual void setOutputGrad(const std::vector& args); - - /// set machine state - virtual void setState(const MachineState& machineState); - - /// get machine state - virtual void getState(MachineState& machineState); - - static NeuralNetwork* create(const ModelConfig& config); - - ParameterMap* getParameterMap() { return ¶meterMap_; } - - /** - * @brief Access each layer as a for each loop. - * @param callback invoke with each layer. - */ - template - void forEachLayer(T callback) { - for (auto& l : layers_) { - if (callback(l)) { - break; - } - } - } - - static NeuralNetwork* newNeuralNetwork(const std::string& name = "", - NeuralNetwork* rootNetwork = nullptr); - - const std::string& getName() const { return subModelName_; } - - /// some finish work, like convert the weight format of MKLDNNLayers - void finish(); - - /** - * @brief Release the middle layer's output memory. - * - * @note This function is used for memory optimization in inference. - */ - void releaseOutput(); - - protected: - /** - * The constructor of NeuralNetwork. - * The sub networks can get parameters_ and parameterMap_ - * from base NeuralNetwork. - * - * @param subModelName The name of sub-model. - * @param rootNetwork It used in MultiNetwork. - */ - NeuralNetwork(std::string subModelName = "", - NeuralNetwork* rootNetwork = nullptr) - : subModelName_(subModelName), rootNetwork_(rootNetwork) {} - - std::string subModelName_; - ModelConfig config_; - std::vector layers_; - ParameterMap parameterMap_; - LayerMap layerMap_; - - std::vector dataLayers_; - std::vector outputLayers_; - std::vector middleLayers_; - - static std::map dllInitMap; - - NeuralNetwork* rootNetwork_; - - /// Whether parameter of this NN is initialized by its own - /// (i.e., not by callback supplied with the caller) - bool paramSelfInited_; -}; - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp deleted file mode 100644 index 85cfc59fbef7017f8dea7fdfecd18aa3e75a871c..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -#include "ParallelNeuralNetwork.h" - -#include -#include - -namespace paddle { - -void ParallelNeuralNetwork::init( - const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu) { - NeuralNetwork::init(config, callback, parameterTypes, useGpu); - - if (config.type() == "recurrent_nn") { - LOG(FATAL) - << "You can not add `--parallel_nn=true` on the command line, " - << "parallel_nn training mode does not support the recurrent_nn model."; - } - - useGpu_ = useGpu; - numDevices_ = 0; - if (useGpu_) { - numDevices_ = hl_get_device_count(); - } - - for (auto& layer : layers_) { - int deviceId = layer->getDeviceId(); - CHECK_LT(deviceId, numDevices_); - addComputeThread(deviceId); - } -} - -void ParallelNeuralNetwork::addComputeThread(int deviceId) { - for (auto& thread : threads_) { - if (thread->getDeviceId() == deviceId) { - return; - } - } - - threads_.emplace_back(new ParallelThread( - threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false)); -} - -void ParallelNeuralNetwork::waitAllThread() { - for (auto& thread : threads_) { - thread->jobEnqueue(NULL, TASK_END_LAYER); - } - - for (size_t i = 0; i < threads_.size(); i++) { - threads_[i]->queue_.waitEmpty(); - } -} - -void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId, - LayerPtr layer, - TaskType task) { - for (auto& thread : threads_) { - if (thread->getDeviceId() == deviceId) { - thread->jobEnqueue(layer, task); - return; - } - } - LOG(FATAL) << "No specific device thread "; -} - -void ParallelNeuralNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - for (auto& thread : threads_) { - thread->setForwardPassType(passType); - } - CHECK_EQ(inArgs.size(), dataLayers_.size()); - outArgs->resize(outputLayers_.size()); - for (size_t i = 0; i != dataLayers_.size(); ++i) { - const_cast(inArgs[i]).deviceId = -1; - dataLayers_[i]->setData(inArgs[i]); - } - - for (auto& layer : layers_) { - dispatchByDeviceId(layer->getDeviceId(), layer, TASK_FORWARD); - } - - { - REGISTER_TIMER("forwardTime"); - waitAllThread(); - } - outArgs->clear(); - outArgs->reserve(outputLayers_.size()); - for (auto& layer : outputLayers_) { - outArgs->push_back(layer->getOutput()); - } -} - -void ParallelNeuralNetwork::backward(const UpdateCallback& callback) { - for (auto& thread : threads_) { - thread->setBackwardCallback(callback); - } - - FOR_EACH_R(layer, layers_) { - dispatchByDeviceId((*layer)->getDeviceId(), *layer, TASK_BACKWARD); - } - { - REGISTER_TIMER("backwardTime"); - waitAllThread(); - } -} - -void ParallelNeuralNetwork::forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { - forward(inArgs, outArgs, passType); - backward(callback); -} - -void ParallelNeuralNetwork::start() { - for (auto& thread : threads_) { - thread->start(); - } -} - -ParallelThread::ParallelThread(int threadId, int deviceId, bool useGpu) - : threadId_(threadId), deviceId_(deviceId), useGpu_(useGpu) {} - -ParallelThread::~ParallelThread() { stop(); } - -void ParallelThread::stop() { - if (computeThread_) { - jobEnqueue(NULL, TASK_THREAD_FINISH); - computeThread_->join(); - computeThread_.reset(nullptr); - } -} - -void ParallelThread::computeThread() { - LOG(INFO) << "gradComputeThread " << threadId_; - - if (useGpu_) { - hl_init(deviceId_); - } - - 
while (true) { - struct Job job_work = queue_.dequeue(); - - if (job_work.task_ == TASK_END_LAYER) { - continue; - } else if (job_work.task_ == TASK_THREAD_FINISH) { - break; - } - - if (TASK_FORWARD == job_work.task_) { - { - REGISTER_TIMER_INFO("waitInputValue", - job_work.layer_->getName().c_str()); - job_work.layer_->waitInputValue(); - } - { - REGISTER_TIMER_INFO("threadForwardTimer", - job_work.layer_->getName().c_str()); - job_work.layer_->forward(passType_); - } - { - REGISTER_TIMER_INFO("copyOutputToOtherDevice", - job_work.layer_->getName().c_str()); - job_work.layer_->copyOutputToOtherDevice(); - } - } else { - { - REGISTER_TIMER_INFO("waitAndMergeOutputGrad", - job_work.layer_->getName().c_str()); - job_work.layer_->waitAndMergeOutputGrad(); - } - { - REGISTER_TIMER_INFO("threadBackwardTimer", - job_work.layer_->getName().c_str()); - job_work.layer_->backward(backwardCallback_); - } - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - job_work.layer_->markAllInputGrad(); - } - } -} - -void ParallelThread::start() { - computeThread_.reset(new std::thread([this]() { computeThread(); })); -} - -void ParallelThread::jobEnqueue(LayerPtr layer, TaskType task) { - struct Job job_work; - job_work.layer_ = layer; - job_work.task_ = task; - queue_.enqueue(job_work); -} - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp deleted file mode 100644 index 73ac8cda721f200c1a02cd9c1d9456df70d7b7d2..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ /dev/null @@ -1,1501 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "RecurrentGradientMachine.h" -#include -#include -#include -#include -#include -#include "NeuralNetwork.h" -#include "paddle/gserver/layers/AgentLayer.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so"); - -static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob"; -static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob"; -static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob"; - -namespace paddle { - -/** - * Start Custom Calculate Probability callback type. - * - * @param nNode, nodes: the path will be explored. nNodes is array size. - * nodes is array elements. - * - * @return: A custom handler id that will passed to another callback. - */ -typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes); - -/** - * Doing Custom Calculation of Probability callback type. - * - * @param handler: User custom handler. The return value from start calc prob. - * @param nNode, nodes: Array. The current path. - * @param curProb: The current log probability that neural network returns. - * - * @return: Log probability which user calculated, it will be updated to this - * path. 
- * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!! - */ -typedef real (*DiyCalcProbCallback)( - int handler, size_t nNodes, int* nodes, real curProb, bool atEos); - -/** - * Finish Custom Calculation of Probability callback type. - * - * @param handler: User custom handler. The return value from start calc prob. - */ -typedef void (*DiyStopCalcProbCallback)(int handler); - -static DiyCalcProbCallback gDiyProbMethod = nullptr; -static DiyStartCalcProbCallback gDiyProbStart = nullptr; -static DiyStopCalcProbCallback gDiyProbStop = nullptr; -static void* gDiyProbHandle = nullptr; - -static void exit_diy_prob() { dlclose(gDiyProbHandle); } - -template -static inline SymbolType loadDiySymbol(const char* symbolName) { - void* sym = dlsym(gDiyProbHandle, symbolName); - CHECK(sym) << "Cannot load symbol " << symbolName << " from " - << FLAGS_diy_beam_search_prob_so; - return reinterpret_cast(sym); -} - -static InitFunction __init__diy_prob_method( - [] { - std::string soName = FLAGS_diy_beam_search_prob_so; - if (!soName.empty()) { - gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY); - CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName; - atexit(exit_diy_prob); - gDiyProbMethod = - loadDiySymbol(DIY_CALC_PROB_SYMBOL_NAME); - gDiyProbStart = loadDiySymbol( - DIY_START_CALC_PROB_SYMBOL_NAME); - gDiyProbStop = loadDiySymbol( - DIY_FINISH_CALC_PROB_SYMBOL_NAME); - } - }, - std::numeric_limits::max()); - -class BeamSearchControlCallbacks { - public: - RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback - beamSearchCandidateAdjust; - RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode; - RecurrentGradientMachine::DropCallback stopDetermineCandidates; - - //! for gcc46 aggregate initialization is not very well, so we need to - //! explicit - BeamSearchControlCallbacks( - const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback& - candidateAdjust, - const RecurrentGradientMachine::NormOrDropNodeCallback& norm, - const RecurrentGradientMachine::DropCallback& stop) - : beamSearchCandidateAdjust(candidateAdjust), - normOrDropNode(norm), - stopDetermineCandidates(stop) {} -}; - -class BeamSearchStatisticsCallbacks { - public: - RecurrentGradientMachine::EachStepCallback onEachStepStarted; - RecurrentGradientMachine::EachStepCallback onEachStepStoped; - - BeamSearchStatisticsCallbacks( - const RecurrentGradientMachine::EachStepCallback& start, - const RecurrentGradientMachine::EachStepCallback& stop) - : onEachStepStarted(start), onEachStepStoped(stop) {} -}; - -RecurrentGradientMachine::RecurrentGradientMachine( - const std::string& subModelName, NeuralNetwork* rootNetwork) - : NeuralNetwork(subModelName), - rootNetwork_(rootNetwork), - beamSearchCtrlCallbacks_(nullptr), - beamSearchStatistics_(nullptr) { - CHECK(!subModelName_.empty()); -} - -/** - * bias layer, as input of memory frame 0 will give vector of zeros - * if bias parameter is not set. - * - * boot bias layer create directly in recurrent gradient machine, because: - * - * 1. It is only one frame, so it should not be placed in layer group, - * which is one instance for every one frame. - * - * 2. It is no input layer, so it need resetHeight() before forward(), - * and resetHeight() must be called in recurrent gradient machine, - * so it's should not be placed in root network. 
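// The FLAGS_diy_beam_search_prob_so hook shown earlier loads user-supplied
// callbacks from a shared object at startup via dlopen/dlsym. Below is a
// minimal standalone sketch of that loading pattern only; the symbol name
// "calc_prob" matches the constant above, but CalcProbFn and loadSymbol are
// simplified, hypothetical stand-ins rather than the real DIY hook signatures.
// (Link with -ldl on Linux.)
#include <dlfcn.h>
#include <cstdio>
#include <cstdlib>

using CalcProbFn = float (*)(int nNodes, const int* nodes, float curProb);

// Load one symbol from an already opened shared object, aborting on failure.
template <typename SymbolType>
SymbolType loadSymbol(void* handle, const char* name) {
  void* sym = dlsym(handle, name);
  if (sym == nullptr) {
    std::fprintf(stderr, "cannot load symbol %s: %s\n", name, dlerror());
    std::abort();
  }
  return reinterpret_cast<SymbolType>(sym);
}

int main(int argc, char** argv) {
  if (argc < 2) return 0;                      // path to the .so is optional
  void* handle = dlopen(argv[1], RTLD_LAZY);
  if (handle == nullptr) {
    std::fprintf(stderr, "cannot open %s: %s\n", argv[1], dlerror());
    return 1;
  }
  CalcProbFn calcProb = loadSymbol<CalcProbFn>(handle, "calc_prob");
  int nodes[] = {1, 2, 3};
  std::printf("adjusted prob: %f\n", calcProb(3, nodes, -0.5f));
  dlclose(handle);
  return 0;
}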
- */ -class BootBiasLayer : public Layer { - protected: - std::unique_ptr biases_; - IVectorPtr cpuIds_; - - public: - explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - if (!Layer::init(layerMap, parameterMap)) return false; - - if (biasParameter_) { - biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - return true; - } - - void resetHeight(int height) { - if (config_.has_bos_id()) { // used as a constant id layerConfig - IVector::resizeOrCreate(output_.ids, height, useGpu_); - output_.ids->reset((int)config_.bos_id()); - } else { - resetOutput(height, getSize()); - } - } - - void forward(PassType passType) override { - if (biases_) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - forwardActivation(); - } - } - - void backward(const UpdateCallback& callback) override { - if (biases_ && biases_->getWGrad()) { - backwardActivation(); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - biases_->getParameterPtr()->incUpdate(callback); - } - } -}; - -void RecurrentGradientMachine::init( - const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu) { - NeuralNetwork::init(config, callback, parameterTypes, useGpu); - useGpu_ = useGpu; - - auto subModelConfig = - std::find_if(config.sub_models().begin(), - config.sub_models().end(), - [this](const SubModelConfig& sub_model) { - return sub_model.name() == this->subModelName_; - }); - CHECK(subModelConfig != config.sub_models().end()); - reversed_ = subModelConfig->reversed(); - generating_ = subModelConfig->has_generator(); - - inFrameLines_.resize(subModelConfig->in_links_size()); - for (size_t i = 0; i < inFrameLines_.size(); ++i) { - inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name(); - inFrameLines_[i].inLayer = - rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name()); - } - - outFrameLines_.resize(subModelConfig->out_links_size()); - for (size_t i = 0; i < outFrameLines_.size(); ++i) { - auto& linkPair = subModelConfig->out_links(i); - outFrameLines_[i].layerName = linkPair.layer_name(); - outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name()); - } - - memoryFrameLines_.resize(subModelConfig->memories_size()); - for (size_t i = 0; i < memoryFrameLines_.size(); ++i) { - auto& memoryConfig = subModelConfig->memories(i); - memoryFrameLines_[i].layerName = memoryConfig.layer_name(); - memoryFrameLines_[i].linkName = memoryConfig.link_name(); - auto agentConfig = - std::find_if(config.layers().begin(), - config.layers().end(), - [&memoryConfig](const LayerConfig& layerConfig) { - return layerConfig.name() == memoryConfig.link_name(); - }); - CHECK(agentConfig != config.layers().end()); - if (memoryConfig.has_boot_layer_name()) { - memoryFrameLines_[i].rootLayer = - rootNetwork_->getLayer(memoryConfig.boot_layer_name()); - - LayerConfig scatterConfig = *agentConfig; - memoryFrameLines_[i].rootAgent.reset( - new ScatterAgentLayer(scatterConfig)); - memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_); - - memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent; - } else { - LayerConfig biasConfig = *agentConfig; - if (memoryConfig.has_boot_bias_parameter_name()) { - biasConfig.set_bias_parameter_name( - memoryConfig.boot_bias_parameter_name()); - biasConfig.set_active_type(memoryConfig.boot_bias_active_type()); - } else if (memoryConfig.has_boot_with_const_id()) { - 
biasConfig.set_bos_id(memoryConfig.boot_with_const_id()); - } - memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig)); - memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_); - - memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer; - } - - if (subModelConfig->has_generator()) { - memoryFrameLines_[i].scatterAgents.resize(2); - for (auto& agent : memoryFrameLines_[i].scatterAgents) { - agent.reset(new ScatterAgentLayer(*agentConfig)); - agent->init(LayerMap(), parameterMap_); - } - } - } - - if (subModelConfig->has_generator()) { - generator_.config = subModelConfig->generator(); - eosFrameLine_.reset(new EosFrameLine); - maxSequenceLength_ = generator_.config.max_num_frames(); - } - - // get parameters actually used by this Layer Group - resizeOrCreateFrames(1); - for (auto& para : frames_[0]->getParameters()) { - if (para->getSharedCount() > 0) { - parameterIds_.push_back(para->getID()); - } - } - for (auto& para : parameters_) { // bias layer parameters - if (para->getSharedCount() > 0) { - parameterIds_.push_back(para->getID()); - } - } -} - -void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { - if ((size_t)numFrames <= frames_.size()) { - return; - } - - frames_.reserve(numFrames); - for (auto& inFrameLine : inFrameLines_) { - inFrameLine.agents.reserve(numFrames); - } - for (auto& outFrameLine : outFrameLines_) { - outFrameLine.frames.reserve(numFrames); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.frames.reserve(numFrames); - memoryFrameLine.agents.reserve(numFrames); - } - if (eosFrameLine_) { - eosFrameLine_->layers.reserve(numFrames); - } - - ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) { - para->enableSharedType(PARAMETER_VALUE, - this->parameters_[paramId]->getBuf(PARAMETER_VALUE), - this->parameters_[paramId]->getMat(PARAMETER_VALUE)); - para->enableSharedType( - PARAMETER_GRADIENT, - this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT), - this->parameters_[paramId]->getMat(PARAMETER_GRADIENT)); - }; - - for (int i = frames_.size(); i < numFrames; ++i) { - std::unique_ptr frame( - NeuralNetwork::newNeuralNetwork(subModelName_)); - frame->init(config_, subParamInitCb); - - for (auto& inFrameLine : inFrameLines_) { - inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName)); - } - - for (auto& outFrameLine : outFrameLines_) { - outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName)); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.frames.push_back( - frame->getLayer(memoryFrameLine.layerName)); - memoryFrameLine.agents.push_back( - frame->getLayer(memoryFrameLine.linkName)); - } - if (eosFrameLine_) { - eosFrameLine_->layers.push_back( - frame->getLayer(generator_.config.eos_layer_name())); - } - - frames_.emplace_back(std::move(frame)); - } -} - -void RecurrentGradientMachine::resizeBootFrame(int numSequences) { - for (auto& memoryFrameLine : memoryFrameLines_) { - if (memoryFrameLine.biasLayer) { - auto biasLayer = - dynamic_cast(memoryFrameLine.biasLayer.get()); - CHECK_NOTNULL(biasLayer); - biasLayer->resetHeight(numSequences); - } else { // check input root layer height - CHECK_EQ(numSequences, - memoryFrameLine.rootLayer->getOutput().getNumSequences()); - } - } -} - -void RecurrentGradientMachine::prefetch(const std::vector& inArgs) { - LOG(FATAL) << "should not use this function"; -} - -void RecurrentGradientMachine::checkInputConsistency( - int inlinkId, const std::vector& seqInfo) { - if 
(commonSeqInfo_.empty()) { - commonSeqInfo_.resize(seqInfo.size()); - for (size_t i = 0; i < seqInfo.size(); ++i) { - commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength; - commonSeqInfo_[i].seqId = seqInfo[i].seqId; - } - } else { - CHECK_EQ(commonSeqInfo_.size(), seqInfo.size()) - << " RecurrentGroup " << subModelName_ << " input " << inlinkId - << " has mismatched number of sequences"; - for (size_t i = 0; i < seqInfo.size(); ++i) { - CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength) - << " RecurrentGroup " << subModelName_ << " input " << inlinkId - << " has mismatched sequence length"; - CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId) - << " RecurrentGroup " << subModelName_ << " input " << inlinkId - << " has mismatched sequence length"; - } - } -} - -void RecurrentGradientMachine::calcNumSequencesAtEachStep() { - int numSequences = commonSeqInfo_.size(); - numSeqs_.resize(maxSequenceLength_); - for (int i = 0; i < numSequences; ++i) { - for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) { - numSeqs_[j] = i + 1; - } - } -} - -void RecurrentGradientMachine::reorganizeInput(PassType passType) { - info_.clear(); - info_.resize(inFrameLines_.size()); - - commonSeqInfo_.clear(); - seqInfos_.clear(); - seqInfos_.resize(inFrameLines_.size()); - - for (size_t i = 0; i < inFrameLines_.size(); i++) { - const Argument& input = inFrameLines_[i].inLayer->getOutput(); - if (!input.hasSeq()) { - continue; - } - input.getSeqInfo(&seqInfos_[i]); - checkInputConsistency(i, seqInfos_[i]); - } - CHECK(!commonSeqInfo_.empty()) - << "At least one input needs to be sequence or subsequence"; - maxSequenceLength_ = commonSeqInfo_[0].topLevelLength; - - calcNumSequencesAtEachStep(); - - for (size_t i = 0; i < inFrameLines_.size(); ++i) { - const Argument& input = inFrameLines_[i].inLayer->getOutput(); - if (!input.hasSeq()) { - seqInfos_[i] = commonSeqInfo_; - } - createInFrameInfo(i, input, passType); - } - - { - AsyncGpuBlock asyncGpuBlock; - - // inFrameLine select rows in real layer one time - for (size_t i = 0; i < inFrameLines_.size(); i++) { - selectRowsOneTime(inFrameLines_[i].inLayer, - info_[i].allIds, - &(inFrameLines_[i].outArg), - passType); - } - } -} - -void RecurrentGradientMachine::reorganizeOutput(PassType passType) { - calcSequenceStartPositions(); - for (size_t i = 0; i < outFrameLines_.size(); ++i) { - Info info; - auto& outFrameLine = outFrameLines_[i]; - ICpuGpuVectorPtr sequenceStartPositions; - ICpuGpuVectorPtr subSequenceStartPositions; - createOutFrameInfo( - outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); - auto gatherAgent = - dynamic_cast(outFrameLine.agentLayer.get()); - CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions, - subSequenceStartPositions, - info.allIds, - info.idIndex); - } -} - -void RecurrentGradientMachine::connectFrames(PassType passType) { - for (auto& memoryFrameLine : memoryFrameLines_) { - if (memoryFrameLine.rootAgent) { - auto scatterAgent = - dynamic_cast(memoryFrameLine.rootAgent.get()); - createMemoryFrameInfo(&memoryFrameLine, passType); - scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer, - memoryFrameLine.outArg, - memoryFrameLine.allIds, - /* idIndex */ 0, - memoryFrameLine.allIds->getSize(), - /* handleBackward */ true); - if (memoryFrameLine.sequenceStartPositions) { - int size = memoryFrameLine.sequenceStartPositions->getSize(); - scatterAgent->setSequenceStartPositions( - memoryFrameLine.sequenceStartPositions, - /* seqStartPosIndex 
*/ 0, - size); - } - } - } - - for (auto& outFrameLine : outFrameLines_) { - auto gatherAgent = - dynamic_cast(outFrameLine.agentLayer.get()); - gatherAgent->clearRealLayers(); - } - for (int i = 0; i < maxSequenceLength_; ++i) { - // connect in_links - for (size_t j = 0; j < inFrameLines_.size(); ++j) { - Info& info = info_[j]; - // idSize denotes the sum number of tokens in each length i - int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i]; - int idSize = info.idIndex.empty() ? numSeqs_[i] - : info.idIndex[i + 1] - info.idIndex[i]; - InFrameLine inFrameLine = inFrameLines_[j]; - auto scatterAgent = - dynamic_cast(inFrameLine.agents[i].get()); - scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, - info.allIds, - idIndex, - idSize, - i == 0); - if (info.sequenceStartPositions) { - // size: the length of subsequence - int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions( - info.sequenceStartPositions, info.seqStartPosIndex[i], size); - } - } - - // connect out_links - for (auto& outFrameLine : outFrameLines_) { - auto gatherAgent = - dynamic_cast(outFrameLine.agentLayer.get()); - gatherAgent->addRealLayer(outFrameLine.frames[i]); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - NeuralNetwork::connect( - memoryFrameLine.agents[i], - i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], - numSeqs_[i] /*height of agent*/); - } - } -} - -void RecurrentGradientMachine::forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { - /* inArgs and outArgs are not used. - The inputs are inFrameLines_[i].inLayer. - The outputs are outFramesLines_[i].agentLayer - */ - - if (generating_) { - generateSequence(); - return; - } // else forward.. 
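// Before the per-step forward below, reorganizeInput(), calcNumSequencesAtEachStep()
// and lenToStarts() (shown later in this diff) reduce the sequence bookkeeping to
// two small computations over per-sequence lengths: how many sequences are still
// active at each time step, and the start offset of each sequence. The standalone
// sketch below mirrors that logic with made-up data; it assumes the lengths are
// sorted in decreasing order, as commonSeqInfo_ is, and the function names are
// illustrative rather than Paddle's.
#include <cstdio>
#include <vector>

// numActive[t]: how many sequences are still running at time step t.
std::vector<int> numActivePerStep(const std::vector<int>& lengths) {
  int maxLen = lengths.empty() ? 0 : lengths.front();
  std::vector<int> numActive(maxLen, 0);
  for (size_t i = 0; i < lengths.size(); ++i)
    for (int t = 0; t < lengths[i]; ++t) numActive[t] = static_cast<int>(i) + 1;
  return numActive;
}

// Turn per-sequence lengths into start offsets (a prefix sum), in the spirit
// of lenToStarts(): starts[i] is the first position of sequence i and
// starts.back() is the total number of positions.
std::vector<int> lengthsToStarts(const std::vector<int>& lengths) {
  std::vector<int> starts(lengths.size() + 1, 0);
  for (size_t i = 0; i < lengths.size(); ++i)
    starts[i + 1] = starts[i] + lengths[i];
  return starts;
}

int main() {
  std::vector<int> lengths = {5, 3, 3, 1};               // sorted, longest first
  std::vector<int> active = numActivePerStep(lengths);   // {4, 3, 3, 1, 1}
  std::vector<int> starts = lengthsToStarts(lengths);    // {0, 5, 8, 11, 12}
  for (int a : active) std::printf("%d ", a);
  std::printf("\n");
  for (int s : starts) std::printf("%d ", s);
  std::printf("\n");
  return 0;
}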
- - reorganizeInput(passType); - int numSequences = commonSeqInfo_.size(); - - resizeOrCreateFrames(maxSequenceLength_); - resizeBootFrame(numSequences); - - connectFrames(passType); - - REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime"); - // forward - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.bootLayer->forward(passType); - } - for (int i = 0; i < maxSequenceLength_; ++i) { - const std::vector inArgs; - std::vector outArgs; - frames_[i]->forward(inArgs, &outArgs, passType); - } - - reorganizeOutput(passType); -} - -void RecurrentGradientMachine::backward(const UpdateCallback& callback) { - if (generating_) { - return; - } - REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); - AsyncGpuBlock asyncGpuBlock; - for (int i = maxSequenceLength_ - 1; i >= 0; --i) { - frames_[i]->backward(nullptr); - } - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.bootLayer->backward(nullptr); - } -} - -void RecurrentGradientMachine::forwardBackward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { - LOG(FATAL) << "should not use this function"; -} - -void RecurrentGradientMachine::eval(Evaluator* evaluator) const { - // call printers frame by frame - for (int i = 0; i < maxSequenceLength_; ++i) { - VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin"; - evaluator->eval(*(frames_[i].get())); - VLOG(2) << "Recurrent Layer Group eval frame " << i << " end"; - } -} - -void RecurrentGradientMachine::registerBeamSearchControlCallbacks( - const BeamSearchCandidatesAdjustCallback& adjustBeamSearch, - const NormOrDropNodeCallback& normOrDropNode, - const DropCallback& stopBeamSearch) { - this->removeBeamSearchControlCallbacks(); - //! for gcc 46, aggregate initialization is not supported. 
TAT - this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks( - adjustBeamSearch, normOrDropNode, stopBeamSearch); -} - -void RecurrentGradientMachine::removeBeamSearchControlCallbacks() { - if (this->beamSearchCtrlCallbacks_) { - delete this->beamSearchCtrlCallbacks_; - this->beamSearchCtrlCallbacks_ = nullptr; - } -} - -void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks( - const EachStepCallback& onEachStepStarted, - const EachStepCallback& onEachStepStoped) { - this->removeBeamSearchStatisticsCallbacks(); - this->beamSearchStatistics_ = - new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped); -} - -void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { - if (this->beamSearchStatistics_) { - delete this->beamSearchStatistics_; - this->beamSearchStatistics_ = nullptr; - } -} - -namespace { -void lenToStarts(std::vector& starts) { - int pos = 0; - starts.back() = 0; - for (auto& start : starts) { - int tmp = start; - start = pos; - pos += tmp; - } - starts.back() = pos; -} -} // namespace - -void RecurrentGradientMachine::calcSequenceStartPositions() { - std::vector starts(commonSeqInfo_.size() + 1); - for (auto& seqInfo : commonSeqInfo_) { - starts[seqInfo.seqId] = seqInfo.topLevelLength; - } - lenToStarts(starts); - ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false); - std::copy(starts.begin(), - starts.end(), - sequenceStartPositions_->getMutableData(false)); -} - -void RecurrentGradientMachine::checkOutputConsistency( - OutFrameLine& outFrameLine) { - bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq(); - for (int i = 0; i < maxSequenceLength_; ++i) { - LayerPtr frame = outFrameLine.frames[i]; - CHECK_EQ(hasSeq, frame->getOutput().hasSeq()); - int numSequences = frame->getOutput().getNumSequences(); - CHECK_EQ(numSeqs_[i], numSequences); - } -} - -void RecurrentGradientMachine::createOutFrameInfo( - OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions) { - checkOutputConsistency(outFrameLine); - - if (!outFrameLine.frames[0]->getOutput().hasSeq()) { - createOutFrameInfo_seq( - outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); - } else { - createOutFrameInfo_subseq( - outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); - } -} - -void RecurrentGradientMachine::createOutFrameInfo_seq( - OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions) { - std::vector allIds; - info.idIndex.resize(1, 0); // first idIndex = 0 - - const int* starts = sequenceStartPositions_->getData(false); - - for (int i = 0; i < maxSequenceLength_; ++i) { - LayerPtr frame = outFrameLine.frames[i]; - size_t numSequences = frame->getOutput().getNumSequences(); - for (size_t j = 0; j < numSequences; ++j) { - int seqStart = starts[commonSeqInfo_[j].seqId]; - int seqLength = commonSeqInfo_[j].topLevelLength; - allIds.push_back(reversed_ ? 
(seqStart + seqLength - 1 - i) - : (seqStart + i)); - } - info.idIndex.push_back(allIds.size()); - } - sequenceStartPositions = sequenceStartPositions_; - copyScattedId(allIds, &info.allIds, allIds.size()); - CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); -} - -void RecurrentGradientMachine::createOutFrameInfo_subseq( - OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions) { - size_t numSequences = commonSeqInfo_.size(); - std::vector allIds; - info.idIndex.resize(1, 0); // first idIndex = 0 - - const int* starts = sequenceStartPositions_->getData(false); - std::vector subStarts(starts[numSequences] + 1); - for (int i = 0; i < maxSequenceLength_; ++i) { - LayerPtr frame = outFrameLine.frames[i]; - size_t numSequences = frame->getOutput().getNumSequences(); - const int* seqStarts = - frame->getOutput().sequenceStartPositions->getData(false); - for (size_t j = 0; j < numSequences; ++j) { - subStarts[starts[commonSeqInfo_[j].seqId] + i] = - seqStarts[j + 1] - seqStarts[j]; - } - } - lenToStarts(subStarts); - - for (int i = 0; i < maxSequenceLength_; ++i) { - LayerPtr frame = outFrameLine.frames[i]; - size_t numSequences = frame->getOutput().getNumSequences(); - for (size_t j = 0; j < numSequences; ++j) { - int pos = starts[commonSeqInfo_[j].seqId] + i; - int subSeqStart = subStarts[pos]; - int subSeqEnd = subStarts[pos + 1]; - for (int k = subSeqStart; k < subSeqEnd; ++k) { - allIds.push_back(k); - } - } - info.idIndex.push_back(allIds.size()); - } - - ICpuGpuVector::resizeOrCreate( - subSequenceStartPositions, subStarts.size(), false); - int* cpuSubSequenceStartPositions = - subSequenceStartPositions->getMutableData(false); - std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions); - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, numSequences + 1, false); - int* cpuSequenceStartPositions = - sequenceStartPositions->getMutableData(false); - for (size_t i = 0; i <= numSequences; ++i) { - cpuSequenceStartPositions[i] = subStarts[starts[i]]; - } - copyScattedId(allIds, &info.allIds, allIds.size()); - CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); -} - -/* create scattered id infomation for all realLayer of inFrameLines one time. - * If hasSubseq, will also create scattered sequenceStartPositions infomation - * for all realLayer of inFrameLines one time. 
- */ -void RecurrentGradientMachine::createInFrameInfo(int inlinkId, - const Argument& input, - PassType passType) { - if (!input.hasSeq()) { - createInFrameInfo_nonseq(inlinkId, input, passType); - } else if (!input.hasSubseq()) { - createInFrameInfo_seq(inlinkId, input, passType); - } else { - createInFrameInfo_subseq(inlinkId, input, passType); - } -} - -void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId, - const Argument& input, - PassType passType) { - std::vector allIds; - - auto& seqInfo = seqInfos_[inlinkId]; - Info* inlinkInfo = &info_[inlinkId]; - inlinkInfo->idIndex.clear(); - for (size_t i = 0; i < seqInfo.size(); ++i) { - allIds.push_back(seqInfo[i].seqId); - } - // copy and check scatterId - copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); -} - -void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId, - const Argument& input, - PassType passType) { - std::vector allIds; - auto& seqInfo = seqInfos_[inlinkId]; - Info* inlinkInfo = &info_[inlinkId]; - inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 - - for (int i = 0; i < maxSequenceLength_; ++i) { - for (int j = 0; j < numSeqs_[i]; ++j) { - int seqLength = seqInfo[j].topLevelLength; - int seqStart = seqInfo[j].seqStart; - allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) - : (seqStart + i)); - } - inlinkInfo->idIndex.push_back(allIds.size()); - } - - // copy and check scatterId - copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); - CHECK_EQ(inlinkInfo->idIndex.size(), - static_cast(maxSequenceLength_ + 1)); -} -void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId, - const Argument& input, - PassType passType) { - std::vector allIds; - - auto& seqInfo = seqInfos_[inlinkId]; - - Info* inlinkInfo = &info_[inlinkId]; - inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 - std::vector sequenceStartPositions; - const int* subSequenceStartPositions = nullptr; - - subSequenceStartPositions = input.subSequenceStartPositions->getData(false); - inlinkInfo->seqStartPosIndex.clear(); - inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 - for (int i = 0; i < maxSequenceLength_; ++i) { - sequenceStartPositions.push_back(0); // first element = 0 - for (int j = 0; j < numSeqs_[i]; ++j) { - int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; - int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; - for (int k = subSeqStart; k < subSeqEnd; ++k) { - allIds.push_back(k); - } - sequenceStartPositions.push_back(sequenceStartPositions.back() + - subSeqEnd - subSeqStart); - } - inlinkInfo->idIndex.push_back(allIds.size()); - inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); - } - // inFrameLine create sequenceStartPositions one time - CHECK_EQ( - sequenceStartPositions.size(), - static_cast(maxSequenceLength_ + input.getNumSubSequences())); - CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), - static_cast(maxSequenceLength_ + 1)); - createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); - - // copy and check scatterId - copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); - CHECK_EQ(inlinkInfo->idIndex.size(), - static_cast(maxSequenceLength_ + 1)); -} - -/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ -void RecurrentGradientMachine::createMemoryFrameInfo( - MemoryFrameLine* memoryFrameLine, PassType passType) { - const Argument& input = (*memoryFrameLine).rootLayer->getOutput(); - size_t numSequences = 
input.getNumSequences(); - std::vector allIds; - bool seqFlag = input.hasSeq(); - CHECK(!input.hasSubseq()) - << "Subsequence boot layer for memory is not supported"; - - if (seqFlag) { // for sequenceScatterAgentLayer - std::vector sequenceStartPositions; - sequenceStartPositions.push_back(0); // first element = 0 - const int* starts = input.sequenceStartPositions->getData(false); - for (size_t i = 0; i < numSequences; ++i) { - // memory info adopt info of inlinks[0] - int seqId = seqInfos_[0][i].seqId; - for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { - allIds.push_back(k); - } - sequenceStartPositions.push_back(sequenceStartPositions.back() + - starts[seqId + 1] - starts[seqId]); - } - createSeqPos(sequenceStartPositions, - &(*memoryFrameLine).sequenceStartPositions); - - } else { // for scatterAgentLayer - for (size_t i = 0; i < numSequences; ++i) { - allIds.push_back(seqInfos_[0][i].seqId); - } - } - // copy and check scatterId - copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize()); - // memoryFrameLine select rows in real layer one time - selectRowsOneTime((*memoryFrameLine).rootLayer, - (*memoryFrameLine).allIds, - &(*memoryFrameLine).outArg, - passType); -} - -void RecurrentGradientMachine::copyScattedId(std::vector& srcIds, - IVectorPtr* dstIds, - int size) { - int idSize = srcIds.size(); - CHECK_EQ(idSize, size); - IVector::resizeOrCreate(*dstIds, idSize, useGpu_); - (*dstIds)->copyFrom(srcIds.data(), idSize); - // check - std::sort(srcIds.begin(), srcIds.end()); - for (int i = 0; i < idSize; ++i) { - CHECK_EQ(srcIds[i], i); - } -} - -void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer, - const IVectorPtr& allIds, - Argument* arg, - PassType passType) { - Argument& src = layer->getOutput(); - if (src.value) { - const MatrixPtr& realV = src.value; - int height = realV->getHeight(); - int width = realV->getWidth(); - Matrix::resizeOrCreate( - arg->value, height, width, /* trans */ false, useGpu_); - arg->value->zeroMem(); - arg->value->selectRows(*realV, *allIds); - if (passType != PASS_TEST) { - Matrix::resizeOrCreate( - arg->grad, height, width, /* trans */ false, useGpu_); - arg->grad->zeroMem(); - } - } - if (src.ids) { - IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_); - arg->ids->selectFrom(*src.ids, *allIds); - } -} - -void RecurrentGradientMachine::createSeqPos( - const std::vector& sequenceStartPosition, - ICpuGpuVectorPtr* sequenceStartPositions) { - int size = sequenceStartPosition.size(); - const int* data = sequenceStartPosition.data(); - ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false); - (*sequenceStartPositions)->copyFrom(data, size, false); -} - -size_t RecurrentGradientMachine::getGenBatchSize() { - size_t numSequences = 0; - for (auto& memoryFrameLine : memoryFrameLines_) { - if (!memoryFrameLine.rootLayer) continue; - Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); - size_t batchSize = bootArg.getNumSequences(); - if (numSequences) { - CHECK_EQ(numSequences, batchSize); - } else { - numSequences = batchSize; - } - } - CHECK(numSequences) - << "Fail to get batch size in generation. 
" - "At least one of the Memory layer MUST have a layer that is NOT in " - "the layer group to boot it, and this boot layer is used to " - "decide batch_size in generation process."; - return numSequences; -} - -void RecurrentGradientMachine::generateSequence() { - CHECK_NOTNULL(eosFrameLine_.get()); - CHECK_GE(outFrameLines_.size(), 1UL); - size_t numSequences = getGenBatchSize(); - - resizeBootFrame(numSequences); - // We create only two sub-network in generation, one stores states of all - // layers in previous time step and the other storing the states at current - // time step. - resizeOrCreateFrames(2); - - // outFrameLines_.size() > 1UL - dataArgsSize_ = outFrameLines_.size() - 1; - dataArgs_.resize(dataArgsSize_); - dataArgsFrame_.clear(); - dataArgsFrame_.resize(dataArgsSize_); - - // connect boot frame memory links - std::vector ids(numSequences); - for (size_t i = 0; i < numSequences; ++i) { - ids[i] = i; - } - for (auto& memoryFrameLine : memoryFrameLines_) { - if (memoryFrameLine.rootAgent) { - auto scatterAgent = - dynamic_cast(memoryFrameLine.rootAgent.get()); - scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids); - } - NeuralNetwork::connect( - memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); - } - - // boot layer forward - AsyncGpuBlock asyncGpuBlock; - - for (auto& memoryFrameLine : memoryFrameLines_) { - memoryFrameLine.bootLayer->forward(PASS_TEST); - } - - // init outArg - size_t resultNum = generator_.config.num_results_per_sample(); - size_t maxGenWordCount = - generator_.config.max_num_frames() * numSequences * resultNum; - IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); - if (resultNum > 1) { - CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); - Matrix::resizeOrCreate(generator_.outArg.in, - /* height */ numSequences, - /* width */ resultNum, - false, - /* useGpu */ false); - } - ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, - numSequences + 1, - /* useGpu */ false); - if (getBeamSize() > 1) { - beamSearch(numSequences); - } else { - oneWaySearch(numSequences); - } - if (dataArgsSize_) createDataOutlink(); - - size_t size = generator_.ids.size(); - generator_.outArg.ids->resize(size); - generator_.outArg.ids->copyFrom(generator_.ids.data(), size); - - OutFrameLine& outFrameLine = outFrameLines_[0]; - auto dataAgent = dynamic_cast(outFrameLine.agentLayer.get()); - CHECK_NOTNULL(dataAgent); - dataAgent->setData(generator_.outArg); - dataAgent->prefetch(); -} - -void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { - OutFrameLine& outFrameLine = outFrameLines_[0]; - - // finalPaths_[0] stores the generated results of the - // entire batch, so its size exactly equals to batchSize. 
- finalPaths_.clear(); - finalPaths_.resize(1); - std::vector& finalPaths = finalPaths_[0]; - finalPaths.resize(batchSize); - - seqIds_.resize(batchSize); - std::vector scatterIds; - for (size_t i = 0; i < batchSize; ++i) { - finalPaths[i].seqId = i; - seqIds_[i] = i; - } - - // forward - for (int i = 0; i < maxSequenceLength_; ++i) { - if (i && scatterIds.empty()) break; - int machineCur = i % 2; - int machinePrev = (i - 1) % 2; - // connect memory links - if (i) { - seqIds_.clear(); - for (size_t j = 0; j < batchSize; ++j) { - if (finalPaths[j].seqId != -1) seqIds_.push_back(j); - } - - for (auto& memoryFrameLine : memoryFrameLines_) { - auto scatterAgent = dynamic_cast( - memoryFrameLine.scatterAgents[machineCur].get()); - scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], - scatterIds); - scatterAgent->forward(PASS_TEST); - NeuralNetwork::connect(memoryFrameLine.agents[machineCur], - memoryFrameLine.scatterAgents[machineCur]); - } - } - const std::vector inArgs; - std::vector outArgs; - frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST); - - const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids; - for (size_t j = 0; j < seqIds_.size(); ++j) { - finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j)); - finalPaths[seqIds_[j]].machineIdVec.push_back(j); - } - - copyDataOutlinkFrame(machineCur); - - // check eos - const IVectorPtr& eosVec = - eosFrameLine_->layers[machineCur]->getOutput().ids; - scatterIds.clear(); - for (size_t j = 0; j < seqIds_.size(); ++j) { - if (eosVec->getElement(j) == 1U) { - // path.seqId = -1 indicates end of generation - // of an input sequence - finalPaths[seqIds_[j]].seqId = -1; - } else { - scatterIds.push_back(j); - } - } - } - - batchMachineIdVec_.clear(); - batchMachineStartPos_.clear(); - int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); - starts[0] = 0; - generator_.ids.clear(); - for (size_t i = 0; i < batchSize; ++i) { - generator_.ids.insert(generator_.ids.end(), - finalPaths[i].ids.begin(), - finalPaths[i].ids.end()); - starts[i + 1] = generator_.ids.size(); - batchMachineIdVec_.insert(batchMachineIdVec_.end(), - finalPaths[i].machineIdVec.begin(), - finalPaths[i].machineIdVec.end()); - } -} - -void RecurrentGradientMachine::connectPrevFrame(int stepId, - std::vector& paths) { - int machineCur = stepId % 2; - int machinePrev = (stepId - 1) % 2; - int beam = getBeamSize(); - machineIds_.clear(); - topIds_.clear(); - seqIds_.clear(); - - for (size_t j = 0; j < paths.size(); ++j) { - machineIds_.push_back(paths[j].machineId); - topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex); - seqIds_.push_back(paths[j].seqId); - } - - for (auto& memoryFrameLine : memoryFrameLines_) { - bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName); - auto scatterAgent = dynamic_cast( - memoryFrameLine.scatterAgents[machineCur].get()); - scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], - isOutIds ? 
topIds_ : machineIds_); - scatterAgent->forward(PASS_TEST); - NeuralNetwork::connect(memoryFrameLine.agents[machineCur], - memoryFrameLine.scatterAgents[machineCur]); - } -} - -void RecurrentGradientMachine::forwardFrame(int machineCur) { - // forward - const std::vector inArgs; - std::vector outArgs; - frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST); - - copyDataOutlinkFrame(machineCur); - - IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids; - MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in; - IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids; - if (useGpu_) { - IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */); - cpuId_->copyFrom(*ids); - Matrix::resizeOrCreate(cpuProb_, - in->getHeight(), - in->getWidth(), - false /* trans */, - false /* useGpu */); - cpuProb_->copyFrom(*in); - IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */); - cpuEos_->copyFrom(*eos); - } else { - cpuId_ = ids; - cpuProb_ = in; - cpuEos_ = eos; - } -} - -void RecurrentGradientMachine::singlePathExpand(Path& curPath, - size_t curPathId, - std::vector& newPaths, - size_t expandWidth) { - int calc_id = - gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; - - const int* idVec = cpuId_->getData(); - const real* probMat = cpuProb_->getData(); - const int* eosVec = cpuEos_->getData(); - - for (size_t k = 0; k < expandWidth; k++) { - int index = curPathId * expandWidth + k; - int id = idVec[index]; - real prob = probMat[index]; - /* - * Ordinarily, beam search greedily expands the most promising expandWidth - * paths that currently are ALWAYS returned by MaxIdLayer. - * In one condition, if user customizes the beam search procedure by - * restricting the expansion within a user defined subset, - * as a result, MaxIdLayer possibly COULD NOT return expandWidth - * vaild expansions, and it will use -1 to indicate the end of valid - * expansion candidates. - */ - if (id == -1) break; - - real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; - Path newPath( - curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/); - if (this->beamSearchCtrlCallbacks_) { - if (beamSearchCtrlCallbacks_->stopDetermineCandidates( - newPath.seqId, newPath.ids, newPath.probHistory)) - return; - } - // outFrameLines_.size() > 1UL - if (dataArgsSize_) { - newPath.machineIdVec = curPath.machineIdVec; - newPath.machineIdVec.push_back(curPathId); - } - bool atEos = - eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; - // adjustNewPath - newPath.adjustProb(calc_id, atEos); - if (this->beamSearchCtrlCallbacks_) { - this->beamSearchCtrlCallbacks_->normOrDropNode( - newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); - } - if (!newPath.isDropable()) { - atEos ? finalPaths_[curPath.seqId].push_back(newPath) - : newPaths.push_back(newPath); - } - } // for expandWidth - - if (gDiyProbStop) { - gDiyProbStop(calc_id); - } -} - -void RecurrentGradientMachine::beamExpand(std::vector& paths, - std::vector& newPaths) { - size_t candidatePathCount = paths.size(); - // idVec.size() could be larger than candidatePathCount * beam, - // so user can drop some node customly. 
- CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL); - size_t expandWidth = cpuId_->getSize() / candidatePathCount; - - // iterate over each sequence - size_t totalExpandCount = 0; - int prevSeqId = -1; - int curSeqId = 0; - for (size_t j = 0; j <= candidatePathCount; j++) { - // expansions of a single sequence are all processed - curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); - if (prevSeqId != -1 && curSeqId != prevSeqId) { - totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); - } - if (j == candidatePathCount) return; - singlePathExpand(paths[j], j, newPaths, expandWidth); - - prevSeqId = paths[j].seqId; - } // for paths -} - -// Drop extra nodes to beam size. -size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, - size_t seqId, - size_t totalExpandCount) { - size_t minNewPathSize = - std::min(getBeamSize(), newPaths.size() - totalExpandCount); - if (!minNewPathSize) { - return 0; - } - std::nth_element(newPaths.begin() + totalExpandCount, - newPaths.begin() + totalExpandCount + minNewPathSize, - newPaths.end(), - Path::greaterPath); - newPaths.resize(totalExpandCount + minNewPathSize); - - real minPathLogProb = - std::min_element(newPaths.end() - minNewPathSize, newPaths.end()) - ->logProb; - real maxPathLogProb = - std::max_element(newPaths.end() - minNewPathSize, newPaths.end()) - ->logProb; - - // Remove the already formed paths that are relatively short - finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), - finalPaths_[seqId].end(), - [&](Path& p) { return p.logProb < minPathLogProb; }), - finalPaths_[seqId].end()); - for (auto p : finalPaths_[seqId]) { - if (minFinalPathLogProb_[seqId] > p.logProb) { - minFinalPathLogProb_[seqId] = p.logProb; - } - } - - if (finalPaths_[seqId].size() >= getBeamSize() && - minFinalPathLogProb_[seqId] >= maxPathLogProb) { - newPaths.resize(totalExpandCount); - return 0; - } - return minNewPathSize; -} - -void RecurrentGradientMachine::fillGenOutputs() { - size_t numResults = generator_.config.num_results_per_sample(); - for (size_t i = 0; i < finalPaths_.size(); ++i) { - size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size()); - std::partial_sort(finalPaths_[i].begin(), - finalPaths_[i].begin() + minFinalPathsSize, - finalPaths_[i].end(), - Path::greaterPath); - finalPaths_[i].resize(minFinalPathsSize); - } - - generator_.ids.clear(); - int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); - starts[0] = 0; - if (numResults > 1) { - int idsProbSaveSize = 0; - for (auto inSeq : finalPaths_) { - for (auto path : inSeq) idsProbSaveSize += path.ids.size(); - idsProbSaveSize += inSeq.size(); - } - Matrix::resizeOrCreate( - generator_.outArg.value, idsProbSaveSize, 1, false, false); - real* idsProb = generator_.outArg.value->getData(); - - real* probs = generator_.outArg.in->getData(); - size_t curPos = 0; - for (size_t i = 0; i < finalPaths_.size(); ++i) { - for (size_t j = 0; j < finalPaths_[i].size(); ++j) { - Path& path = finalPaths_[i][j]; - size_t genLen = path.ids.size(); - generator_.ids.push_back(genLen); // sequence size - generator_.ids.insert( - generator_.ids.end(), path.ids.begin(), path.ids.end()); - generator_.ids.push_back(-1); // end of sequence - - memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen); - curPos += genLen; - idsProb[curPos++] = -1.0; - probs[i * numResults + j] = path.logProb; - } - starts[i + 1] = generator_.ids.size(); - } - } else { - for (size_t i = 0; i < finalPaths_.size(); ++i) { - 
CHECK(!finalPaths_[i].empty()); - Path& path = finalPaths_[i][0]; - generator_.ids.insert( - generator_.ids.end(), path.ids.begin(), path.ids.end()); - starts[i + 1] = starts[i] + path.ids.size(); - } - } -} - -void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { - for (size_t i = 0; i < dataArgsSize_; i++) { - Argument outFrame; - outFrame.resizeAndCopyFrom( - outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_); - dataArgsFrame_[i].emplace_back(outFrame); - } -} - -void RecurrentGradientMachine::createDataOutlinkSelRowsInfo( - bool isSeq, std::vector& outArgs) { - batchMachineIdVec_.clear(); - - size_t seqIdx = 0; - for (size_t i = 0; i < finalPaths_.size(); ++i) { - for (size_t j = 0; j < finalPaths_[i].size(); ++j) { - std::vector& machineIdVec = finalPaths_[i][j].machineIdVec; - if (isSeq) { - for (size_t i = 0; i < machineIdVec.size(); ++i) { - size_t rowId = machineIdVec[i]; - int* seqPos = - outArgs[i].sequenceStartPositions->getMutableData(false); - batchMachineIdVec_.push_back(seqPos[rowId]); - } - } else { - batchMachineIdVec_.insert( - batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end()); - } - seqIdx++; - } - } -} - -void RecurrentGradientMachine::createDataOutlinkCopySizeInfo( - bool isSeq, std::vector& outArgs, std::vector& copySize) { - size_t totalSeqNum = std::accumulate( - finalPaths_.begin(), - finalPaths_.end(), - 0UL, - [](size_t a, const std::vector& b) { return a + b.size(); }); - copySize.resize(totalSeqNum, 1); - - batchMachineStartPos_.resize(totalSeqNum + 1, 0); - if (isSeq) { - ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions; - CHECK_EQ(static_cast(inputSeqStartPos->getSize() - 1), - getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size()); - int* starts = inputSeqStartPos->getMutableData(false); - int seqId = 0; - for (size_t i = 0; i < finalPaths_.size(); ++i) { - for (size_t j = 0; j < finalPaths_[i].size(); ++j) { - copySize[seqId] = getBeamSize() > 1 ? 
starts[i + 1] - starts[i] - : starts[j + 1] - starts[j]; - batchMachineStartPos_[seqId + 1] = - batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size(); - seqId++; - } - } - } else { - for (size_t i = 0; i < finalPaths_[0].size(); ++i) - batchMachineStartPos_[i + 1] = - batchMachineStartPos_[i] + finalPaths_[0][i].ids.size(); - } -} - -void RecurrentGradientMachine::createDataOutlink() { - for (size_t i = 0; i < dataArgsSize_; i++) { - bool isSeq = dataArgsFrame_[i][0].hasSeq(); - std::vector copySize; - createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize); - createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]); - - dataArgs_[i].concat(dataArgsFrame_[i], - batchMachineIdVec_, - batchMachineStartPos_, - copySize, - useGpu_, - HPPL_STREAM_1, - PASS_TEST); - auto dataAgent = - dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); - CHECK_NOTNULL(dataAgent); - dataAgent->setData(dataArgs_[i]); - } -} - -void RecurrentGradientMachine::beamSearch(size_t batchSize) { - finalPaths_.clear(); - finalPaths_.resize(batchSize); - seqIds_.resize(batchSize); - minFinalPathLogProb_.clear(); - minFinalPathLogProb_.resize(batchSize, 0); - - std::vector paths; - std::vector newPaths; - for (size_t i = 0; i < batchSize; ++i) { - paths.push_back(Path(i)); - if (this->beamSearchCtrlCallbacks_) { - paths.back().recordHistory(); - } - } - - // restart beam search - stopBeamSearch_ = false; - for (int i = 0; i < maxSequenceLength_; ++i) { - int machineCur = i % 2; - std::unique_ptr< - ScopedCallbacks> - statisticsBlock; - if (this->beamSearchStatistics_) { - auto ptr = - new ScopedCallbacks(beamSearchStatistics_->onEachStepStarted, - beamSearchStatistics_->onEachStepStoped, - i); - statisticsBlock.reset(ptr); - } - if (stopBeamSearch_) break; - - if (i) connectPrevFrame(i, paths); - - if (this->beamSearchCtrlCallbacks_) { - std::vector*> prefixes; - prefixes.resize(paths.size()); - std::transform( - paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) { - return const_cast*>(&p.ids); - }); - beamSearchCtrlCallbacks_->beamSearchCandidateAdjust( - prefixes, frames_[machineCur].get(), i); - } - - forwardFrame(machineCur); - beamExpand(paths, newPaths); - if (newPaths.empty()) break; - - paths = newPaths; - newPaths.clear(); - } // end for machineCur - fillGenOutputs(); -} - -void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) { - if (gDiyProbMethod) { - logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos); - } -} - -} // namespace paddle diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h deleted file mode 100644 index 7e943cebd35234ba7af357c9f64fde6b0a9546ce..0000000000000000000000000000000000000000 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ /dev/null @@ -1,580 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "GradientMachine.h" -#include "NeuralNetwork.h" - -#include "paddle/utils/Locks.h" - -namespace paddle { - -/** - * Private data class declares. - * Used for user customized beam search. - */ -class BeamSearchControlCallbacks; -class BeamSearchStatisticsCallbacks; - -class RecurrentGradientMachine : public NeuralNetwork { - public: - RecurrentGradientMachine(const std::string& subModelName, - NeuralNetwork* rootNetwork); - - // Disable copy and assign. - RecurrentGradientMachine(const RecurrentGradientMachine& other) = delete; - RecurrentGradientMachine& operator=(const RecurrentGradientMachine& other) = - delete; - - virtual ~RecurrentGradientMachine() { - this->removeBeamSearchStatisticsCallbacks(); - this->removeBeamSearchControlCallbacks(); - } - - virtual void init(const ModelConfig& config, - ParamInitCallback callback, - const std::vector& parameterTypes, - bool useGpu); - - virtual void prefetch(const std::vector& inArgs); - - virtual void forward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType); - - virtual void backward(const UpdateCallback& callback = nullptr); - - void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback); - - virtual void resetState() {} - virtual void eval(Evaluator* evaluator) const; - - const std::vector& getParameterIds() { return parameterIds_; } - - /** - * @brief BeamSearchCandidatesAdjustCallback - * - * Adjust searching candidates to restrict beam search - * searching within a limited subset of all possibile paths. - * - * The first parameter is the prefixes of all formed paths in current - * beam search step, whose type is basically int[][]. - * - * The second parameter is a pointer to the network used to generate sequence, - * user can use this pointer to tranverse each layer in the network to - * modify behaivors of a particular layer. - * - * The third parameter is an integer to indicate the iteration number of - * beam search, so that user can customize different operations in different - * beam search iterations. - */ - typedef std::function*>&, NeuralNetwork*, const int)> - BeamSearchCandidatesAdjustCallback; - - /** - * @brief DropCallback - * - * Drop a whole prefix or one candidate in beam search or not. - * - * The first parameter is sequence index in a batch - * - * The second parameter is one path in beam search, - * which is made up of node indices. - * - * The third parameter is probabilites for each node in this path. - * - * Return true if this prefix or candidate is expected to be dropped. - */ - typedef std::function&, const std::vector&)> - DropCallback; - - /** - * @brief NormOrDropNodeCallback - * - * Normalize a path's probabilities or just drop it by modifying path.logProb - * - * The first parameter is sequence index in a batch - * - * The second parameter is path.ids - * - * The third parameter is probabilites for each node in this path. - * - * The fourth parameter is the probability of the whole path. - */ - typedef std::function&, std::vector&, real*)> - NormOrDropNodeCallback; - - /** - * @brief Register beam search control callbacks. Used for prediction. - * - * @param queryBeamSearch: Give the sequences already formed, return the - * nodes expected to be expanded. - * Input: A pointer to an array holding pathes which have been expanded - * Return: A pointer to an array holding nodes wanted to be expanded. - * - * @param dropOneNode: Early drop a node in one beam search step. 
- * Given the path formed and probability history, decide whether a node - * should be dropped or not. - * - * @param stopBeamSearch: Early stop a path in one beam search step. - * Given the path and probability history, decide whether a path - * should be dropped or not. - */ - void registerBeamSearchControlCallbacks( - const BeamSearchCandidatesAdjustCallback& adjustBeamSearch, - const NormOrDropNodeCallback& normOrDropNode, - const DropCallback& stopBeamSearch); - - /** - * @brief Remove user costumized beam search callbacks, - * - * make sequence generation acts like normal beam search. - */ - void removeBeamSearchControlCallbacks(); - - /** - * @brief EachStepCallback - * - * Invoke with beam search step. - */ - typedef std::function EachStepCallback; - - /** - * @brief register statistics methods for performance profile of beam search. - * - * @param onEachStepStarted: invoke once a beam search step starts. - * Its input is index of the beam search step. - * - * @param onEachStepStoped: invoke once a beam search step ends. - * Its input is index of the beam search step. - */ - void registerBeamSearchStatisticsCallbacks( - const EachStepCallback& onEachStepStarted, - const EachStepCallback& onEachStepStoped); - - /** - * @brief Remove beam search callbacks. - */ - void removeBeamSearchStatisticsCallbacks(); - - /** - * @brief Stop beam search for current source. - * - * Will restart beam search in the next forward - */ - void stopBeamSearch(); - - struct Path { - /** - * @brief ids, path of beam search. - */ - std::vector ids; - - /** - * @brief idsProb, log probability of each generated word. - */ - std::vector idsProb; - - /** - * @brief logProb, current probability of path. - */ - real logProb; - - int machineId; // index of sample in frame - int topIndex; // index of MaxIdLayer output in one sample - int seqId; // index of sequence in batch generation - std::vector machineIdVec; - - /** - * @brief A record of each node's probality in a formed path in beam search. - * - * @note It could be empty when history is not recorded. If the history is - * wanted to be recorded, recordHistory() MUST be invoked first. - */ - std::vector probHistory; - - /** - * @brief Path default ctor, first logProb is 0. - */ - Path() { - logProb = 0; - seqId = 0; - } - explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; } - - /** - * @brief Create a new path based on an old path and - * a new node with probability. - * - * @param old old path - * @param newId index of the new node - * @param logProb probability of the new node. - * @param machineId sample index of a frame in RNN - * @param topIndex index of MaxIdLayer output in one sample - */ - Path(Path& old, int newId, real logProb, int machineId, int topIndex) - : ids(old.ids), - idsProb(old.idsProb), - logProb(old.logProb + logProb), - machineId(machineId), - topIndex(topIndex), - seqId(old.seqId) { - ids.push_back(newId); - idsProb.push_back(logProb); - if (!old.probHistory.empty()) { - this->probHistory = old.probHistory; - // probHistory store current prob, not sum - this->probHistory.push_back(logProb); - } - } - - /** - * @brief operator < - * - * Path a < Path b means log probability of a is smaller than that of b - */ - bool operator<(const Path& other) const { - return (logProb < other.logProb); - } - - static bool greaterPath(const Path& a, const Path& b) { return (b < a); } - - /** - * @brief Start recording history in this path. 
- */ - void recordHistory() { this->probHistory.push_back(this->logProb); } - - /** - * @brief Adjust probability for DIY beam search interface. - * In normal situation, it will do nothing. - * - * @param calc_id: the object id for DIY beam search interface. - * @param atEos: at end of sequence or not. - */ - void adjustProb(int calc_id, bool atEos = false); - - /** - * @brief isDropable indicating whether the current node will be - * dropped or not in beam search. - * - * @note: if logProb is -inf, current node will be dropped. - * @return true to drop the current node. - */ - bool isDropable() const { return std::isinf(logProb) && logProb < 0; } - }; - - /** - * @brief access beam search results. - * @return beam search results. - */ - const std::vector<std::vector<Path>>& getFinalPaths() const { - return this->finalPaths_; - } - - protected: - std::vector<Argument::SeqInfo> commonSeqInfo_; - ICpuGpuVectorPtr sequenceStartPositions_; - void calcSequenceStartPositions(); - void checkInputConsistency(int inlinkId, - const std::vector<Argument::SeqInfo>& seqInfo); - void reorganizeInput(PassType passType); - void reorganizeOutput(PassType passType); - void connectFrames(PassType passType); - void calcNumSequencesAtEachStep(); - - void resizeOrCreateFrames(int numFrames); - void resizeBootFrame(int numSequences); - - void generateSequence(); - void oneWaySearch(size_t batchSize); - void beamSearch(size_t batchSize); - - struct InFrameLine { - std::string linkName; - LayerPtr inLayer; - std::vector<LayerPtr> agents; // Scatter Agents to reform batch input - Argument outArg; // scatter output argument - }; - std::vector<InFrameLine> inFrameLines_; - - struct OutFrameLine { - std::string layerName; - LayerPtr agentLayer; - std::vector<LayerPtr> frames; - }; - std::vector<OutFrameLine> outFrameLines_; - - struct MemoryFrameLine { - std::string layerName; - std::string linkName; - LayerPtr bootLayer; // actually used biasLayer or rootAgent - LayerPtr biasLayer; - LayerPtr rootLayer; // layer in root network to boot this memory - LayerPtr rootAgent; // agent to link rootLayer - std::vector<LayerPtr> frames; - std::vector<LayerPtr> agents; - std::vector<LayerPtr> scatterAgents; // scatter agent used by beam search - Argument outArg; // scatter output argument - // Different memoryFrameLines have different elements as follows - IVectorPtr allIds; // scattered id of realLayer - ICpuGpuVectorPtr - sequenceStartPositions; // scattered sequenceStartPositions - }; - std::vector<MemoryFrameLine> memoryFrameLines_; - - // Each inFrameLines(inlinks) has its own info(elements) below, - // and all outFrameLines(outlinks) share the info with one inFrameLine, - // which is assigned by targetInfoInlinkId_. - struct Info { - // The original positions in the original batch - IVectorPtr allIds; // scattered id of realLayer [batchSize] - - // index of allIds for each step [maxSequenceLength_] - // idIndex[i] is the total length of the first i sequences - std::vector<int> idIndex; - - ICpuGpuVectorPtr - sequenceStartPositions; // scattered sequenceStartPositions - std::vector<int> seqStartPosIndex; // index of sequenceStartPositions - }; - std::vector<Info> info_; // for input - - // numSeqs_[i] is the number of sequences longer than i (for sequence - // data) or with more than i subsequences (for subsequence data). - // Equivalently, numSeqs_[i] is the number of sequences at step i; - std::vector<int> numSeqs_; - - std::vector<std::vector<Argument::SeqInfo>> seqInfos_; - - void checkOutputConsistency(OutFrameLine& outFrameLine); - - /* create scattered id information for all realLayer of inFrameLines one time. 
- * If hasSubseq, will also create scattered sequenceStartPositions infomation - * for all realLayer of inFrameLines one time. - */ - void createInFrameInfo(int inlinks_id, - const Argument& input, - PassType passType); - void createInFrameInfo_nonseq(int inlinks_id, - const Argument& input, - PassType passType); - void createInFrameInfo_seq(int inlinks_id, - const Argument& input, - PassType passType); - void createInFrameInfo_subseq(int inlinks_id, - const Argument& input, - PassType passType); - - void createOutFrameInfo(OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions); - void createOutFrameInfo_seq(OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions); - void createOutFrameInfo_subseq(OutFrameLine& outFrameLine, - Info& info, - ICpuGpuVectorPtr& sequenceStartPositions, - ICpuGpuVectorPtr& subSequenceStartPositions); - - void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, - PassType passType); - - void copyScattedId(std::vector& srcIds, IVectorPtr* dstIds, int size); - - void selectRowsOneTime(LayerPtr layer, - const IVectorPtr& allIds, - Argument* arg, - PassType passType); - - void createSeqPos(const std::vector& sequenceStartPosition, - ICpuGpuVectorPtr* sequenceStartPositions); - - // for generator - struct EosFrameLine { - std::vector layers; - }; - std::unique_ptr eosFrameLine_; - - struct Generator { - GeneratorConfig config; - std::vector ids; // store generated sequences - std::vector idsProb; // log probability of each generated word - Argument outArg; // final output argument - }; - bool generating_; - Generator generator_; - - std::vector> frames_; - - NeuralNetwork* rootNetwork_; - bool reversed_; - - int maxSequenceLength_; // Max top-level length - bool useGpu_; - bool stopBeamSearch_; - - std::vector - parameterIds_; // parameters actually used by this Layer Group - - // store final argument of outFrameLines_ - std::vector dataArgs_; - // store each frame's output argument of outFrameLines_ - std::vector> dataArgsFrame_; - size_t dataArgsSize_; // size of dataArgs_ = size of dataArgsFrame_ - - IVectorPtr cpuId_; - MatrixPtr cpuProb_; - IVectorPtr cpuEos_; - - private: - /* - * @return beam size in beam search - */ - size_t getBeamSize() { return generator_.config.beam_size(); } - - /* - * @return number of sequence in a batch in generation - */ - size_t getGenBatchSize(); - - /* - * @brief store output of the machineCur-th frame during generation, for - * creating the final outlink after the entire generation process is finished. - * - * In generation, if the layer group has more than 1 outlink, the first - * one is reserved to store the generated word indices, the others are data - * outlinks, that can be used like a common layer in the network. - * - * @param machineCur : index to access the layer group frame in - * currrent generation step. - */ - void copyDataOutlinkFrame(size_t machineCur); - - /* - * @brief In generation, if the layer group has more than 1 outlink, outlink - * except the first one is a data outlink. In RecurrentLayerGroup, each time - * step is a separate Network, outputs of a layer inside the - * RecurrentLayerGroup are stored in separate Arguments. If one layer is - * specified as an outlink of RecurrentLayerGroup. 
This function will - * collect outputs in each time step of each generated sequence which are - * dispersed in separate Arguments to form a new single Argument as output of - * RecurrentLayerGroup. - */ - void createDataOutlink(); - - /* - * @brief decide how many rows to select from the Matrix that stores the - * forward pass results, starting from a start position. - * - * @param isSeq: a flag indicating whether the layer to be output of the - * RecurrentGradientMachine is a sequence or not - * @param outArgs: all of the returned Arguments of the forward pass - * during the generation process. - * @param copySize: the returned result, number of rows to select from the - * Matrix that stores the forward pass results from a start position. - */ - void createDataOutlinkCopySizeInfo(bool isSeq, - std::vector<Argument>& outArgs, - std::vector<int>& copySize); - - /* - * @brief decide index of the start row for each time step of a generated - * sequence in the Matrix that stores the entire beam search batch's forward - * pass results. - * - * @param isSeq: a flag indicating whether the layer to be output of the - * RecurrentGradientMachine is a sequence or not - * @param outArgs: all of the returned Arguments of the forward pass - * during the generation process. - */ - void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs); - - /* - * @brief used in beam search, connect previous frame to form recurrent link - * @param stepId : iteration number of generation process. - * It equals to the length of longest half-generated sequence. - * @param paths : half-generated paths that are going to be expanded - * in current beam search iteration. - */ - void connectPrevFrame(int stepId, std::vector<Path>& paths); - - /* - * @brief used in beam search, forward current recurrent frame - * @param machineCur : index to access the layer group frame in - * current generation step. - */ - void forwardFrame(int machineCur); - - /* - * @brief reduce all expanded paths to beam size. - * - * @param newPaths : newPaths[totalExpandCount : ] stores all expanded paths - * for the seqId-th sequence - * @param seqId : sequence index in a batch - * @param totalExpandCount : number of already shrunk paths in newPaths - * @return size of retained paths at the end of a beam search iteration - */ - size_t beamShrink(std::vector<Path>& newPaths, - size_t seqId, - size_t totalExpandCount); - - /* - * @brief expand a single path to expandWidth new paths - * with highest probability - * @param curPath : path to be expanded - * @param curPathId : index of curPath in member newPaths - * @param expandWidth : number of paths to be expanded - */ - void singlePathExpand(Path& curPath, - size_t curPathId, - std::vector<Path>& newPaths, - size_t expandWidth); - - /* - * @brief A new beam search iteration. Each half-generated path from the - * previous beam search iteration is further expanded to beam_size new paths - * with the highest probabilities, and then all the expanded paths are again - * reduced to beam_size paths according to their log probabilities. - * @param paths : half-generated paths from the previous iteration. - * @param newPaths : paths expanded and then reduced in the current iteration. - */ - void beamExpand(std::vector<Path>& paths, std::vector<Path>& newPaths); - - /* - * @brief fill sequence start positions and some other information that are - * used by the "text_printer" evaluator. 
- */ - void fillGenOutputs(); - - std::vector<int> machineIds_; - std::vector<int> topIds_; - std::vector<int> seqIds_; - std::vector<int> batchMachineIdVec_; - std::vector<int> batchMachineStartPos_; - std::vector<std::vector<Path>> finalPaths_; - std::vector<real> minFinalPathLogProb_; - BeamSearchControlCallbacks* beamSearchCtrlCallbacks_; - BeamSearchStatisticsCallbacks* beamSearchStatistics_; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/AddtoLayer.cpp b/paddle/gserver/layers/AddtoLayer.cpp deleted file mode index 75e17f52df64253232dc8fc042d0a1a8e7d98e26..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/AddtoLayer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * This layer just simply add all input layers together, then activate - * the sum inputs. Each input of this layer should be the same size, - * which is also the output size of this layer. - * \f[ - * y=f(\sum_{i}x_i + b) - * \f] - * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is - * activation function. - * - * The config file api is addto_layer. - */ -class AddtoLayer : public Layer { - protected: - std::unique_ptr biases_; - - public: - explicit AddtoLayer(const LayerConfig& config) : Layer(config) {} - - ~AddtoLayer() {} - - /** - * Intialization of AddtoLayer. - */ - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - /** - * Forward propagation. - * @note There is no weight matrix for each input, - * because it just a simple add operation. - */ - void forward(PassType passType) override; - - /** - * Backward propagation. - */ - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp deleted file mode 100644 index e2f73f88f59278c6e6e6f0a1fe8457393d53f44a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/AgentLayer.cpp +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "AgentLayer.h" - -#include "paddle/utils/Logging.h" - -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(agent, AgentLayer); - -bool AgentLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - CHECK_EQ(config_.inputs_size(), 0); - if (!Layer::init(layerMap, parameterMap)) { - return false; - } - setNeedGradient(true); - return true; -} - -void AgentLayer::forward(PassType passType) { - Layer::forward(passType); - - Argument& realOutput = realLayer_->getOutput(); - int realNumSequences = realOutput.getNumSequences(); - CHECK_LE(numSamples_, realNumSequences); - - // get Arguments from real layers - if (numSamples_ > 0 && numSamples_ < realNumSequences) { - if (realOutput.hasSeq()) { - int numRows = - realOutput.sequenceStartPositions->getData(false)[numSamples_]; - output_.subArgFrom(realOutput, - /* offset */ 0, - numRows, - getSize(), - useGpu_, - /* trans */ false, - /* seqFlag */ true, - /* seqStart */ 0, - /* seqSize */ numSamples_ + 1); - } else { - output_.subArgFrom( - realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); - } - } else { - output_ = realOutput; - } -} - -bool GatherAgentLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - CHECK_EQ(config_.inputs_size(), 0); - if (!Layer::init(layerMap, parameterMap)) { - return false; - } - setNeedGradient(true); - return true; -} - -void GatherAgentLayer::copyIdAndSequenceInfo( - ICpuGpuVectorPtr sequenceStartPositions, - ICpuGpuVectorPtr subSequenceStartPositions, - const IVectorPtr& ids, - const std::vector& idIndex) { - output_.sequenceStartPositions = sequenceStartPositions; - output_.subSequenceStartPositions = subSequenceStartPositions; - allIds_ = ids; - idIndex_ = idIndex; -} - -void GatherAgentLayer::forward(PassType passType) { - Layer::forward(passType); - forwardIds(passType); - forwardValue(passType); -} - -void GatherAgentLayer::forwardValue(PassType passType) { - MatrixPtr valueReal = realLayers_[0]->getOutputValue(); - if (!valueReal) return; - - int height = allIds_->getSize(); - int width = this->getSize(); - resetOutput(height, width); - idsVec_.resize(idIndex_.size()); - - const MatrixPtr& outV = getOutputValue(); - - for (size_t i = 0; i < realLayers_.size(); ++i) { - const MatrixPtr& realV = realLayers_[i]->getOutputValue(); - idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], - /* size */ realV->getHeight(), - useGpu_); - realV->addToRows(*outV, *idsVec_[i]); - } -} - -namespace { - -// dest[index[i]] <- src[i] for each i -void copyElements(const IVector& srcVec, - const IVector& indexVec, - IVector& destVec) { - const int* src = srcVec.getData(); - const int* index = indexVec.getData(); - int* dest = destVec.getData(); - int len = indexVec.getSize(); - CHECK_EQ(srcVec.getSize(), indexVec.getSize()); - for (int i = 0; i < len; ++i) { - dest[index[i]] = src[i]; - } -} -} // namespace - -void GatherAgentLayer::forwardIds(PassType passType) { - IVectorPtr realId = realLayers_[0]->getOutputLabel(); - if (!realId) return; - - IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_); - IVectorPtr outId = output_.ids; - idsVec_.resize(idIndex_.size()); - - for (size_t i = 0; i < realLayers_.size(); ++i) { - const IVectorPtr& realId = realLayers_[i]->getOutputLabel(); - idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], - /* size */ realId->getSize(), - useGpu_); - execViaCpu(©Elements, *realId, *idsVec_[i], *outId); - } -} - -void GatherAgentLayer::backward(const UpdateCallback& callback) { - 
(void)callback; - const MatrixPtr& outputGrad = getOutputGrad(); - - for (size_t i = 0; i < realLayers_.size(); ++i) { - const MatrixPtr& realG = realLayers_[i]->getOutputGrad(); - if (realG) { - realG->selectRows(*outputGrad, *idsVec_[i]); - } - } -} - -bool ScatterAgentLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - CHECK_EQ(config_.inputs_size(), 0); - if (!Layer::init(layerMap, parameterMap)) { - return false; - } - setNeedGradient(true); - return true; -} - -void ScatterAgentLayer::forward(PassType passType) { - Layer::forward(passType); - CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); - - int width = this->getSize(); - if (selectionMode_) { - forwardWithSelection(passType); - } else { - if (realOutArg_.hasSeq()) { - output_.subArgFrom(realOutArg_, - /* offset */ idIndex_, - idSize_, - width, - useGpu_, - /* trans */ false, - /* seqFlag */ true, - /* seqStart */ seqStartPosIndex_, - /* seqSize */ numSequences_); - } else { - output_.subArgFrom( - realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); - } - } -} - -void ScatterAgentLayer::backward(const UpdateCallback& callback) { - (void)callback; - - CHECK(!selectionMode_); - - const MatrixPtr& outputGrad = realOutArg_.grad; - const MatrixPtr& realGrad = realLayer_->getOutputGrad(); - if (realGrad) { - // for agent in inFrameLines and memoryFrameLines, - // only first scatterAgentLayer should do addToRows in backward - if (handleBackward_) { - outputGrad->addToRows(*realGrad, *ids_); - } - } -} - -REGISTER_LAYER(gather_agent, GatherAgentLayer); -REGISTER_LAYER(scatter_agent, ScatterAgentLayer); - -void ScatterAgentLayer::forwardWithSelection(PassType passType) { - Layer::forward(passType); - CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); - - const Argument& input = realLayer_->getOutput(); - CHECK_EQ(realLayer_->getSize(), this->getSize()); - int width = this->getSize(); - - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str()); - - if (!input.hasSeq()) { - if (realLayer_->getOutput().ids) { - IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); - output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); - } - if (realLayer_->getOutput().value) { - int height = ids_->getSize(); - resetOutput(height, width); - - const MatrixPtr& outV = getOutputValue(); - const MatrixPtr& realV = realLayer_->getOutputValue(); - outV->selectRows(*realV, *ids_); - } - } else { - // Putting the generation logic here is really an ugly hack! - // used in generation - int height = 0; - size_t numSequences = ids_->getSize(); - const int* starts = input.getCpuStartPositions(); - size_t size = input.hasSubseq() ? 
input.getNumSubSequences() - : input.getNumSequences(); - const int* cpuIds = cpuIds_->getData(); - - for (size_t i = 0; i < numSequences; ++i) { - size_t seqId = cpuIds[i]; - CHECK_LT(seqId, size); - height += starts[seqId + 1] - starts[seqId]; - } - reserveOutput(height, width); - - const MatrixPtr& outputValue = getOutputValue(); - - CHECK_NE(input.sequenceStartPositions.get(), - output_.sequenceStartPositions.get()); - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences + 1, false); - int* outStarts = output_.sequenceStartPositions->getMutableData(false); - - ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false); - int* inStarts = inputStartPos_->getMutableData(false); - - size_t offsetOut = 0; - for (size_t i = 0; i < numSequences; ++i) { - outStarts[i] = offsetOut; - size_t seqId = cpuIds[i]; - int size = starts[seqId + 1] - starts[seqId]; - for (int j = 0; j < size; j++) { - inStarts[offsetOut + j] = starts[seqId] + j; - } - offsetOut += size; - } - outStarts[numSequences] = offsetOut; - - outputValue->copyByRowIndex(*input.value, - *inputStartPos_->getVector(useGpu_)); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h deleted file mode 100644 index 51f346d5c9fdf9599cddf4b668c128035fd94187..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/AgentLayer.h +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * AgentLayer use as a virtual input of another layer in config, - * before execute forward/backward, setRealLayer() should be - * called to set one and only one real layer - */ -class AgentLayer : public Layer { - protected: - LayerPtr realLayer_; - int numSamples_; - - public: - explicit AgentLayer(const LayerConfig& config) : Layer(config) {} - - ~AgentLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - // if *numSamples* set, - // real layer output will only use first *numSamples* rows - void setRealLayer(LayerPtr layer, int numSamples = 0) { - realLayer_ = layer; - numSamples_ = numSamples; - } - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override {} -}; - -/** - * Like AgentLayer, but it can gather many real layers. Each real - * layer give a few rows of a sequence, after gather all real layers, - * GatherAgentLayer collect a complete sequence. 
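 *
 * A rough standalone sketch of the gather step (the helper below is
 * hypothetical, not the PaddlePaddle API): each real layer contributes a
 * block of rows, and row i of that block is added into row ids[i] of the
 * gathered output, mirroring addToRows() in the implementation.
 * \code
 * #include <vector>
 * using Row = std::vector<float>;
 *
 * void gatherRows(const std::vector<Row>& src,  // rows from one real layer
 *                 const std::vector<int>& ids,  // target row of each source row
 *                 std::vector<Row>& out) {      // gathered output
 *   for (size_t i = 0; i < src.size(); ++i)
 *     for (size_t j = 0; j < src[i].size(); ++j) out[ids[i]][j] += src[i][j];
 * }
 * \endcode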
- */ -class GatherAgentLayer : public Layer { - protected: - std::vector realLayers_; - std::vector idsVec_; - // we don't clear idsVec_ vector to aviod IVector alloc/free - IVectorPtr allIds_; - std::vector idIndex_; - - public: - explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {} - - virtual ~GatherAgentLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - // call before addRealLayer - void clearRealLayers() { realLayers_.clear(); } - - void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions, - ICpuGpuVectorPtr subSequenceStartPositions, - const IVectorPtr& allIds, - const std::vector& idIndex); - - // add one real layer, can call many times - void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); } - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - void forwardValue(PassType passType); - void forwardIds(PassType passType); -}; - -/** - * Like AgentLayer, but only select a few rows in real layer. - * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput() - * are the selected row ids. It's used to scatter one layer's output - * to many small submodels. ScatterAgentLayer can support ids real layer, - * if it is, the agent will select a few ids in real layer. - */ -class ScatterAgentLayer : public Layer { - protected: - LayerPtr realLayer_; - IVectorPtr ids_; - IVectorPtr cpuIds_; - Argument realOutArg_; - int idIndex_; - int idSize_; - int seqStartPosIndex_; - int numSequences_; // number of sequences in this scatterAgentLayer - bool handleBackward_; - - // use to store expanded cpuStartPositions or subSequenceStartPositions - // of real layer. - ICpuGpuVectorPtr inputStartPos_; - - // true for setRealLayer, false for setRealLayerAndOutput - bool selectionMode_; - - public: - explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {} - - virtual ~ScatterAgentLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - /** - * @brief set real layer in generation - * - * @param layer[input] realLayer - * @param ids[input] row id in real layer - * @param copyId[input] whether to copy a cpu version of ids, - * false(default) in ScatterAgentLayer, and - * true in SequenceScatterAgentLayer. 
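 *
 * A rough sketch of the row selection this sets up (hypothetical helper,
 * not the PaddlePaddle API): the agent's output is simply the rows of the
 * real layer picked out by *ids*, i.e. out[i] = real[ids[i]].
 * \code
 * #include <vector>
 * using Row = std::vector<float>;
 *
 * std::vector<Row> pickRows(const std::vector<Row>& real,
 *                           const std::vector<int>& ids) {
 *   std::vector<Row> out;
 *   out.reserve(ids.size());
 *   for (int id : ids) out.push_back(real[id]);   // copy the selected rows
 *   return out;
 * }
 * \endcode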
- */ - void setRealLayer(LayerPtr layer, const std::vector& ids) { - realLayer_ = layer; - IVector::resizeOrCreate(ids_, ids.size(), useGpu_); - ids_->copyFrom(ids.data(), ids.size()); - if (useGpu_) { - IVector::resizeOrCreate(cpuIds_, ids.size(), false); - cpuIds_->copyFrom(ids.data(), ids.size()); - } else { - cpuIds_ = ids_; - } - selectionMode_ = true; - } - - // set real layer and output, [idIndex, idIndex + idSize) of *ids* - // are selected row for realOutArg in realLayer - void setRealLayerAndOutput(LayerPtr layer, - const Argument& outArg, - const IVectorPtr& ids, - int idIndex, - int idSize, - bool handleBackward) { - realLayer_ = layer; - realOutArg_ = outArg; - ids_ = ids; - idIndex_ = idIndex; - idSize_ = idSize; - handleBackward_ = handleBackward; - selectionMode_ = false; - } - - void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, - int seqStartPosIndex, - int numSequences) { - realOutArg_.sequenceStartPositions = sequenceStartPositions; - seqStartPosIndex_ = seqStartPosIndex; - numSequences_ = numSequences; - } - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - void forwardWithSelection(PassType passType); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp deleted file mode 100644 index b3787b1448a272d2879b372d34406aacc6c0bbfb..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/AverageLayer.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "AverageLayer.h" - -#include "paddle/utils/Logging.h" - -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(average, AverageLayer); - -bool AverageLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - SequencePoolLayer::init(layerMap, parameterMap); - - // average strategy - if (config_.average_strategy() == "average") { - mode_ = kAverage; - } else if (config_.average_strategy() == "sum") { - mode_ = kSum; - } else if (config_.average_strategy() == "squarerootn") { - mode_ = kAverageSquareRootN; - } else { - LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy(); - } - return true; -} - -void AverageLayer::forward(PassType passType) { - SequencePoolLayer::forward(passType); - - MatrixPtr inputValue = getInputValue(0); - getOutputValue()->sequenceAvgForward( - *inputValue, *startPositions_->getVector(useGpu_), mode_); - - /* add the bias-vector AFTER average operation */ - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ { forwardActivation(); } -} - -void AverageLayer::backward(const UpdateCallback& callback) { - SequencePoolLayer::backward(callback); - - if (getInputGrad(0)) { - getInputGrad(0)->sequenceAvgBackward( - *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h deleted file mode 100644 index 03e2673b55ceca7a698f1b858327ad6fad739087..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/AverageLayer.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "SequencePoolLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * A layer for "internal average" for sequence input. - * Input: one or more sequences. Each sequence contains some instances. - * If SequenceLevel = kNonSeq: - * Output: output size is the number of input sequences (NOT input instances) - * output[i] = average_{for each instance in this sequence}{input[i]} - * If stride_ > 0: - * Output: a shorten sequence. Stride is the step size by which we slide a - * window upon the input sequence, and the average pooling - * operation is then applied to each interval independently. - * If SequenceLevel = kSeq: - * Check input sequence must has sub-sequence - * Output: output size is the number of input sub-sequences - * output[i] = average_{for each instance in this sub-sequence}{input[i]} - * - * The config file api is pooling_layer. 
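 *
 * A small standalone sketch of the three pooling strategies this layer
 * supports ("average", "sum", "squarerootn"); the helper is hypothetical,
 * assumes a non-empty sequence, and is not the PaddlePaddle API.
 * \code
 * #include <cmath>
 * #include <string>
 * #include <vector>
 *
 * std::vector<float> poolSequence(const std::vector<std::vector<float>>& seq,
 *                                 const std::string& strategy) {
 *   std::vector<float> out(seq.front().size(), 0.f);
 *   for (const auto& inst : seq)                       // sum all instances
 *     for (size_t j = 0; j < out.size(); ++j) out[j] += inst[j];
 *   const float n = static_cast<float>(seq.size());
 *   const float scale = strategy == "average"     ? 1.f / n
 *                     : strategy == "squarerootn" ? 1.f / std::sqrt(n)
 *                                                 : 1.f;  // "sum"
 *   for (auto& v : out) v *= scale;
 *   return out;
 * }
 * \endcode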
- */ -class AverageLayer : public SequencePoolLayer { - public: - enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 }; - explicit AverageLayer(const LayerConfig& config) - : SequencePoolLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - protected: - int mode_; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp deleted file mode 100644 index a3516f9423e62df0192485c4476357ac51dc27a4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BatchNormBaseLayer.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "BatchNormBaseLayer.h" -#include "BatchNormalizationLayer.h" -#include "Layer.h" -#include "paddle/utils/Stat.h" -#ifdef PADDLE_WITH_CUDA -#include "CudnnBatchNormLayer.h" -#endif - -namespace paddle { - -bool BatchNormBaseLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!Layer::init(layerMap, parameterMap)) return false; - - /* initialize the weightList */ - // first is Input in configure - // other two is created in config_parser.py - CHECK_EQ(inputLayers_.size(), 3U); - CHECK_EQ(inputLayers_.size(), parameters_.size()); - CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size())); - const ImageConfig& conf = config_.inputs(0).image_conf(); - channels_ = conf.channels(); - calFeatureMapSize(); - - if (config_.has_use_global_stats()) { - useGlobalStats_ = config_.use_global_stats(); - } - movingAvgFraction_ = config_.moving_average_fraction(); - epsilon_ = config_.epsilon(); - - weight_.reset(new Weight(1, channels_, parameters_[0])); - movingMean_.reset(new Weight(1, channels_, parameters_[1])); - movingVar_.reset(new Weight(1, channels_, parameters_[2])); - - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, channels_, biasParameter_)); - } - - savedMean_ = Matrix::create(1, channels_, false, useGpu_); - savedInvVar_ = Matrix::create(1, channels_, false, useGpu_); - savedMean_->zeroMem(); - savedInvVar_->zeroMem(); - - return true; -} - -void BatchNormBaseLayer::calFeatureMapSize() { - const ImageConfig& conf = config_.inputs(0).image_conf(); - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - imageD_ = inputLayers_[0]->getOutput().getFrameDepth(); - - if (0 == imageD_) imageD_ = conf.img_size_z(); - if (imageH_ == 0 && imageW_ == 0) { - imageH_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); - imageW_ = conf.img_size(); - } else { - getOutput().setFrameHeight(imageH_); - getOutput().setFrameWidth(imageW_); - getOutput().setFrameDepth(imageD_); - } - imgPixels_ = imageH_ * imageW_ * imageD_; -} - -} // namespace paddle diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h deleted file mode 100644 index 5a446c0843a22adecbaf2ae09fcd526b68865ae2..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BatchNormBaseLayer.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief Batch normalization layer use to normalizes the input to across the - * batch. - * - * By default, calculating global mean and variance statistics via a running - * average in the training peroid. Then the pre-calculated global mean and - * variance are used for testing. - * - * Moving mean and variance are located in Parameter object when constructing - * and the calculation will change them. Now we only save global mean and - * variance of one thread in first node for GPU. - * But the calculation in CPU is different, because parameters are shared by - * multiple threads. Here using ShareCpuMatrix with lock to calculate. We - * still save global mean and variance in first node in CPU when multi machine. - * - * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network - * Training by Reducing Internal Covariate Shift." arXiv preprint - * arXiv:1502.03167 (2015). - */ - -class BatchNormBaseLayer : public Layer { - public: - explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {} - - ~BatchNormBaseLayer() {} - - /** - * @brief Create BatchNorm layer by norm_type, including batch_norm and - * cudnn_batch_norm. If do not set norm_type, it will automatically select - * cudnn_batch_norm for GPU and batch_norm for CPU. - */ - static Layer* create(const LayerConfig& config); - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - /** - * @brief Calculate feature map size. Some input uses frameHeight and - * frameWidth to store feature size - */ - void calFeatureMapSize(); - - protected: - /// Batch normalization scale parameter, which is referred to as gamma in - /// in original paper. - std::unique_ptr weight_; - /// Moving average of mean. - std::unique_ptr movingMean_; - /// Moving average of variance. - std::unique_ptr movingVar_; - /// Batch normalization bias parameter, which is referred to as beta in - /// in original paper. - std::unique_ptr biases_; - - /// Save intermediate results computed during the forward pass, - /// these can then be reused to speed up the backward pass. - MatrixPtr savedMean_; - MatrixPtr savedInvVar_; - - /// Height or width of input image feature. - /// Both of them are 1 if the input is fully-connected layer. 
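  /// Schematic of the transform the subclasses compute (illustration only):
  ///   y = gamma * (x - mean) / sqrt(var + epsilon) + beta
  /// where gamma/beta are weight_/biases_ above, and (mean, var) are the
  /// mini-batch statistics during training or movingMean_/movingVar_ when
  /// useGlobalStats_ is in effect.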
- int imageD_; - int imageH_; - int imageW_; - /// Height * Width. - int imgPixels_; - /// Feature dimension. If the input layer is conv layer, it is the channels - /// of feature map of the conv layer. If the input layer is fully-connected - /// layer, it is the dimension of fc layer. - int channels_; - // if useGlobalStats_ is true, will use the loaded mean and variance. - // otherwise, calculate mean and variance in this mini-batch. - bool useGlobalStats_; - // use to compute moving mean and variance. - real movingAvgFraction_; - // Epsilon is a small random noise used in batch normalization for stability. - real epsilon_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp deleted file mode 100644 index 59831dd9049d70198721989b4a515df39e015968..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BatchNormalizationLayer.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Stat.h" -#ifdef PADDLE_WITH_CUDA -#include "hl_batch_transpose.h" -#endif -#include "BatchNormalizationLayer.h" - -namespace paddle { - -REGISTER_LAYER(batch_norm, BatchNormalizationLayer); - -bool BatchNormalizationLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false; - - return true; -} - -void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { - int numSamples = mat->getHeight(); - Matrix::resizeOrCreate(tmpMat_, numSamples, channels_, false, useGpu_); - savedMean_->zeroMem(); - savedMean_->accumulateColSum(*mat); - savedMean_->mulScalar(1.0 / numSamples); // E[x] - - tmpMat_->assign(*mat); - tmpMat_->square2(); - savedInvVar_->zeroMem(); - savedInvVar_->accumulateColSum(*tmpMat_); - savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] - savedInvVar_->addSquare(*savedMean_, -1.0); // E[x^2] - E^2[x] - - // Variance may be small negative value - // because of the subtraction operation. - // Here using clipping. 
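  // Worked example of the identity used above: for channel samples {1, 3},
  // E[x] = 2 and E[x^2] = 5, so var = 5 - 2 * 2 = 1. For a near-constant
  // channel the same subtraction can round to a tiny negative value, which
  // is why it is clipped to zero below.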
- savedInvVar_->downClip(real(0.0)); - - calMovingMeanAndVar(); - - savedInvVar_->subScalar(-epsilon_); - savedInvVar_->sqrt2(*savedInvVar_); -} - -void BatchNormalizationLayer::calMovingMeanAndVar() { - // calculating and saving moving mean and variance - auto& movingMean = movingMean_->getW(); - auto& movingVar = movingVar_->getW(); - // movingMean = movingMean * movingAvgFraction_ - // + savedMean_ * (1 - movingAvgFraction_) - movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_); - // movingVar = movingVar * movingAvgFraction_ - // + savedInvVar_ * (1 - movingAvgFraction_) - movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_); -} - -void BatchNormalizationLayer::setMeanAndStd() { - savedMean_->copyFrom(*(movingMean_->getW())); - savedInvVar_->copyFrom(*(movingVar_->getW())); - savedInvVar_->downClip(real(0.0)); - - savedInvVar_->subScalar(-epsilon_); - savedInvVar_->sqrt2(*savedInvVar_); -} - -void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { - CHECK_EQ(in->getWidth(), static_cast(channels_ * imgPixels_)); - CHECK_EQ(out->getWidth(), static_cast(channels_)); - CHECK(!in->isTransposed()); - CHECK(!out->isTransposed()); - if (imgPixels_ == 1) { - out->assign(*in); - return; - } - size_t batchSize = in->getHeight(); - CHECK_EQ(out->getHeight(), batchSize * imgPixels_); - if (useGpu_) { -#ifndef PADDLE_WITH_CUDA - LOG(FATAL) << "paddle is compiled only for cpu"; -#else - batchTranspose( - in->getData(), out->getData(), imgPixels_, channels_, batchSize); -#endif - } else { - for (size_t i = 0; i < batchSize; i++) { - const MatrixPtr inTmp = - Matrix::create(in->getData() + i * imgPixels_ * channels_, - channels_, - imgPixels_, - false, - useGpu_); - MatrixPtr outTmp = - Matrix::create(out->getData() + i * imgPixels_ * channels_, - imgPixels_, - channels_, - false, - useGpu_); - inTmp->transpose(outTmp, false); - } - } -} - -void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) { - CHECK_EQ(in->getWidth(), static_cast(channels_)); - CHECK_EQ(out->getWidth(), static_cast(channels_ * imgPixels_)); - size_t batchSize = out->getHeight(); - CHECK(!in->isTransposed()); - CHECK(!out->isTransposed()); - if (imgPixels_ == 1) { - out->assign(*in); - return; - } - CHECK_EQ(in->getHeight(), static_cast(batchSize * imgPixels_)); - if (useGpu_) { -#ifndef PADDLE_WITH_CUDA - LOG(FATAL) << "paddle is compiled only for cpu"; -#else - batchTranspose( - in->getData(), out->getData(), channels_, imgPixels_, batchSize); -#endif - } else { - for (size_t i = 0; i < batchSize; i++) { - const MatrixPtr inTmp = - Matrix::create(in->getData() + i * channels_ * imgPixels_, - imgPixels_, - channels_, - false, - useGpu_); - MatrixPtr outTmp = - Matrix::create(out->getData() + i * imgPixels_ * channels_, - channels_, - imgPixels_, - useGpu_); - inTmp->transpose(outTmp, false); - } - } -} - -void BatchNormalizationLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInputValue(0)->getHeight(); - calFeatureMapSize(); - resetOutput(batchSize, getInputValue(0)->getWidth()); - - // for testing in training peroid. 
- useGlobalStats_ = (passType == PASS_TEST); - if (passType == PASS_TEST && config_.has_use_global_stats()) { - useGlobalStats_ = config_.use_global_stats(); - } - - Matrix::resizeOrCreate( - expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_); - Matrix::resizeOrCreate( - normIn_, batchSize * imgPixels_, channels_, false, useGpu_); - Matrix::resizeOrCreate( - expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_); - expandMat(getInputValue(0), expandedIn_); - - if (useGlobalStats_) { - if (firstTest_) { - setMeanAndStd(); - firstTest_ = false; - } - } else { - calMeanAndStd(expandedIn_); - firstTest_ = true; - } - - normIn_->assign(*expandedIn_); - normIn_->addBias(*savedMean_, -1); // subtract mean. - normIn_->divRowVector(*savedInvVar_); // divide std. - - expandedOut_->assign(*normIn_); - expandedOut_->mulRowVector(*weight_->getW()); // multiple gamma. - if (biases_) { - expandedOut_->addBias(*(biases_->getW()), 1); // add beta. - } - MatrixPtr out = getOutputValue(); - shrinkMat(expandedOut_, out); - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void BatchNormalizationLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - int batchSize = getInputValue(0)->getHeight(); - - Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_); - Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_); - - Matrix::resizeOrCreate( - expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); - Matrix::resizeOrCreate( - inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_); - Matrix::resizeOrCreate( - normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); - Matrix::resizeOrCreate( - expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_); - Matrix::resizeOrCreate( - tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_); - Matrix::resizeOrCreate( - tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_); - - expandMat(getOutputGrad(), expandedOutGrad_); - - // compute derivatives. - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*expandedOutGrad_, 1); - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - if (weight_->getWGrad()) { - tmpMat_->dotMul(*expandedOutGrad_, *normIn_); - weight_->getWGrad()->collectBias(*tmpMat_, 1); - } - - // compute input gradients. - normInGrad_->assign(*expandedOutGrad_); - normInGrad_->mulRowVector(*(weight_->getW())); // multiple gamma. 
- // normInGrad * (x - \mu)/ \sqrt(\delta^2) - tmpMat_->dotMul(*normInGrad_, *normIn_); - stdGrad_->zeroMem(); - stdGrad_->collectBias(*tmpMat_, -1.0 / (batchSize * imgPixels_)); - tmpGrad_->assign(*normIn_); - tmpGrad_->mulRowVector(*stdGrad_); - - meanGrad_->zeroMem(); - meanGrad_->collectBias(*normInGrad_, -1.0 / (batchSize * imgPixels_)); - - expandedInGrad_->zeroMem(); - expandedInGrad_->add(*normInGrad_, *tmpGrad_); - expandedInGrad_->addRowVector(*meanGrad_); - expandedInGrad_->divRowVector(*savedInvVar_); - - shrinkMat(expandedInGrad_, inGrad_); - if (getInputGrad(0)) { - getInputGrad(0)->add(*getInputGrad(0), *inGrad_); - } - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - weight_->getParameterPtr()->incUpdate(callback); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp deleted file mode 100644 index 9775914596ce3253aada71fbe7197410414fede5..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BilinearInterpLayer.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "BilinearInterpLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(bilinear_interp, BilinearInterpLayer); - -size_t BilinearInterpLayer::getSize() { - inImgH_ = inputLayers_[0]->getOutput().getFrameHeight(); - inImgW_ = inputLayers_[0]->getOutput().getFrameWidth(); - - const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf(); - if (inImgH_ == 0) { - inImgH_ = conf.image_conf().img_size_y(); - } - if (inImgW_ == 0) { - inImgW_ = conf.image_conf().img_size(); - } - - outImgH_ = conf.out_size_y(); - outImgW_ = conf.out_size_x(); - numChannels_ = conf.image_conf().channels(); - - CHECK(outImgH_ > 0 && outImgW_ > 0); - CHECK(inImgH_ > 0 && inImgW_ > 0); - CHECK(numChannels_); - - ratioH_ = - (outImgH_ > 1) ? static_cast(inImgH_ - 1) / (outImgH_ - 1) : 0.f; - ratioW_ = - (outImgW_ > 1) ? 
static_cast(inImgW_ - 1) / (outImgW_ - 1) : 0.f; - - getOutput().setFrameHeight(outImgH_); - getOutput().setFrameWidth(outImgW_); - return outImgH_ * outImgW_ * numChannels_; -} - -bool BilinearInterpLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(1, config_.inputs_size()); - - return true; -} - -void BilinearInterpLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t batchSize = getInput(0).getBatchSize(); - size_t size = getSize(); - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, size); - } - - MatrixPtr inV = getInputValue(0); - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str()); - outV->bilinearForward(*inV, - inImgH_, - inImgW_, - outImgH_, - outImgW_, - numChannels_, - ratioH_, - ratioW_); - } -} - -void BilinearInterpLayer::backward(const UpdateCallback& callback) { - (void)callback; - - MatrixPtr inputG = getInputGrad(0); - MatrixPtr outG = getOutputGrad(); - { - REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str()); - if (inputG) { - inputG->bilinearBackward(*outG, - outImgH_, - outImgW_, - inImgH_, - inImgW_, - numChannels_, - ratioH_, - ratioW_); - } - } -} -} // namespace paddle diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h deleted file mode 100644 index 8e08c2e1ce80172f55c93d8242821f683fa1a731..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BilinearInterpLayer.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A layer for bilinear interpolation which is - * used on conv layer output. - * - * @note The config file api is bilinear_interp_layer. - */ -class BilinearInterpLayer : public Layer { - protected: - size_t outImgH_, outImgW_; - size_t inImgH_, inImgW_; - real ratioH_, ratioW_; - size_t numChannels_; - - public: - explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {} - - virtual ~BilinearInterpLayer() {} - - size_t getSize(); - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp deleted file mode 100644 index 793d24e884a6f76c2aa897b3d03f3adc3e201265..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "BlockExpandLayer.h" - -#include "paddle/utils/Logging.h" - -namespace paddle { - -REGISTER_LAYER(blockexpand, BlockExpandLayer); - -bool BlockExpandLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(config_.inputs_size(), 1); - const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf(); - blockH_ = blockConf.block_y(); - blockW_ = blockConf.block_x(); - strideH_ = blockConf.stride_y(); - strideW_ = blockConf.stride_x(); - paddingH_ = blockConf.padding_y(); - paddingW_ = blockConf.padding_x(); - channels_ = blockConf.channels(); - imgSizeH_ = blockConf.img_size_y(); - imgSizeW_ = blockConf.img_size_x(); - - std::vector strides = {(size_t)strideH_, (size_t)strideW_}; - std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; - std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; - createFunction(forward_, - "BlockExpand", - FuncConfig() - .set("strides", strides) - .set("paddings", paddings) - .set("blocks", blocks)); - createFunction(backward_, - "BlockExpandGrad", - FuncConfig() - .set("strides", strides) - .set("paddings", paddings) - .set("blocks", blocks)); - - return true; -} - -size_t BlockExpandLayer::getBlockNum() { - CHECK_EQ(inputLayers_.size(), 1UL); - const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf(); - imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imgSizeH_ == 0) { - imgSizeH_ = blockConf.img_size_y(); - } - if (imgSizeW_ == 0) { - imgSizeW_ = blockConf.img_size_x(); - } - size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_; - outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_; - size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_; - outputW_ = (int)tmpW < 0 ? 
1 : 1 + (tmpW + strideW_ - 1) / strideW_; - - return outputH_ * outputW_; -} - -void BlockExpandLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - size_t blockNum = getBlockNum(); - size_t blockSize = blockH_ * blockW_ * channels_; - resetOutput(blockNum * batchSize, blockSize); - - // calculate output_.value - inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); - outputShape_ = TensorShape({batchSize, blockNum, blockSize}); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), inputShape_); - outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); - - // calculate output_.sequenceStartPositions and output_.cpuSequenceDims - Argument& out = getOutput(); - ICpuGpuVector::resizeOrCreate( - out.sequenceStartPositions, batchSize + 1, false); - IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); - int* start = out.sequenceStartPositions->getMutableData(false); - int* dims = out.cpuSequenceDims->getData(); - for (size_t i = 0; i < batchSize; i++) { - start[i] = i * blockNum; - dims[2 * i] = outputH_; - dims[2 * i + 1] = outputW_; - } - start[batchSize] = batchSize * blockNum; -} - -void BlockExpandLayer::backward(const UpdateCallback& callback) { - /* Calculate the input layers error */ - if (getInputGrad(0)) { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outputShape_); - outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); - backward_[0]->calc(inputs, outputs); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h deleted file mode 100644 index 9d76584f3a4eda19a9e8f806256a7b8da617cc37..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief Expand feature map to minibatch matrix. - * - matrix width is: blockH_ * blockW_ * channels_ - * - matirx height is: outputH_ * outputW_ - * - * \f[ - * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) / - * strideH\_ \\ - * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) / - * strideW\_ - * \f] - * - * The expand method is the same with ExpandConvLayer, but saved the transposed - * value. After expanding, output_.sequenceStartPositions will store timeline. - * The number of time steps are outputH_ * outputW_ and the dimension of each - * time step is blockH_ * blockW_ * channels_. This layer can be used after - * convolution neural network, and before recurrent neural network. - * - * The config file api is block_expand_layer. 
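 *
 * A small sketch of the output-size arithmetic above (hypothetical helper,
 * not the PaddlePaddle API); e.g. imgSize = 10, block = 3, stride = 2,
 * padding = 0 gives 1 + (7 + 1) / 2 = 5 steps along that axis.
 * \code
 * #include <cstddef>
 *
 * size_t expandedSteps(size_t imgSize, size_t block,
 *                      size_t stride, size_t padding) {
 *   const long span =
 *       static_cast<long>(2 * padding + imgSize) - static_cast<long>(block);
 *   if (span < 0) return 1;          // block larger than the padded image
 *   return 1 + (static_cast<size_t>(span) + stride - 1) / stride;
 * }
 * \endcode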
- */ -class BlockExpandLayer : public Layer { - protected: - /** - * @brief Calculate outputH_ and outputW_ and return block number which - * actually is time steps. - * @return time steps, outoutH_ * outputW_. - */ - size_t getBlockNum(); - size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; - size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; - - TensorShape inputShape_; - TensorShape outputShape_; - - public: - explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {} - - ~BlockExpandLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp deleted file mode 100644 index e6de329ff3f9ccfdd1cbe697c1de1a9cd8c7926a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConcatenateLayer.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "Projection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A concatenate layer has multiple input layers. It concatenates rows of - * each input as one row for the output of this layer and apply activation. 
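 *
 * A minimal sketch of the row-wise concatenation (hypothetical helper, not
 * the PaddlePaddle API): one output row is the input rows laid end to end,
 * e.g. {1, 2} and {3, 4, 5} become {1, 2, 3, 4, 5}.
 * \code
 * #include <vector>
 *
 * std::vector<float> concatRow(const std::vector<std::vector<float>>& ins) {
 *   std::vector<float> out;
 *   for (const auto& in : ins) out.insert(out.end(), in.begin(), in.end());
 *   return out;
 * }
 * \endcode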
- */ -class ConcatenateLayer : public Layer { - public: - explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {} - - ~ConcatenateLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(concat, ConcatenateLayer); - -bool ConcatenateLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!Layer::init(layerMap, parameterMap)) return false; - - CHECK(!biasParameter_); - - return true; -} - -void ConcatenateLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - reserveOutput(batchSize, size); - - const MatrixPtr& out = getOutputValue(); - int offset = 0; - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const MatrixPtr& in = getInputValue(i); - size_t inSize = in->getWidth(); - out->assignAtOffset(*in, offset); - offset += inSize; - } - CHECK_EQ(size, offset); - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void ConcatenateLayer::backward(const UpdateCallback& callback) { - (void)callback; - - /* Do activation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - const MatrixPtr& out = getOutputGrad(); - int offset = 0; - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const MatrixPtr& in = getInputGrad(i); - size_t inSize = getInputValue(i)->getWidth(); - if (in) { - in->addAtOffset(*out, offset); - } - offset += inSize; - } -} - -/** - * concat2 layer is like concat layer, but each input layer was - * processed by a Projection. 
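 *
 * A sketch of the column bookkeeping used below (hypothetical helper, not
 * the PaddlePaddle API): each projection owns the slice [startCol, endCol)
 * of the output row, and the last endCol must equal the layer size.
 * \code
 * #include <cstddef>
 * #include <utility>
 * #include <vector>
 *
 * std::vector<std::pair<size_t, size_t>> sliceColumns(
 *     const std::vector<size_t>& projSizes) {
 *   std::vector<std::pair<size_t, size_t>> cols;
 *   size_t start = 0;
 *   for (size_t sz : projSizes) {
 *     cols.emplace_back(start, start + sz);   // [startCol, endCol)
 *     start += sz;
 *   }
 *   return cols;
 * }
 * \endcode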
- */ -class ConcatenateLayer2 : public Layer { - public: - explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {} - - ~ConcatenateLayer2() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - protected: - std::vector> projections_; - std::vector projOutput_; - std::vector> projCol_; - bool sharedBias_; - std::unique_ptr biases_; -}; - -REGISTER_LAYER(concat2, ConcatenateLayer2); - -bool ConcatenateLayer2::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!Layer::init(layerMap, parameterMap)) return false; - - CHECK_EQ(inputLayers_.size(), parameters_.size()); - projections_.reserve(inputLayers_.size()); - projCol_.reserve(inputLayers_.size()); - projOutput_.resize(inputLayers_.size()); - - size_t startCol = 0; - size_t endCol = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - projections_.emplace_back(Projection::create( - config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); - - endCol += projections_[i]->getOutputSize(); - projCol_.push_back(std::make_pair(startCol, endCol)); - startCol = endCol; - } - CHECK_EQ(getSize(), endCol); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - sharedBias_ = config_.shared_biases(); - size_t psize = config_.bias_size(); - biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); - } - - return true; -} - -void ConcatenateLayer2::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - resetOutput(batchSize, size); - - for (size_t i = 0; i < projections_.size(); i++) { - size_t startCol = projCol_[i].first; - size_t endCol = projCol_[i].second; - projOutput_[i].value = output_.value->subColMatrix(startCol, endCol); - if (output_.grad) { - projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); - } - } - - { - AsyncGpuBlock block; - for (size_t i = 0; i != inputLayers_.size(); ++i) { - projections_[i]->forward(&getInput(i), &projOutput_[i], passType); - } - } - - /* add the bias-vector */ - if (biases_) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - output_.value->addBias(*(biases_->getW()), 1, sharedBias_); - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void ConcatenateLayer2::backward(const UpdateCallback& callback) { - /* Do activation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - AsyncGpuBlock block; - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); - biases_->getParameterPtr()->incUpdate(callback); - } - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (projections_[i]) { - projections_[i]->backward(callback); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp deleted file mode 100644 index 10c3cef0da61af76a6b0a207e4b914276a2fa39b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ContextProjection.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ContextProjection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_PROJECTION(context, ContextProjection); - -ContextProjection::ContextProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : Projection(config, parameter, useGpu) { - CHECK(config.has_context_start()); - CHECK(config.has_context_length()); - if (config.context_start() == 0 && config.context_length() == 1) { - config_.set_trainable_padding(false); - } - if (config_.trainable_padding()) { - CHECK(parameter); - beginPad_ = std::max(0, -config.context_start()); - endPad_ = std::max(0, config.context_start() + config.context_length() - 1); - size_t totalPad = beginPad_ + endPad_; - size_t inputDim = parameter->getSize() / totalPad; - CHECK_EQ(config.input_size(), inputDim); - CHECK_EQ(inputDim * totalPad, parameter->getSize()); - weight_.reset(new Weight(totalPad, inputDim, parameter)); - } - // init forward_ and backward_ functions - init(); -} - -bool ContextProjection::init() { - size_t context_length = config_.context_length(); - int context_start = config_.context_start(); - bool is_padding = config_.trainable_padding(); - size_t total_pad = is_padding ? beginPad_ + endPad_ : 0; - - createFunction(forward_, - "ContextProjectionForward", - FuncConfig() - .set("context_length", context_length) - .set("context_start", context_start) - .set("begin_pad", beginPad_)); - createFunction(backward_, - "ContextProjectionBackward", - FuncConfig() - .set("context_length", context_length) - .set("context_start", context_start) - .set("begin_pad", beginPad_) - .set("is_padding", is_padding) - .set("total_pad", total_pad)); - - return true; -} - -void ContextProjection::resetState() { - CHECK_LE(config_.context_start() + config_.context_length(), 1) - << "state is not allowed for future context"; - if (config_.context_start() >= 0) return; - Matrix::resizeOrCreate(state_, - -config_.context_start(), - config_.input_size(), - false, // trans - useGpu_); - Matrix::resizeOrCreate(state2_, - -config_.context_start(), - config_.input_size(), - false, // trans - useGpu_); - if (config_.trainable_padding()) { - state_->assign(*weight_->getW()->subMatrix(0, -config_.context_start())); - } else { - state_->zeroMem(); - } -} - -void ContextProjection::setState(LayerStatePtr state) { - CHECK(state->value.size() == 1) - << "one matrix is expected for ContextProjection state"; - state_->copyFrom(*(state->value[0])); -} - -LayerStatePtr ContextProjection::getState() { - if (state_ == nullptr) { - return nullptr; - } - LayerStatePtr res = std::make_shared(); - res->value.push_back(state_->clone(0, 0, false)); - res->value[0]->copyFrom(*state_); - return res; -} - -void ContextProjection::forward() { - CHECK(in_->value && out_->value); - CHECK(in_->sequenceStartPositions); - - size_t input_dim = in_->value->getWidth(); - size_t dim = out_->value->getWidth(); - CHECK_EQ(dim, input_dim * config_.context_length()); - // size_t batch_size = in_->value->getHeight(); - CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here"; - - REGISTER_TIMER_INFO("ContextProjectionForward", 
getName().c_str()); - bool is_padding = config_.trainable_padding(); - /// first use state_, otherwise use weight_(padding false === w nullptr) - auto w_ptr = - state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr; - const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*in_->value, *start_pos); - if (w_ptr) { - inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim), - *start_pos); - } - outputs.addArg(*out_->value, *start_pos, ADD_TO); - forward_[0]->calc(inputs, outputs); - - if (state_ && config_.context_start() < 0) { - CHECK_EQ(1, in_->getNumSequences()); - const int* starts = in_->sequenceStartPositions->getData(false); - int length = starts[1] - starts[0]; - if (-config_.context_start() <= length) { - MatrixPtr sub = in_->value->subMatrix(starts[1] + config_.context_start(), - -config_.context_start()); - state_->copyFrom(*sub); - } else { - int prevLength = -config_.context_start() - length; - state2_->subMatrix(0, prevLength) - ->copyFrom(*state_->subMatrix(length, prevLength)); - state2_->subMatrix(prevLength, length) - ->copyFrom(*in_->value->subMatrix(starts[0], length)); - std::swap(state_, state2_); - } - } -} - -void ContextProjection::backward(const UpdateCallback& callback) { - CHECK(in_->value && out_->value && out_->grad); - size_t input_dim = in_->value->getWidth(); - size_t dim = out_->value->getWidth(); - CHECK_EQ(dim, input_dim * config_.context_length()); - size_t batch_size = in_->value->getHeight(); - CHECK_EQ(batch_size, out_->value->getHeight()); - CHECK_EQ(static_cast(backward_.size()), 1) - << "Only one backward function here"; - - REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str()); - bool is_padding = config_.trainable_padding(); - auto start_pos = in_->sequenceStartPositions; - auto w_ptr = is_padding ? weight_->getWGrad() : nullptr; - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_)); - outputs.addArg( - CpuMatrix( - in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim), - *in_->sequenceStartPositions->getVector(useGpu_), - ADD_TO); - outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, - w_ptr ? w_ptr->getHeight() : 0, - input_dim), - ADD_TO); - backward_[0]->calc(inputs, outputs); - - if (config_.trainable_padding()) { - weight_->getParameterPtr()->incUpdate(callback); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp deleted file mode 100644 index b38de86b1591f987a63478d019019f87c88cee20..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Conv3DLayer.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Conv3DLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(conv3d, Conv3DLayer); - -bool Conv3DLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; - int index = 0; - for (auto &inputConfig : config_.inputs()) { - const ConvConfig &conf = inputConfig.conv_conf(); - M_.push_back(numFilters_ / conf.groups()); - K_.push_back(filterPixels_[index] * filterChannels_[index]); - - // create a new weight - size_t height, width; - width = filterPixels_[index] * filterChannels_[index]; - height = numFilters_; - CHECK_EQ(parameters_[index]->getSize(), width * height); - Weight *w = new Weight(height, width, parameters_[index]); - weights_.emplace_back(w); - ++index; - } - if (biasParameter_.get()) { - if (sharedBiases_) { - CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); - biases_ = - std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); - } else { - biases_ = - std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); - } - } - return true; -} - -size_t Conv3DLayer::getSize() { - CHECK_NE(inputLayers_.size(), 0UL); - outputH_.clear(); - outputW_.clear(); - outputD_.clear(); - N_.clear(); - size_t layerSize = 0; - for (size_t i = 0; i < inputLayers_.size(); ++i) { - outputW_.push_back(outputSize( - imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true)); - outputH_.push_back(outputSize( - imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true)); - outputD_.push_back(outputSize( - imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true)); - - N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]); - CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize); - layerSize += N_[i] * numFilters_; - } - getOutput().setFrameHeight(outputH_[0]); - getOutput().setFrameWidth(outputW_[0]); - getOutput().setFrameDepth(outputD_[0]); - return layerSize; -} - -void Conv3DLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - int outWidth = getSize(); - resetOutput(batchSize, outWidth); - - REGISTER_TIMER_INFO("FwdConv3D", getName().c_str()); - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const MatrixPtr &inMat = getInputValue(i); - const MatrixPtr &outMat = getOutputValue(); - int M = M_[i]; - int N = N_[i]; - int K = K_[i]; - Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); - MatrixPtr wMat = weights_[i]->getW(); - for (int n = 0; n < batchSize; ++n) { - colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(), - channels_[i], - imgSizeD_[i], - imgSizeH_[i], - imgSizeW_[i], - filterSizeZ_[i], - filterSizeY_[i], - filterSize_[i], - strideZ_[i], - strideY_[i], - stride_[i], - paddingZ_[i], - paddingY_[i], - padding_[i]); - - real *outData = outMat->getData() + n * outMat->getStride(); - MatrixPtr outMatSub = - Matrix::create(outData, groups_[i] * M, N, false, useGpu_); - for (int g = 0; g < groups_[i]; g++) { - MatrixPtr wMatSub = wMat->subMatrix(g * M, M); - MatrixPtr in = colBuf_->subMatrix(g * K, K); - MatrixPtr out = outMatSub->subMatrix(g * M, M); - out->mul(*wMatSub, *in, 1.0, 1.0); - } - } - } - if (nullptr != this->biasParameter_) { - this->addBias(); - } - forwardActivation(); -} - -void Conv3DLayer::backward(const UpdateCallback &callback) { - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - bpropBiases(); - biases_->getParameterPtr()->incUpdate(callback); - } - - 
REGISTER_TIMER_INFO("BwdConv3D", getName().c_str()); - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (weights_[i]->getWGrad()) { - bpropWeights(i); - } - if (getInputGrad(i)) { - bpropData(i); - } - weights_[i]->getParameterPtr()->incUpdate(callback); - } -} - -void Conv3DLayer::bpropWeights(int i) { - int M = M_[i]; - int N = N_[i]; - int K = K_[i]; - const MatrixPtr &inMat = getInputValue(i); - Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); - MatrixPtr wGradMat = weights_[i]->getWGrad(); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - for (int n = 0; n < batchSize; ++n) { - colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(), - channels_[i], - imgSizeD_[i], - imgSizeH_[i], - imgSizeW_[i], - filterSizeZ_[i], - filterSizeY_[i], - filterSize_[i], - strideZ_[i], - strideY_[i], - stride_[i], - paddingZ_[i], - paddingY_[i], - padding_[i]); - - real *outGradData = - getOutputGrad()->getData() + n * getOutputGrad()->getStride(); - MatrixPtr outGradSub = - Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_); - for (int g = 0; g < groups_[i]; ++g) { - MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K); - MatrixPtr outG = outGradSub->subMatrix(g * M, M); - MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M); - wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0); - } - } -} - -void Conv3DLayer::bpropData(int i) { - int M = M_[i]; - int N = N_[i]; - int K = K_[i]; - Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); - MatrixPtr wMat = weights_[i]->getW(); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - for (int n = 0; n < batchSize; ++n) { - real *outGradData = - getOutputGrad()->getData() + n * getOutputGrad()->getStride(); - real *preGradData = - getInputGrad(i)->getData() + n * getInputGrad(i)->getStride(); - MatrixPtr outGradSub = - Matrix::create(outGradData, M * groups_[i], N, false, useGpu_); - for (int g = 0; g < groups_[i]; ++g) { - MatrixPtr wMatSub = wMat->subMatrix(g * M, M); - MatrixPtr outG = outGradSub->subMatrix(g * M, M); - MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K); - inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0); - } - colBuf_->col2Vol(preGradData, - channels_[i], - imgSizeD_[i], - imgSizeH_[i], - imgSizeW_[i], - filterSizeZ_[i], - filterSizeY_[i], - filterSize_[i], - strideZ_[i], - strideY_[i], - stride_[i], - paddingZ_[i], - paddingY_[i], - padding_[i], - 1.0, - 1.0); - } -} - -void Conv3DLayer::bpropBiases() { - MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), - 1, - biases_->getWGrad()->getElementCnt(), - false, - useGpu_); - MatrixPtr outGradMat = getOutputGrad(); - - if (this->sharedBiases_) { - biases->collectSharedBias(*outGradMat, 1.0f); - } else { - biases->collectBias(*outGradMat, 1.0f); - } -} - -void Conv3DLayer::addBias() { - MatrixPtr outMat = getOutputValue(); - MatrixPtr bias = Matrix::create(biases_->getW()->getData(), - 1, - biases_->getW()->getElementCnt(), - false, - useGpu_); - if (this->sharedBiases_) { - outMat->addSharedBias(*(bias), 1.0f); - } else { - outMat->addBias(*(bias), 1.0f); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h deleted file mode 100644 index 07b804bad02beb6ec9c3e9fd43c3cd3aa6d50b22..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Conv3DLayer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "ConvBaseLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of convolution layer. - * This layer expands input and use matrix multiplication to - * calculate convolution operation. - */ -class Conv3DLayer : public ConvBaseLayer { - public: - explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {} - ~Conv3DLayer() {} - - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - void forward(PassType passType); - void addBias(); - void backward(const UpdateCallback& callback); - void bpropBiases(); - void bpropData(int i); - void bpropWeights(int i); - size_t getSize(); - - protected: - // Figure out the dimensions for individual gemms. - IntV M_; /// numFilters_ / filter_group_; - IntV N_; /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_ - IntV K_; /// outputD_ * outputH_ * outputW_ - MatrixPtr colBuf_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp deleted file mode 100644 index 56bf4f9fcb187f73409076b826b738f62d19516a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvBaseLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Logging.h" -namespace paddle { - -bool ConvBaseLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") - ? false - : true; - - /* Initialize the convolutional layer parameter */ - numFilters_ = config_.num_filters(); - sharedBiases_ = config_.shared_biases(); - for (auto& inputConfig : config_.inputs()) { - const ConvConfig& conf = inputConfig.conv_conf(); - padding_.push_back(conf.padding()); - stride_.push_back(conf.stride()); - dilation_.push_back(conf.dilation()); - filterSize_.push_back(conf.filter_size()); - paddingY_.push_back(conf.padding_y()); - strideY_.push_back(conf.stride_y()); - dilationY_.push_back(conf.dilation_y()); - filterSizeY_.push_back(conf.filter_size_y()); - channels_.push_back(conf.channels()); - imgSizeH_.push_back(conf.has_img_size_y() ? 
conf.img_size_y() - : conf.img_size()); - imgSizeW_.push_back(conf.img_size()); - groups_.push_back(conf.groups()); - filterChannels_.push_back(conf.filter_channels()); - outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x()); - outputW_.push_back(conf.output_x()); - - paddingZ_.push_back(conf.padding_z()); - strideZ_.push_back(conf.stride_z()); - filterSizeZ_.push_back(conf.filter_size_z()); - imgSizeD_.push_back(conf.img_size_z()); - outputD_.push_back(conf.output_z()); - filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() * - filterSizeZ_.back()); - } - - CHECK(inputLayers_.size() == parameters_.size()); - - // create new weights_ in derived class - // create new biases_ in derived class - - // default caffe model - caffeMode_ = true; - - return true; -} - -size_t ConvBaseLayer::calOutputSize() { - auto clearAndReserve = [this](IntV* vec) { - vec->clear(); - vec->reserve(this->inputLayers_.size()); - }; - clearAndReserve(&imgSizeH_); - clearAndReserve(&imgSizeW_); - clearAndReserve(&outputH_); - clearAndReserve(&outputW_); - size_t layerSize = 0; - - auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) { - size_t filterSizeY; - size_t filterSize; - for (size_t i = 0; i < inputLayers_.size(); i++) { - filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1; - filterSize = (filterSize_[i] - 1) * dilation_[i] + 1; - inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); - inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); - const ConvConfig& conf = config_.inputs(i).conv_conf(); - if (isDeconv_) { - if (inH[i] == 0) - inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x(); - if (inW[i] == 0) inW[i] = conf.output_x(); - outH.push_back(imageSize( - inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_)); - outW.push_back( - imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_)); - } else { - if (inH[i] == 0) - inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - if (inW[i] == 0) inW[i] = conf.img_size(); - outH.push_back(outputSize( - inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_)); - outW.push_back(outputSize( - inW[i], filterSize, padding_[i], stride_[i], caffeMode_)); - } - CHECK_EQ(outH[i], outH[0]); - CHECK_EQ(outW[i], outW[0]); - } - getOutput().setFrameHeight(outH[0]); - getOutput().setFrameWidth(outW[0]); - layerSize = outH[0] * outW[0] * size_t(numFilters_); - }; - - setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_); - - return layerSize; -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h deleted file mode 100644 index 801bc4f888c5a60e803c882dcf807678c64af20c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseLayer.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
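// Editor's note (an illustrative sketch, not part of the original patch):
// calOutputSize() above applies outputSize() to convolution inputs and imageSize()
// to deconvolution inputs; once the filter is expanded by the dilation factor the
// two are inverses of each other.  Caffe-mode formulas as assumed here, with
// hypothetical helper names and assuming the stride divides evenly for the inverse:

#include <cassert>
#include <cstdio>

// Effective filter extent after dilation.
int dilatedFilter(int filterSize, int dilation) {
  return (filterSize - 1) * dilation + 1;
}

// Forward direction (convolution): image -> feature map.
int outputSizeCaffe(int imageSize, int filterSize, int padding, int stride) {
  return (imageSize - filterSize + 2 * padding) / stride + 1;
}

// Inverse direction (deconvolution / conv-transpose): feature map -> image.
int imageSizeCaffe(int outputSize, int filterSize, int padding, int stride) {
  return (outputSize - 1) * stride + filterSize - 2 * padding;
}

int main() {
  int filter = dilatedFilter(/*filterSize=*/3, /*dilation=*/2);  // extent 5
  int out = outputSizeCaffe(/*imageSize=*/32, filter, /*padding=*/2, /*stride=*/1);
  assert(imageSizeCaffe(out, filter, /*padding=*/2, /*stride=*/1) == 32);
  std::printf("image 32 -> output %d\n", out);
  return 0;
}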
*/ - -#pragma once - -#include "Layer.h" -#include "paddle/math/MathUtils.h" -namespace paddle { - -/** - * @brief A Base Convolution Layer, which convolves the input image - * with learned filters and (optionally) adds biases. - */ - -class ConvBaseLayer : public Layer { - protected: - typedef std::vector IntV; - - /// True if it's deconv layer, false if it's convolution layer - bool isDeconv_; - - /// The number of filters. - int numFilters_; - /// The x dimension of the padding. - IntV padding_; - /// The y dimension of the padding. - IntV paddingY_; - /// The x dimension of the stride. - IntV stride_; - /// The y dimension of the stride. - IntV strideY_; - /// The x dimension of the dilation. - IntV dilation_; - /// The y dimension of the dilation. - IntV dilationY_; - /// The x dimension of a filter kernel. - IntV filterSize_; - /// The y dimension of a filter kernel. - IntV filterSizeY_; - /// The spatial dimensions of the convolution input. - IntV channels_; - /// The spatial dimensions of input feature map height. - IntV imgSizeH_; - /// The spatial dimensions of input feature map width. - IntV imgSizeW_; - /// filterPixels_ = filterSizeX_ * filterSizeY_. - IntV filterPixels_; - /// filterChannels_ = channels_/groups_. - IntV filterChannels_; - /// The spatial dimensions of output feature map height. - IntV outputH_; - /// The spatial dimensions of output feature map width. - IntV outputW_; - - IntV outputD_; - IntV imgSizeD_; - IntV filterSizeZ_; - IntV strideZ_; - IntV paddingZ_; - - /// Group size, refer to grouped convolution in - /// Alex Krizhevsky's paper: when group=2, the first half of the - /// filters are only connected to the first half of the input channels, - /// and the second half only connected to the second half. - IntV groups_; - /// Whether the bias is shared for feature in each channel. - bool sharedBiases_; - - /// shape of weight: (numChannels * filterPixels_, numFilters) - WeightList weights_; - /// If shared_biases is false shape of bias: (numFilters_, 1) - /// If shared_biases is ture shape of bias: - /// (numFilters_ * outputX * outputY, 1) - std::unique_ptr biases_; - - /// True by default. The only difference is the calculation - /// of output size. - bool caffeMode_; - - public: - explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - /** - * imgSizeH_ and imgSizeW_ will be set according to the previous input layers - * in this function. Then it will calculate outputH_ and outputW_ and set them - * into output argument. - */ - virtual size_t calOutputSize(); - - Weight& getWeight(int idx) { return *weights_[idx]; } -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp deleted file mode 100644 index 317e7d5c607683efa1e93aba9bc9ba472d37d60d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseOperator.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvBaseOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvBaseOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu) - : Operator(config, useGpu) { - CHECK(useGpu); - CHECK_EQ(config_.input_indices_size(), 2L); - - caffeMode_ = true; - getConvParams(); - computeConvSizes(); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; - workSpace_ = nullptr; - - isSelectAlgo_ = false; -} - -void ConvBaseOperator::allocConvWorkSpace() { - hl_conv_workspace(imageDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_, - /*useDilation*/ false); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - - if (maxWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - } - // total amount of storage needed - workSpace_ = hl_malloc_device(maxWorkSpace); - workSpaceInBytes_ = maxWorkSpace; - } -} - -void ConvBaseOperator::computeConvSizes() { - hl_create_filter_descriptor( - &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); - hl_create_tensor_descriptor(&imageDesc_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - imageDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); -} - -void ConvBaseOperator::reshapeImageDescriptors() { - hl_tensor_reshape(imageDesc_, - 1, - channels_, - imageH_, - imageW_, - channels_ * imageH_ * imageW_, - imageH_ * imageW_, - imageW_, - 1); - hl_tensor_reshape(outputDesc_, - 1, - numFilters_, - outputH_, - outputW_, - numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, - outputW_, - 1); - hl_reset_convolution_descriptor(convDesc_, - imageDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); -} - -void ConvBaseOperator::getConvParams() { - configNumFilters_ = config_.num_filters(); - const ConvConfig &conf = config_.conv_conf(); - padding_ = conf.padding(); - stride_ = conf.stride(); - filterSize_ = conf.filter_size(); - paddingY_ = conf.padding_y(); - strideY_ = conf.stride_y(); - filterSizeY_ = conf.filter_size_y(); - filterPixels_ = filterSize_ * filterSizeY_; - configChannels_ = conf.channels(); - imgSize_ = conf.img_size(); - imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - imgPixels_ = imgSize_ * imgSizeY_; - CHECK_EQ(conf.groups(), 1U); - filterChannels_ = conf.filter_channels(); - outputX_ = conf.output_x(); - outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - outputs_ = outputX_ * outputX_; - - isDeconv_ = (config_.type() == "conv") ? 
false : true; - if (isDeconv_) { - channels_ = configNumFilters_; - numFilters_ = configChannels_; - } else { - channels_ = configChannels_; - numFilters_ = configNumFilters_; - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h deleted file mode 100644 index c3c647cb69da5a70eb5346737cc0092e2201c89e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseOperator.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "Operator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -class ConvBaseOperator : public Operator { - public: - ConvBaseOperator(const OperatorConfig &config, bool useGpu); - /** - * Free workspace in device and destroy cudnn tensor descriptor. - */ - virtual ~ConvBaseOperator() { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - workSpaceInBytes_ = 0; - } - - hl_destroy_tensor_descriptor(imageDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_filter_descriptor(filterDesc_); - hl_destroy_convolution_descriptor(convDesc_); - } - - protected: - /** - * Get convolution parameters from layer config and - * initialize member variables. - */ - void getConvParams(); - - /** - * Allocate Gpu Memory for cudnn convolution algorithms. - */ - void allocConvWorkSpace(); - - /** - * Create cudnn tensor descriptor for convolution operation. - */ - void computeConvSizes(); - - /** - * Reshape cudnn tensor descriptor. - */ - void reshapeImageDescriptors(); - - /** - * Reshape cudnn tensor descriptor. - */ - virtual void reshape(int batchSize) = 0; - - /** - * Check filter size is equal to the size calculated by parameters from - * layer config. - */ - void checkFilterSize(const MatrixPtr &filter) { - CHECK_EQ(static_cast(filter->getWidth()), - filterSize_ * filterSizeY_ * channels_ * numFilters_); - } - - /// Most of member variables are same with CudnnConvLayer. - /// There is no explanation here. 
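// Editor's note (an illustrative sketch, not part of the original patch):
// allocConvWorkSpace() above keeps one device buffer large enough for the forward,
// backward-data and backward-filter algorithms, and only reallocates when the new
// requirement exceeds the cached size.  The sizing policy in isolation, with plain
// host memory standing in for hl_malloc_device / hl_free_mem_device:

#include <algorithm>
#include <cstddef>
#include <cstdlib>

struct WorkspaceCache {
  void* buf = nullptr;
  std::size_t bytes = 0;

  // Grow-only reservation, mirroring the maxWorkSpace logic in the operator.
  void reserve(std::size_t fwdBytes, std::size_t bwdDataBytes, std::size_t bwdFilterBytes) {
    std::size_t need = std::max(fwdBytes, std::max(bwdDataBytes, bwdFilterBytes));
    if (need > bytes) {
      std::free(buf);           // hl_free_mem_device(workSpace_) in the layer
      buf = std::malloc(need);  // hl_malloc_device(maxWorkSpace) in the layer
      bytes = need;
    }
  }

  ~WorkspaceCache() { std::free(buf); }
};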
- bool isDeconv_; - int imageH_, imageW_, outputH_, outputW_; - hl_tensor_descriptor imageDesc_; - hl_tensor_descriptor outputDesc_; - hl_filter_descriptor filterDesc_; - hl_convolution_descriptor convDesc_; - bool caffeMode_; - int inputOffset_, outputOffset_, weightOffset_; - int numFilters_, channels_; - - /// from parsing config - int configNumFilters_, configChannels_; - int padding_, stride_, filterSize_, imgSize_, imgSizeY_; - int paddingY_, strideY_, filterSizeY_; - int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; - - /// Following member variables are same with CudnnConvLayer. - /// There is no explanation here. - int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; - size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; - size_t workSpaceInBytes_; - void *workSpace_; - bool isSelectAlgo_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp deleted file mode 100644 index 39f433b78fe7ce22cc7f93b87d96ed19c10fc2e9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseProjection.cpp +++ /dev/null @@ -1,199 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvBaseProjection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -ThreadLocalD> ConvBaseProjection::convMem_; - -ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config, - ParameterPtr parameter, - bool useGpu) - : Projection(config, parameter, useGpu) { - CHECK(useGpu); // only support GPU - getConvParams(); - initCudnn(); - - size_t height = filterH_ * filterW_ * channels_ / groups_; - size_t width = numFilters_; - weight_.reset(new Weight(height, width, parameter)); - weightOffset_ = height * width / groups_; -} - -void ConvBaseProjection::getConvParams() { - const ConvConfig &conf = config_.conv_conf(); - paddingH_ = conf.padding_y(); - paddingW_ = conf.padding(); - - strideH_ = conf.stride_y(); - strideW_ = conf.stride(); - - dilationH_ = conf.dilation_y(); - dilationW_ = conf.dilation(); - CHECK_GT(dilationH_, 0); - CHECK_GT(dilationW_, 0); - - filterH_ = conf.filter_size_y(); - filterW_ = conf.filter_size(); - - configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - configImgW_ = conf.img_size(); - - configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - configOutW_ = conf.output_x(); - - configChannels_ = conf.channels(); - configNumFilters_ = config_.num_filters(); - - isDeconv_ = (config_.type() == "conv") ? false : true; - - channels_ = (isDeconv_) ? configNumFilters_ : configChannels_; - numFilters_ = (isDeconv_) ? 
configChannels_ : configNumFilters_; - - groups_ = conf.groups(); - CHECK_EQ(channels_ % groups_, 0); - CHECK_EQ(numFilters_ % groups_, 0); -} - -void ConvBaseProjection::initCudnn() { - hl_create_filter_descriptor(&filterDesc_, - channels_ / groups_, - numFilters_ / groups_, - filterH_, - filterW_); - hl_create_tensor_descriptor(&imageDesc_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - imageDesc_, - filterDesc_, - paddingH_, - paddingW_, - strideH_, - strideW_, - dilationH_, - dilationW_); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; -} - -void ConvBaseProjection::reshapeTensorDesc(int batchSize) { - // The stride between two consecutive samples in the output of ConvProjection - // may not be numFilters_ * outputH_ * outputW_ (conv) or - // channels_ * imageH_ * imageW_ (deconv) - // for example, in the case of layer ConcatenateLayer2 with two - // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. - // So the calculation of nStride is different from CudnnConvLayer. - size_t nStrideImage, nStrideOutput; - if (isDeconv_) { - nStrideImage = out_->value->getStride(); - nStrideOutput = numFilters_ * outputH_ * outputW_; - } else { - nStrideImage = channels_ * imageH_ * imageW_; - nStrideOutput = out_->value->getStride(); - } - - hl_tensor_reshape(imageDesc_, - batchSize, - channels_ / groups_, - imageH_, - imageW_, - nStrideImage, - imageH_ * imageW_, - imageW_, - 1); - - hl_tensor_reshape(outputDesc_, - batchSize, - numFilters_ / groups_, - outputH_, - outputW_, - nStrideOutput, - outputH_ * outputW_, - outputW_, - 1); - - hl_reset_convolution_descriptor(convDesc_, - imageDesc_, - filterDesc_, - paddingH_, - paddingW_, - strideH_, - strideW_, - dilationH_, - dilationW_); -} - -void ConvBaseProjection::reshape(int batchSize) { - size_t width = calOutputSize(); - CHECK_EQ(width, out_->value->getWidth()); - CHECK_EQ(calInputSize(), in_->value->getWidth()); - - reshapeTensorDesc(batchSize); - bool useDilation = false; - if (dilationH_ > 1 || dilationW_ > 1) { - useDilation = true; - } - hl_conv_workspace(imageDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_, - useDilation); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - workSpaceInBytes_ = maxWorkSpace; - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; -} - -void *ConvBaseProjection::getSpaceBytes(size_t size) { - std::vector &convMem = *convMem_; - if (convMem.empty()) { - int numDevices = hl_get_device_count(); - convMem.resize(numDevices); - } - - int devId = hl_get_device(); - MemoryHandlePtr localMem = convMem[devId]; - if (NULL == localMem || size > localMem->getAllocSize()) { - localMem = std::make_shared(size); - } - return localMem->getBuf(); -} - -ConvBaseProjection::~ConvBaseProjection() { - hl_destroy_tensor_descriptor(imageDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_filter_descriptor(filterDesc_); - hl_destroy_convolution_descriptor(convDesc_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h deleted file 
mode 100644 index f3266ae1ab945042cde9f24b7c2673c18d37bc11..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvBaseProjection.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Projection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -/** - * @brief Base class for ConvProjection and ConvTransProjection. - */ -class ConvBaseProjection : public Projection { - public: - /** - * Constructor. - */ - ConvBaseProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu); - - ~ConvBaseProjection(); - - protected: - void getConvParams(); - void initCudnn(); - - void reshapeTensorDesc(int batchSize); - void reshape(int batchSize); - - virtual size_t calOutputSize() = 0; - virtual size_t calInputSize() = 0; - - static void* getSpaceBytes(size_t size); - - /// True if it's deconv projection layer, false if it's ConvProjection layer - bool isDeconv_; - /// imageH_ and imageW_ / outputH_ and outputW_ - /// is calculated from the input layer. - int imageH_, imageW_; - int outputH_, outputW_; - /// configImgH_ and configImgW_ / configOutH_ and configOutW_ - /// is obtained from config. - int configImgH_, configImgW_; - int configOutH_, configOutW_; - /// channels_ and numFilters_ are defined in terms of convolution semantics - int channels_, numFilters_; - /// configChannels and configNumFilters_ are obtained from config - /// For Conv they are the same as channels_ and numFilters - /// For ConvTrans they are opposite to channels_ and numFilters - int configChannels_, configNumFilters_; - int paddingH_, paddingW_; - int strideH_, strideW_; - int dilationH_, dilationW_; - int filterH_, filterW_; - /// One group offset of input data. - int inputOffset_; - /// One group offset of output data. - int outputOffset_; - /// One group offset of weight. - int weightOffset_; - int groups_; - - /// Cudnn tensor descriptor for input. - hl_tensor_descriptor imageDesc_; - /// Cudnn tensor descriptor for output. - hl_tensor_descriptor outputDesc_; - /// Cudnn tensor descriptor for filter. - hl_filter_descriptor filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - hl_convolution_descriptor convDesc_; - - /// Record the algorithm for forward convolution, which is obtained by cudnn - /// api to search the best suited algorithm. - int fwdAlgo_; - /// Record the algorithm for computing convolution gradient with respect to - /// filter coefficients. - int bwdFilterAlgo_; - /// Record the algorithm for computing convolution gradient with respect to - /// the output. - int bwdDataAlgo_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// forward convolution with the specified algo. - size_t fwdLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardFilter with the specified algo. 
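// Editor's note (an illustrative sketch, not part of the original patch):
// getSpaceBytes() above hands projections on the same thread a shared scratch
// buffer per GPU device instead of giving each projection its own cudnn workspace.
// A grow-only per-device cache in that spirit, with std::vector<char> standing in
// for the GPU memory handle and thread_local playing the role of ThreadLocalD<>:

#include <cstddef>
#include <memory>
#include <vector>

using Buffer = std::vector<char>;

void* sharedScratch(int deviceId, int deviceCount, std::size_t size) {
  // One slot per device, local to the calling thread.
  thread_local std::vector<std::shared_ptr<Buffer>> perDevice;
  if (perDevice.empty()) perDevice.resize(deviceCount);

  std::shared_ptr<Buffer>& slot = perDevice[deviceId];
  if (!slot || size > slot->size()) {
    slot = std::make_shared<Buffer>(size);  // grow-only; the old buffer is released
  }
  return slot->data();
}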
- size_t bwdDataLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardData with the specified algo. - size_t bwdFilterLimitBytes_; - /// Size of total work space. - size_t workSpaceInBytes_; - bool bias_; - - std::unique_ptr weight_; - static ThreadLocalD> convMem_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp deleted file mode 100644 index 45498b92d32e0fa72adbe95a98e8d30c7f8929e2..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvOperator.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -REGISTER_OPERATOR(conv, ConvOperator); - -void ConvOperator::reshape(int batchSize) { - imageH_ = ins_[0]->getFrameHeight(); - imageW_ = ins_[0]->getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSizeY_; - if (imageW_ == 0) imageW_ = imgSize_; - outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); - outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); - /// Check that the outputSizes are consistent with config - CHECK_EQ(outputH_, outputY_); - CHECK_EQ(outputW_, outputX_); - out_->setFrameHeight(outputH_); - out_->setFrameWidth(outputW_); - - reshapeImageDescriptors(); - - inputOffset_ = channels_ * imageH_ * imageW_; - outputOffset_ = numFilters_ * outputH_ * outputW_; - weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; - - if (!isSelectAlgo_) { - allocConvWorkSpace(); - } - - isSelectAlgo_ = true; -} - -void ConvOperator::forward() { - size_t batchSize = ins_[0]->value->getHeight(); - reshape(batchSize); - CHECK_EQ(ins_[1]->value->getHeight(), batchSize); - checkFilterSize(ins_[1]->value); - Matrix::resizeOrCreate(out_->value, - batchSize, - outputH_ * outputW_ * numFilters_, - false, - useGpu_); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_forward(imageDesc_, - inputData, - outputDesc_, - outData, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - fwdAlgo_); - } - } -} - -void ConvOperator::backward() { - size_t batchSize = ins_[0]->value->getHeight(); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *outGrad = 
out_->grad->getData() + outputOffset_ * batchId; - if (ins_[1]->grad) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(imageDesc_, - inputData, - outputDesc_, - outGrad, - filterDesc_, - weightGrad, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdFilterAlgo_); - } - - MatrixPtr preGrad = ins_[0]->grad; - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_backward_data(imageDesc_, - inputGrad, - outputDesc_, - outGrad, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdDataAlgo_); - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h deleted file mode 100644 index 527dbf8c270f35e19ca23acd8a3ba8197d03b988..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvOperator.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "ConvBaseOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -class ConvOperator : public ConvBaseOperator { - public: - ConvOperator(const OperatorConfig &config, bool useGpu) - : ConvBaseOperator(config, useGpu) {} - /** - * Free workspace in device and destroy cudnn tensor descriptor. - */ - virtual ~ConvOperator() {} - void forward() override; - void backward() override; - void reshape(int batchSize) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp deleted file mode 100644 index f382e6cab12a833ce555c948f41e1086093bd78e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvProjection.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ConvProjection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_PROJECTION(conv, ConvProjection); - -size_t ConvProjection::calOutputSize() { - imageH_ = in_->getFrameHeight(); - imageW_ = in_->getFrameWidth(); - if (imageH_ == 0) imageH_ = configImgH_; - if (imageW_ == 0) imageW_ = configImgW_; - outputH_ = outputSize(imageH_, - (filterH_ - 1) * dilationH_ + 1, - paddingH_, - strideH_, - /* caffeMode */ true); - outputW_ = outputSize(imageW_, - (filterW_ - 1) * dilationW_ + 1, - paddingW_, - strideW_, - /* caffeMode */ true); - - const_cast(out_)->setFrameHeight(outputH_); - const_cast(out_)->setFrameWidth(outputW_); - - inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_; - outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_; - return outputH_ * outputW_ * configNumFilters_; -} - -size_t ConvProjection::calInputSize() { - return static_cast(configChannels_ * imageH_ * imageW_); -} - -void ConvProjection::forward() { - int batchSize = in_->value->getHeight(); - reshape(batchSize); - - void *workSpace = NULL; - if (workSpaceInBytes_ > 0) { - workSpace = getSpaceBytes(workSpaceInBytes_); - } - - for (int g = 0; g < groups_; ++g) { - REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); - - real *inputData = in_->value->getData() + g * inputOffset_; - real *wgtData = weight_->getW()->getData() + g * weightOffset_; - real *outData = out_->value->getData() + g * outputOffset_; - hl_convolution_forward(imageDesc_, - inputData, - outputDesc_, - outData, - filterDesc_, - wgtData, - convDesc_, - workSpace, - fwdLimitBytes_, - fwdAlgo_); - } -} - -void ConvProjection::backward(const UpdateCallback &callback) { - REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); - - void *workSpace = NULL; - if (workSpaceInBytes_ > 0) { - workSpace = getSpaceBytes(workSpaceInBytes_); - } - - for (int g = 0; g < groups_; ++g) { - real *outGrad = out_->grad->getData() + g * outputOffset_; - if (weight_->getWGrad()) { - real *inputData = in_->value->getData() + g * inputOffset_; - real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; - hl_convolution_backward_filter(imageDesc_, - inputData, - outputDesc_, - outGrad, - filterDesc_, - weightGrad, - convDesc_, - workSpace, - bwdFilterLimitBytes_, - bwdFilterAlgo_); - } - - MatrixPtr preGrad = in_->grad; - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + g * inputOffset_; - real *wgtData = weight_->getW()->getData() + g * weightOffset_; - hl_convolution_backward_data(imageDesc_, - inputGrad, - outputDesc_, - outGrad, - filterDesc_, - wgtData, - convDesc_, - workSpace, - bwdDataLimitBytes_, - bwdDataAlgo_); - } - } - - weight_->getParameterPtr()->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h deleted file mode 100644 index 22a2202bb6cc256a4a5897724d8eb8a93fefb79f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvProjection.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvBaseProjection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -/** - * @brief Convolution projection do the same calculation with CudnnConvLayer. - */ -class ConvProjection : public ConvBaseProjection { - public: - /** - * Constructor. - */ - ConvProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : ConvBaseProjection(config, parameter, useGpu) {} - - ~ConvProjection() {} - - virtual void forward(); - virtual void backward(const UpdateCallback& callback); - virtual size_t calOutputSize(); - virtual size_t calInputSize(); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp deleted file mode 100644 index 615c3478061b591ea30cbf0b3d27ef2551c0dd28..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvShiftLayer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for circular convluation of two vectors, - * which is used in NEURAL TURING MACHINE. - * - Input: two vectors, the first is data (batchSize x dataDim) - * the second is shift weights (batchSize x shiftDim) - * - Output: a vector (batchSize x dataDim) - * Assumed that: - * - a[in]: contains M elements. - * - b[in]: contains N elements (N should be odd). - * - c[out]: contains M elements. - * - * \f[ - * c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j} - * \f] - * - * In this formula: - * - a's index is computed modulo M. - * - b's index is comupted modulo N. - * - * The config file api is conv_shift_layer. 
- */ - -class ConvShiftLayer : public Layer { - public: - explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {} - - ~ConvShiftLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(conv_shift, ConvShiftLayer); - -bool ConvShiftLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - return true; -} - -void ConvShiftLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t dataDim = inV0->getWidth(); - - CHECK_EQ(batchSize, inV1->getHeight()); - CHECK_EQ(dataDim, getSize()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str()); - outV->circularConv(*inV0, *inV1); -} - -void ConvShiftLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr outG = getOutputGrad(); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - - REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str()); - - if (inG0 && inG1) { - outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1); - } else { - CHECK(!inG0 || !inG1) << "Not supported"; - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.cpp b/paddle/gserver/layers/ConvTransOperator.cpp deleted file mode 100644 index ac41d6f9a4f86364930e27ee401406432e731b65..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransOperator.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ConvTransOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvTransOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. 
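// Editor's note (an illustrative sketch, not part of the original patch): a scalar
// reference for the per-sample circular convolution that the conv_shift layer
// removed above delegates to Matrix::circularConv.  It follows the formula in that
// layer's comment, taking a's index modulo M and b's index modulo N (N odd); an
// alternative reading maps j to b[j + (N-1)/2], so treat this as a sketch of the
// documented formula, not the actual kernel.

#include <cstddef>
#include <vector>

std::vector<double> circularConvRef(const std::vector<double>& a,    // data, M elements
                                    const std::vector<double>& b) {  // shift weights, N odd
  const int M = static_cast<int>(a.size());
  const int N = static_cast<int>(b.size());
  const int half = (N - 1) / 2;
  std::vector<double> c(M, 0.0);
  for (int i = 0; i < M; ++i) {
    for (int j = -half; j <= half; ++j) {
      const int ai = ((i + j) % M + M) % M;  // a's index, modulo M
      const int bi = ((j % N) + N) % N;      // b's index, modulo N
      c[i] += a[ai] * b[bi];
    }
  }
  return c;
}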
- */ - -REGISTER_OPERATOR(convt, ConvTransOperator); - -void ConvTransOperator::reshape(int batchSize) { - outputH_ = ins_[0]->getFrameHeight(); - outputW_ = ins_[0]->getFrameWidth(); - if (outputH_ == 0) outputH_ = outputY_; - if (outputW_ == 0) outputW_ = outputX_; - imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_); - imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_); - /// Check that the imageSizes are consistent with config - CHECK_EQ(imageH_, imgSizeY_); - CHECK_EQ(imageW_, imgSize_); - out_->setFrameHeight(imageH_); - out_->setFrameWidth(imageW_); - - reshapeImageDescriptors(); - - inputOffset_ = numFilters_ * outputH_ * outputW_; - outputOffset_ = channels_ * imageH_ * imageW_; - weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; - - if (!isSelectAlgo_) { - allocConvWorkSpace(); - } - - isSelectAlgo_ = true; -} - -void ConvTransOperator::forward() { - size_t batchSize = ins_[0]->value->getHeight(); - reshape(batchSize); - CHECK_EQ(ins_[1]->value->getHeight(), batchSize); - checkFilterSize(ins_[1]->value); - Matrix::resizeOrCreate( - out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_backward_data(imageDesc_, - outData, - outputDesc_, - inputData, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdDataAlgo_); - } - } -} - -void ConvTransOperator::backward() { - size_t batchSize = ins_[0]->value->getHeight(); - { - AsyncGpuBlock block; - for (size_t batchId = 0; batchId < batchSize; ++batchId) { - real *outGrad = out_->grad->getData() + outputOffset_ * batchId; - if (ins_[1]->grad) { - real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; - real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(imageDesc_, - outGrad, - outputDesc_, - inputData, - filterDesc_, - weightGrad, - convDesc_, - workSpace_, - workSpaceInBytes_, - bwdFilterAlgo_); - } - - MatrixPtr preGrad = ins_[0]->grad; - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_ * batchId; - real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_forward(imageDesc_, - outGrad, - outputDesc_, - inputGrad, - filterDesc_, - wgtData, - convDesc_, - workSpace_, - workSpaceInBytes_, - fwdAlgo_); - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h deleted file mode 100644 index 53cb7a21b49189898d09aa20cd46d04cc5c20198..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransOperator.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "ConvBaseOperator.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief ConvTransOperator takes two inputs to perform the convolution. - * The first input is the image, and the second input is the convolution kernel. - * The height of data for two inputs are the same. Each data of the first input - * is convolved with each data of the second input indepedently. - * - * The config file api is conv_operator. - */ - -class ConvTransOperator : public ConvBaseOperator { - public: - ConvTransOperator(const OperatorConfig &config, bool useGpu) - : ConvBaseOperator(config, useGpu) {} - /** - * Free workspace in device and destroy cudnn tensor descriptor. - */ - virtual ~ConvTransOperator() {} - void forward() override; - void backward() override; - void reshape(int batchSize) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp deleted file mode 100644 index 242ce34a607057069a4d0a31e9b70d56279d37ab..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransProjection.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ConvTransProjection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_PROJECTION(convt, ConvTransProjection); -size_t ConvTransProjection::calOutputSize() { - outputH_ = in_->getFrameHeight(); - outputW_ = in_->getFrameWidth(); - if (outputH_ == 0) outputH_ = configOutH_; - if (outputW_ == 0) outputW_ = configOutW_; - imageH_ = imageSize(outputH_, - (filterH_ - 1) * dilationH_ + 1, - paddingH_, - strideH_, - /* caffeMode */ true); - - imageW_ = imageSize(outputW_, - (filterW_ - 1) * dilationW_ + 1, - paddingW_, - strideW_, - /* caffeMode */ true); - - const_cast(out_)->setFrameHeight(imageH_); - const_cast(out_)->setFrameWidth(imageW_); - - inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_; - outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_; - return imageH_ * imageW_ * configNumFilters_; -} - -size_t ConvTransProjection::calInputSize() { - return static_cast(configChannels_ * outputH_ * outputW_); -} - -void ConvTransProjection::forward() { - int batchSize = in_->value->getHeight(); - reshape(batchSize); - - void *workSpace = NULL; - if (workSpaceInBytes_ > 0) { - workSpace = getSpaceBytes(workSpaceInBytes_); - } - - for (int g = 0; g < groups_; ++g) { - REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str()); - - real *inData = in_->value->getData() + g * inputOffset_; - real *wgtData = weight_->getW()->getData() + g * weightOffset_; - real *outData = out_->value->getData() + g * outputOffset_; - hl_convolution_backward_data(imageDesc_, - outData, - outputDesc_, - inData, - filterDesc_, - wgtData, - convDesc_, - workSpace, - bwdDataLimitBytes_, - bwdDataAlgo_); - } -} - -void ConvTransProjection::backward(const UpdateCallback &callback) { - REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str()); - - void *workSpace = NULL; - if (workSpaceInBytes_ > 0) { - workSpace = getSpaceBytes(workSpaceInBytes_); - } - - for (int g = 0; g < groups_; ++g) { - real *outGrad = out_->grad->getData() + g * outputOffset_; - if (weight_->getWGrad()) { - real *inData = in_->value->getData() + g * inputOffset_; - real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; - hl_convolution_backward_filter(imageDesc_, - outGrad, - outputDesc_, - inData, - filterDesc_, - weightGrad, - convDesc_, - workSpace, - bwdFilterLimitBytes_, - bwdFilterAlgo_); - } - - MatrixPtr preGrad = in_->grad; - if (NULL != preGrad) { - real *inGrad = preGrad->getData() + g * inputOffset_; - real *wgtData = weight_->getW()->getData() + g * weightOffset_; - hl_convolution_forward(imageDesc_, - outGrad, - outputDesc_, - inGrad, - filterDesc_, - wgtData, - convDesc_, - workSpace, - fwdLimitBytes_, - fwdAlgo_); - } - } - - weight_->getParameterPtr()->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h deleted file mode 100644 index 0f9ed720d3b8855a3a24ac25a1c3917c4b98e81d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvTransProjection.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvBaseProjection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -/** - * @brief Convolution projection do the same calculation with CudnnConvLayer. - */ -class ConvTransProjection : public ConvBaseProjection { - public: - /** - * Constructor. - */ - ConvTransProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : ConvBaseProjection(config, parameter, useGpu) {} - - ~ConvTransProjection() {} - - virtual void forward(); - virtual void backward(const UpdateCallback& callback); - virtual size_t calOutputSize(); - virtual size_t calInputSize(); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp deleted file mode 100644 index 31363d97c4fd318ec2c6d48f9200f6ba1f49ba11..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for weighted sum of vectors, - * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND - * TRANSLATE - * - Input: the the size of the first input is weightDim, - * and the size of the second input is weightdim * dataDim. - * - Output: the sizeof the output is dataDim - * \f[ - * out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)), - * i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1) - * \f] - * Note that the above computation is for one sample. Multiple samples are - * processed in one batch. - * - * The config file api is linear_comb_layer. - */ -class ConvexCombinationLayer : public Layer { - protected: - /// A matrix pointer pointing to second input. - MatrixPtr tmpMtx0; - /// A matrix pointer pointing to first input. - MatrixPtr tmpRow0; - /// A matrix pointer pointing to output. 
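// Editor's note (an illustrative sketch, not part of the original patch): per sample,
// the linear_comb computation described in the comment above treats the second input
// as a weightDim x dataDim row-major matrix and left-multiplies it by the weight row
// vector, i.e. out[j] = sum_i w[i] * data[i * dataDim + j].  A scalar reference:

#include <cstddef>
#include <vector>

std::vector<double> linearCombRef(const std::vector<double>& w,     // weightDim elements
                                  const std::vector<double>& data,  // weightDim * dataDim
                                  std::size_t dataDim) {
  const std::size_t weightDim = w.size();
  std::vector<double> out(dataDim, 0.0);
  for (std::size_t i = 0; i < weightDim; ++i) {
    for (std::size_t j = 0; j < dataDim; ++j) {
      out[j] += w[i] * data[i * dataDim + j];  // row i of the per-sample matrix
    }
  }
  return out;
}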
- MatrixPtr tmpRow1; - - public: - explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {} - - ~ConvexCombinationLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(convex_comb, ConvexCombinationLayer); - -bool ConvexCombinationLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(2U, inputLayers_.size()); - size_t dataDim = getSize(); - size_t weightDim = inputLayers_[0]->getSize(); - - CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize()) - << "Dimension mismatch"; - - tmpRow0 = Matrix::create(nullptr, - /* height= */ 1, - weightDim, - /* trans= */ false, - useGpu_); - tmpRow1 = Matrix::create(nullptr, - /* height= */ 1, - dataDim, - /* trans= */ false, - useGpu_); - tmpMtx0 = Matrix::create(nullptr, - /* height= */ weightDim, - dataDim, - /* trans= */ false, - useGpu_); - - return true; -} - -void ConvexCombinationLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t weightDim = inV0->getWidth(); - size_t dataDim = getSize(); - - CHECK_EQ(batchSize, inV1->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str()); - for (size_t i = 0; i < batchSize; i++) { - tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim); - tmpRow0->setData(inV0->getData() + i * weightDim); - tmpRow1->setData(outV->getData() + i * dataDim); - - tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0); - } -} - -void ConvexCombinationLayer::backward(const UpdateCallback& callback) { - MatrixPtr outG = getOutputGrad(); - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - - size_t batchSize = inV0->getHeight(); - size_t weightDim = inV0->getWidth(); - size_t dataDim = getSize(); - - REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str()); - - if (inG0) { - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inG0->getData() + i * weightDim); - tmpRow1->setData(outG->getData() + i * dataDim); - tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim); - - tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1); - } - } - - if (inG1) { - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inV0->getData() + i * weightDim); - tmpRow1->setData(outG->getData() + i * dataDim); - tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim); - - tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp deleted file mode 100644 index 4e44a5e8dfdad98bff0cd0f405b4227340a45728..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CosSimLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(cos, CosSimLayer); - -bool CosSimLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2LU); - - createFunction(forward_, - "CosSimForward", - FuncConfig().set("scale", (real)config_.cos_scale())); - createFunction(backward_, - "CosSimBackward", - FuncConfig().set("scale", (real)config_.cos_scale())); - - return true; -} - -void CosSimLayer::forward(PassType passType) { - Layer::forward(passType); - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(0)->getHeight(); - int size = getSize(); - CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; - - { - REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str()); - reserveOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - /* activation */ { - REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str()); - MatrixPtr prevOut1 = getInputValue(0); - MatrixPtr prevOut2 = getInputValue(1); - - CHECK(outV && prevOut1 && prevOut2); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*prevOut1); - inputs.addArg(*prevOut2); - outputs.addArg(*outV, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); - } -} - -void CosSimLayer::backward(const UpdateCallback& callback) { - /* activation */ { - REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str()); - CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed"; - - const auto outG = this->getOutputGrad(); - const auto outV = this->getOutputValue(); - const auto inV1 = this->getInputValue(0); - const auto inV2 = this->getInputValue(1); - auto inG1 = this->getInputGrad(0); - auto inG2 = this->getInputGrad(1); - CHECK(outG && outV && inV1 && inV2 && inG1 && inG2); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*outG); - inputs.addArg(*outV); - inputs.addArg(*inV1); - inputs.addArg(*inV2); - outputs.addArg(*inG1, ADD_TO); - outputs.addArg(*inG2, ADD_TO); - - backward_[0]->calc(inputs, outputs); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h deleted file mode 100644 index d9fe1ff270f1f76e3b246dca374ddf45445419f9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CosSimLayer.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { -/** - * @brief A layer for calculating cosine similarity between two vector - * \f[ - * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+... - * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}} - * \f] - * - * - Input1: A vector (batchSize * dataDim) * - * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) * - * - Output: A vector (batchSize * 1) - * - * The config file api is cos_sim. - */ -class CosSimLayer : public Layer { - public: - explicit CosSimLayer(const LayerConfig& config) : Layer(config) {} - - ~CosSimLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp deleted file mode 100644 index 230ecc768b4d7314b21ac1d76899c3c3bab12309..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CosSimVecMatLayer.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { -/** - * @brief A layer for computing cosine similarity between a vector - * and each row of a matrix - * out[i] = cos_scale * cos(in1, in2(i,:)); - * @note used in NEURAL TURING MACHINE - * - * Input1: a vector (batchSize * dataDim) - * - * Input2: a matrix in vector form (batchSize * (weightDim*dataDim)) - * - * Output: a vector (batchSize * weightDim) - */ - -class CosSimVecMatLayer : public Layer { - protected: - MatrixPtr tmpMtx0; - MatrixPtr tmpMtx1; - MatrixPtr tmpRow0; - MatrixPtr tmpRow1; - MatrixPtr tmpRow2; - MatrixPtr tmpRow3; - - public: - explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {} - - ~CosSimVecMatLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(cos_vm, CosSimVecMatLayer); - -bool CosSimVecMatLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - size_t dataDim = inputLayers_[0]->getSize(); - size_t numKeys = getSize(); - size_t memoryDim = inputLayers_[1]->getSize(); - - CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch"; - - tmpRow0 = Matrix::create(nullptr, - /* height= */ 1, - dataDim, - /* trans= */ false, - useGpu_); - tmpRow1 = Matrix::create(nullptr, - /* height= */ 1, - dataDim, - /* trans= */ false, - useGpu_); - tmpRow2 = Matrix::create(nullptr, - /* height= */ numKeys, - 1, - /* trans= */ false, - useGpu_); - tmpRow3 = Matrix::create(nullptr, - /* height= */ numKeys, - 1, - /* trans= */ false, - useGpu_); - - tmpMtx0 = Matrix::create(nullptr, - /* height= */ numKeys, - dataDim, - /* trans= */ false, - useGpu_); - tmpMtx1 = Matrix::create(nullptr, - /* height= */ numKeys, - dataDim, - /* trans= */ false, - useGpu_); - - CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1); - - createFunction(forward_, - "CosSimForward", - FuncConfig().set("scale", (real)config_.cos_scale())); - createFunction(backward_, - "CosSimBackward", - FuncConfig().set("scale", (real)config_.cos_scale())); - - return true; -} - -void CosSimVecMatLayer::forward(PassType passType) { - Layer::forward(passType); - CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t numKeys = getSize(); - - CHECK_EQ(batchSize, inV1->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, numKeys); - } - - MatrixPtr outV = getOutputValue(); - CHECK(outV && inV0 && inV1); - REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str()); - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inV0->rowBuf(i)); - tmpMtx0->setData(inV1->rowBuf(i)); - tmpRow2->setData(outV->rowBuf(i)); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*tmpMtx0); - inputs.addArg(*tmpRow0); - outputs.addArg(*tmpRow2, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); - } -} - -void CosSimVecMatLayer::backward(const UpdateCallback& callback) { - CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed"; - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = 
getInputGrad(1); - MatrixPtr outV = getOutputValue(); - MatrixPtr outG = getOutputGrad(); - - size_t batchSize = inV0->getHeight(); - CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG); - REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str()); - - for (size_t i = 0; i < batchSize; i++) { - tmpRow0->setData(inV0->rowBuf(i)); - tmpRow1->setData(inG0->rowBuf(i)); - tmpMtx0->setData(inV1->rowBuf(i)); - tmpMtx1->setData(inG1->rowBuf(i)); - tmpRow2->setData(outV->rowBuf(i)); - tmpRow3->setData(outG->rowBuf(i)); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*tmpRow3); - inputs.addArg(*tmpRow2); - inputs.addArg(*tmpMtx0); - inputs.addArg(*tmpRow0); - outputs.addArg(*tmpMtx1, ADD_TO); - outputs.addArg(*tmpRow1, ADD_TO); - - backward_[0]->calc(inputs, outputs); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp deleted file mode 100644 index 1327616950a8887efa2cba410fa7ae8b5bd97da4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CostLayer.cpp +++ /dev/null @@ -1,748 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CostLayer.h" -#include -#include -#include -#include "paddle/utils/Logging.h" - -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -bool CostLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - bool ret = Layer::init(layerMap, parameterMap); - coeff_ = config_.coeff(); - if (!ret) return ret; - CHECK_GE(inputLayers_.size(), 2UL); - CHECK_LE(inputLayers_.size(), 3UL); - if (inputLayers_.size() == 3) { - weightLayer_ = inputLayers_[2]; - } - return true; -} - -void CostLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(*getOutputLayer())->getHeight(); - int size = 1; - resetOutput(batchSize, size); - - const MatrixPtr& output = getInputValue(*getOutputLayer()); - Argument label = getInput(*getLabelLayer()); - - /* get the cost value for each sample*/ - forwardImp(*output, label, *getOutputValue()); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - getOutputValue()->dotMul(*getOutputValue(), *weight); - } -} - -void CostLayer::backward(const UpdateCallback& callback) { - (void)callback; - - const Argument& output = getInput(*getOutputLayer()); - Argument label = getInput(*getLabelLayer()); - - bool support = true; - if (weightLayer_) { - support = output.grad->getAbsSum() == 0; - } - - backwardImp(*output.value, label, *output.grad); - - if (weightLayer_) { - CHECK(support) << "Weighted cost layer '" << getName() - << "' must be the last layer " - "connected to the output layer '" - << getOutputLayer()->getName() << "'"; - output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_)); - } - if (coeff_ != real(1.0f)) { - output.grad->add(coeff_, 0); - } -} - -// -// class MultiClassCrossEntropy -// -bool MultiClassCrossEntropy::init(const LayerMap& 
layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void MultiClassCrossEntropy::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - target.oneHotCrossEntropy(output, *label.ids); -} - -void MultiClassCrossEntropy::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - outputG.oneHotCrossEntropyBp(output, *label.ids); -} - -// -// class MultiClassCrossEntropyWithSelfNorm -// -REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm, - MultiClassCrossEntropyWithSelfNorm); - -bool MultiClassCrossEntropyWithSelfNorm::init( - const LayerMap& layerMap, const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); - output.rowSum(*sftMaxSum_); - sftMaxSum_->log2(); - - target.oneHotCrossEntropy(output, *label.ids); - target.add(*sftMaxSum_); - - sftMaxSum_->square2(); - target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha()); -} - -void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); - output.rowSum(*sftMaxSum_); - - Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_); - sftMaxSum_->reciprocal2(*sumInv_); - - outputG.oneHotCrossEntropyBp(output, *label.ids); - outputG.addColumnVector(*sumInv_); - - sftMaxSum_->log2(); - sumInv_->dotMul(*sumInv_, *sftMaxSum_); - sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha()); - - outputG.addColumnVector(*sumInv_); -} - -// -// class SoftBinaryClassCrossEntropy -// -REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy); - -bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - Matrix::resizeOrCreate( - targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); - - targetPerDim_->softCrossEntropy(output, *label.value); - targetPerDim_->rowSum(target); -} - -void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - outputG.softCrossEntropyBp(output, *label.value); -} - -// -// class SumOfSquaresCostLayer -// - -REGISTER_LAYER(square_error, SumOfSquaresCostLayer); - -bool SumOfSquaresCostLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void SumOfSquaresCostLayer::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - target.sumOfSquares(output, *label.value); -} - -void SumOfSquaresCostLayer::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - outputG.sumOfSquaresBp(output, *label.value); -} - -// -// class SmoothL1CostLayer -// - -REGISTER_LAYER(smooth_l1, SmoothL1CostLayer); - -bool SmoothL1CostLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void SmoothL1CostLayer::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - MatrixPtr targetCpu, outputCpu, labelCpu; - if (useGpu_) { - targetCpu = - Matrix::create(target.getHeight(), target.getWidth(), false, false); - outputCpu = - Matrix::create(output.getHeight(), 
output.getWidth(), false, false); - labelCpu = Matrix::create( - label.value->getHeight(), label.value->getWidth(), false, false); - targetCpu->copyFrom(target); - outputCpu->copyFrom(output); - labelCpu->copyFrom(*label.value); - targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0); - target.copyFrom(*targetCpu); - } else { - target.smoothL1(output, *label.value, 1.0); - } -} - -void SmoothL1CostLayer::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - MatrixPtr outputGCpu, outputCpu, labelCpu; - if (useGpu_) { - outputGCpu = - Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false); - outputCpu = - Matrix::create(output.getHeight(), output.getWidth(), false, false); - labelCpu = Matrix::create( - label.value->getHeight(), label.value->getWidth(), false, false); - outputGCpu->copyFrom(outputG); - outputCpu->copyFrom(output); - labelCpu->copyFrom(*label.value); - outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0); - outputG.copyFrom(*outputGCpu); - } else { - outputG.smoothL1Bp(output, *label.value, 1.0); - } -} - -// -// class RankingCost -// -bool RankingCost::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - posPairCount_ = 0; - negPairCount_ = 0; - - bool ret = Layer::init(layerMap, parameterMap); - if (!ret) return ret; - CHECK_GE(inputLayers_.size(), 3UL); - CHECK_LE(inputLayers_.size(), 4UL); - if (inputLayers_.size() == 4) { - weightLayer_ = inputLayers_[3]; - } - return true; -} - -void RankingCost::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(*getOutputLayer(0))->getHeight(); - int size = 1; - resizeOutput(batchSize, size); - Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_); - MatrixPtr label = getInputValue(*getLabelLayer()); - if (!label) { - // input label is not in value, try ids - IVectorPtr idLabel = getInput(*getLabelLayer()).ids; - CHECK(idLabel) << "label layer has neither value nor ids"; - CHECK_EQ((size_t)batchSize, idLabel->getSize()); - Matrix::resizeOrCreate( - labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_); - labelBuf_->copyFrom(*idLabel); - label = labelBuf_; - } - - MatrixPtr output[] = {getInputValue(*getOutputLayer(0)), - getInputValue(*getOutputLayer(1))}; - MatrixPtr target = this->getOutputValue(); - margin_->sub(*output[0], *output[1]); - - // for validation - size_t height = output[0]->getHeight(); - target->biggerThan(*(output[0]), *(output[1]), *label); - double total = static_cast(height); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - target->dotMul(*target, *weight); - total = weight->getSum(); - } - double pos = target->getSum(); - posPairCount_ += pos; - negPairCount_ += (total - pos); - - // forward - target->logisticRegressionLoss(*margin_, *label); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - target->dotMul(*target, *weight); - } -} - -void RankingCost::backward(const UpdateCallback& callback) { - (void)callback; - - MatrixPtr label = getInputValue(*getLabelLayer()); - if (!label) { - // input label is not in value, but in ids - // use labelBuf_ (should already resized and copied during forward) - label = labelBuf_; - } - - Matrix::resizeOrCreate( - marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_); - marginGrad_->zeroMem(); - marginGrad_->logisticRegressionLossBp(*margin_, *label); - if (weightLayer_) { - const MatrixPtr& weight = getInputValue(*weightLayer_); - 
marginGrad_->dotMul(*marginGrad_, *weight); - } - - getInputGrad(0)->add(*marginGrad_); - getInputGrad(1)->sub(*marginGrad_); -} - -void RankingCost::onPassEnd() { - double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_); - LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_ - << " neg= " << negPairCount_; - - posPairCount_ = 0; - negPairCount_ = 0; -} - -// -// class LambdaCost -// -REGISTER_LAYER(lambda_cost, LambdaCost); - -bool LambdaCost::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - truncationSize_ = config_.ndcg_num(); - maxSortSize_ = config_.max_sort_size(); - if (maxSortSize_ != -1) { - CHECK_GE(maxSortSize_, truncationSize_) - << "maxSortSize must be greater than or equal to NDCG size!"; - } - LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_ - << ", Max partial sort size = " << maxSortSize_; - CHECK(!useGpu_) << "LambdaRank supports CPU only!"; - return Layer::init(layerMap, parameterMap); -} - -void LambdaCost::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(*getOutputLayer())->getHeight(); - resizeOutput(batchSize, 1); - - MatrixPtr score = getInputValue(*getScoreLayer()); - MatrixPtr output = getInputValue(*getOutputLayer()); - MatrixPtr target = this->getOutputValue(); - - real* scoreData = score->getData(); - real* outputData = output->getData(); - real* targetData = target->getData(); - - auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; - const int* startPosData = startPos->getData(false); - size_t batchNum = startPos->getSize() - 1; - for (size_t i = 0; i < batchNum; ++i) { - int beginPos = startPosData[i]; - int endPos = startPosData[i + 1]; - real NDCG = calcNDCG( - outputData + beginPos, scoreData + beginPos, endPos - beginPos); - for (int j = beginPos; j < endPos; ++j) { - targetData[j] = NDCG; - } - } -} - -void LambdaCost::backward(const UpdateCallback& callback) { - (void)callback; - MatrixPtr score = getInputValue(*getScoreLayer()); - MatrixPtr output = getInputValue(*getOutputLayer()); - Matrix::resizeOrCreate(marginGrad_, - score->getHeight(), - 1, - /* trans= */ false, - useGpu_); - marginGrad_->zeroMem(); - - real* gradData = marginGrad_->getData(); - real* scoreData = score->getData(); - real* outputData = output->getData(); - - auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; - const int* startPosData = startPos->getData(false); - size_t batchNum = startPos->getSize() - 1; - - for (size_t i = 0; i < batchNum; ++i) { - int beginPos = startPosData[i]; - int endPos = startPosData[i + 1]; - calcGrad(outputData + beginPos, - scoreData + beginPos, - gradData + beginPos, - endPos - beginPos); - } - - getInputGrad(0)->add(*marginGrad_); -} - -void LambdaCost::calcGrad(const real* outputScore, - const real* score, - real* gradData, - int size) { - CHECK_GE(size, truncationSize_) - << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; - int sortSize = maxSortSize_ == -1 ? 
size : std::min(maxSortSize_, size); - - scorePair_.clear(); - for (int i = 0; i < size; ++i) { - scorePair_.push_back(std::make_pair(score[i], i)); - } - if (size <= sortSize) { - std::sort(scorePair_.begin(), - scorePair_.end(), - [](const std::pair& a, const std::pair& b) { - return a.first > b.first; - }); - } else { - std::partial_sort( - scorePair_.begin(), - scorePair_.begin() + sortSize, - scorePair_.end(), - [](const std::pair& a, const std::pair& b) { - return a.first > b.first; - }); - } - - real maxDCG = 0; - for (int i = 0; i < truncationSize_; ++i) { - maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2); - } - CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!"; - - for (int i = 0; i < sortSize; ++i) { - for (int j = i + 1; j < size; ++j) { - int index_i = scorePair_[i].second; - int index_j = scorePair_[j].second; - real score_i = score[index_i]; - real score_j = score[index_j]; - real dcgDif = 0; - if (j < sortSize) { - dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) * - (1 / std::log(i + 2) - 1 / std::log(j + 2)); - } else { - dcgDif = - (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2); - } - - real lambda_ij = - -std::abs(dcgDif) / - (1 + std::exp(outputScore[index_i] - outputScore[index_j])); - gradData[index_i] += lambda_ij / maxDCG; - gradData[index_j] -= lambda_ij / maxDCG; - } - } -} - -real LambdaCost::calcNDCG(const real* outputScore, - const real* score, - int size) { - CHECK_GE(size, truncationSize_) - << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; - - outputScorePair_.clear(); - for (int i = 0; i < size; ++i) { - outputScorePair_.push_back(std::make_pair(outputScore[i], i)); - } - std::partial_sort( - outputScorePair_.begin(), - outputScorePair_.begin() + truncationSize_, - outputScorePair_.end(), - [](const std::pair& a, const std::pair& b) { - return a.first > b.first; - }); - - real DCG = 0; - for (int i = 0; i < truncationSize_; ++i) { - DCG += - (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2); - } - - scoreVec_.resize(size); - std::copy(score, score + size, scoreVec_.begin()); - real maxDCG = 0; - std::partial_sort(scoreVec_.begin(), - scoreVec_.begin() + truncationSize_, - scoreVec_.end(), - std::greater()); - for (int i = 0; i < truncationSize_; ++i) { - maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2); - } - CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!"; - - return DCG / maxDCG; -} - -// -// class MultiBinaryLabelCrossEntropy -// - -REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy); - -bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return CostLayer::init(layerMap, parameterMap); -} - -void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - MatrixPtr value = nullptr; - if (label.ids) { - CHECK(!label.value); - value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); - } else { - CHECK(label.value); - value = label.value; - } - - if (dynamic_cast(value.get()) || - dynamic_cast(value.get())) { - target.multiBinaryLabelCrossEntropy(output, *value); - } else { - Matrix::resizeOrCreate( - targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); - - targetPerDim_->binaryLabelCrossEntropy(output, *value); - targetPerDim_->rowSum(target); - } -} - -void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - MatrixPtr value = nullptr; - if (label.ids) { - CHECK(!value); - 
value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); - } else { - CHECK(label.value); - value = label.value; - } - - if (dynamic_cast(value.get()) || - dynamic_cast(value.get())) { - outputG.multiBinaryLabelCrossEntropyBp(output, *value); - } else { - outputG.binaryLabelCrossEntropyBp(output, *value); - } -} - -bool HuberCost::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - CostLayer::init(layerMap, parameterMap); - if (useGpu_) { - tmpCpuInput_.reserve(inputLayers_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_.push_back(Argument()); - } - } - return true; -} - -void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) { - if (useGpu_) { - for (size_t i = 0; i < inputLayers_.size(); i++) { - tmpCpuInput_[i].resizeAndCopyFrom( - getInput(i), false, HPPL_STREAM_DEFAULT); - } - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } -} - -// -// Huber loss for robust regression. -// -REGISTER_LAYER(huber_regression, HuberRegressionLoss); - -bool HuberRegressionLoss::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - HuberCost::init(layerMap, parameterMap); - delta_ = config_.delta(); - return true; -} - -void HuberRegressionLoss::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - HuberCost::forwardImp(output, label, target); - size_t numSamples = target.getHeight(); - size_t dim = output.getWidth(); - CHECK(label.value); - CHECK_EQ((*label.value).getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(dim, (*label.value).getWidth()); - CHECK_EQ(target.getWidth(), (size_t)1); - - real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); - real* lbl = - useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData(); - std::vector cost(numSamples, 0); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - int index = i * dim + j; - real a = std::abs(lbl[index] - out[index]); - if (a <= delta_) - cost[i] += a * a / 2; - else - cost[i] += delta_ * (a - delta_ / 2); - } - } - target.copyFrom(cost.data(), numSamples); -} - -void HuberRegressionLoss::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - size_t numSamples = output.getHeight(); - size_t dim = output.getWidth(); - real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); - real* lbl = - useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData(); - real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - int index = i * dim + j; - real a = lbl[index] - out[index]; - if (std::abs(a) <= delta_) - grad[index] += -a; - else - grad[index] += a > 0 ? -delta_ : delta_; - } - } - if (useGpu_) outputG.copyFrom(grad, numSamples * dim); -} - -// -// Huber loss for robust 2-classes classification -// -REGISTER_LAYER(huber_classification, HuberTwoClassification); - -bool HuberTwoClassification::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return HuberCost::init(layerMap, parameterMap); -} - -void HuberTwoClassification::forwardImp(Matrix& output, - Argument& label, - Matrix& target) { - HuberCost::forwardImp(output, label, target); - size_t numSamples = target.getHeight(); - CHECK(label.ids); - CHECK_EQ((*label.ids).getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), (size_t)1); - CHECK_EQ(target.getWidth(), (size_t)1); - - real* out = useGpu_ ? 
tmpCpuInput_[0].value->getData() : output.getData(); - int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); - std::vector cost(numSamples, 0); - for (size_t i = 0; i < numSamples; ++i) { - int y = 2 * lbl[i] - 1; - real a = out[i] * y; - if (a < -1) - cost[i] = -4 * a; - else if (a < 1) - cost[i] = (1 - a) * (1 - a); - } - target.copyFrom(cost.data(), numSamples); -} - -void HuberTwoClassification::backwardImp(Matrix& output, - Argument& label, - Matrix& outputG) { - size_t numSamples = output.getHeight(); - real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); - int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); - real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData(); - for (size_t i = 0; i < numSamples; ++i) { - int y = 2 * lbl[i] - 1; - real a = out[i] * y; - if (a < -1) - grad[i] += -4 * y; - else if (a < 1) - grad[i] += -2 * (1 - a) * y; - } - if (useGpu_) outputG.copyFrom(grad, numSamples); -} -/** - * This cost layer compute the sum of its input as loss. - * \f[ - * o(i) = \sum_{j=1}^D y_{ij} - * \f] - */ -class SumCostLayer : public Layer { - public: - explicit SumCostLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - bool ret = Layer::init(layerMap, parameterMap); - if (!ret) return ret; - CHECK_EQ(inputLayers_.size(), 1UL); - return true; - } - - void forward(PassType passType) override { - Layer::forward(passType); - const MatrixPtr& input = getInputValue(0); - - /* malloc memory for the output_ if necessary */ - int batchSize = input->getHeight(); - int size = 1; - resizeOutput(batchSize, size); - output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0); - } - - void backward(const UpdateCallback& callback = nullptr) override { - getInputGrad(0)->add((real)1); - } -}; - -REGISTER_LAYER(sum_cost, SumCostLayer); - -} // namespace paddle diff --git a/paddle/gserver/layers/CropLayer.cpp b/paddle/gserver/layers/CropLayer.cpp deleted file mode 100644 index bc97ca2f9e0cdc86f82baa0ce3fbafde2db0c10f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CropLayer.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CropLayer.h" -#include "paddle/utils/Stat.h" -namespace paddle { - -REGISTER_LAYER(crop, CropLayer); - -bool CropLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - CHECK_LE(static_cast(inputLayers_.size()), 2); - CHECK_GE(static_cast(inputLayers_.size()), 1); - crop_axis_ = config_.axis(); - for (int i = 0; i < config_.offset_size(); i++) { - crop_offsets_.push_back(config_.offset(i)); - } - - // 1. get input_0 shape - auto& input0_img_conf = config_.inputs(0).image_conf(); - inDims_ = TensorShape({0, - input0_img_conf.channels(), - input0_img_conf.has_img_size_y() - ? 
input0_img_conf.img_size_y() - : input0_img_conf.img_size(), - input0_img_conf.img_size()}); - // 2. get target dims from config - if (config_.inputs_size() == 1) { - targetDims_ = TensorShape({config_.shape(0), - config_.shape(1), - config_.shape(2), - config_.shape(3)}); - } else { - // 2. get input_1 shape - auto& input1_img_conf = config_.inputs(1).image_conf(); - targetDims_ = TensorShape({0, - input1_img_conf.channels(), - input1_img_conf.has_img_size_y() - ? input1_img_conf.img_size_y() - : input1_img_conf.img_size(), - input1_img_conf.img_size()}); - } - - // 3. get final crop corner - int dimSize = 4; - crop_corner_ = {0, 0, 0, 0}; - for (int i = 0; i < dimSize; i++) { - if (i >= crop_axis_) { - if (crop_offsets_.size() > 1) { - crop_corner_[i] = crop_offsets_[i - crop_axis_]; - } else { - crop_corner_[i] = crop_offsets_[0]; - } - } - } - - outDims_ = TensorShape(4); - - createFunction( - forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_)); - createFunction( - backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_)); - - return true; -} - -void CropLayer::setOutDims() { - MatrixPtr input = inputLayers_[1]->getOutputValue(); - size_t batchSize = input->getHeight(); - // get target dims from input_1 - if (config_.inputs_size() == 2) { - targetDims_.setDim(0, batchSize); - int ch = config_.inputs(0).image_conf().channels(); - if (ch != 0) targetDims_.setDim(1, ch); - int h = inputLayers_[1]->getOutput().getFrameHeight(); - if (h != 0) targetDims_.setDim(2, h); - int w = inputLayers_[1]->getOutput().getFrameWidth(); - if (w != 0) targetDims_.setDim(3, w); - } - // get final crop shape from target dims and crop axis - std::vector crop_shape; - int dimSize = 4; - for (int i = 0; i < dimSize; i++) { - if (i >= crop_axis_) { - crop_shape.push_back(targetDims_[i]); - } else { - crop_shape.push_back(inDims_[i]); - } - } - - outDims_.reshape( - {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]}); - output_.setFrameHeight(crop_shape[2]); - output_.setFrameWidth(crop_shape[3]); -} - -void CropLayer::setInDims() { - MatrixPtr input = inputLayers_[0]->getOutputValue(); - size_t batchSize = input->getHeight(); - inDims_.setDim(0, batchSize); - int h = inputLayers_[0]->getOutput().getFrameHeight(); - if (h != 0) inDims_.setDim(2, h); - int w = inputLayers_[0]->getOutput().getFrameWidth(); - if (w != 0) inDims_.setDim(3, w); -} - -void CropLayer::forward(PassType passType) { - Layer::forward(passType); - setInDims(); - setOutDims(); - int size = outDims_[1] * outDims_[2] * outDims_[3]; - resetOutput(outDims_[0], size); - MatrixPtr outV = getOutputValue(); - REGISTER_TIMER_INFO("CropForward", getName().c_str()); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), inDims_); - outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); -} - -void CropLayer::backward(const UpdateCallback& callback) { - (void)callback; - REGISTER_TIMER_INFO("CropBackward", getName().c_str()); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outDims_); - outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); - backward_[0]->calc(inputs, outputs); -} -} // namespace paddle diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp deleted file mode 100644 index 644450291ee8a308accf7a1fe096332cc8c241dc..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CrossChannelNormLayer.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "NormLayer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data, - size_t iter, - size_t spatialDim) { - return Matrix::create(data->getData() + iter * channels_ * spatialDim, - channels_, - spatialDim, - false, - useGpu_); -} - -MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, - size_t iter, - size_t spatialDim) { - return Matrix::create( - data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); -} - -bool CrossChannelNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - CHECK(parameters_[0]); - const NormConfig& conf = config_.inputs(0).norm_conf(); - channels_ = conf.channels(); - scale_.reset(new Weight(channels_, 1, parameters_[0])); - return true; -} - -void CrossChannelNormLayer::forward(PassType passType) { - Layer::forward(passType); - MatrixPtr inV = getInputValue(0); - - size_t batchSize = inV->getHeight(); - size_t dataDim = inV->getWidth(); - CHECK_EQ(getSize(), dataDim); - - reserveOutput(batchSize, dataDim); - MatrixPtr outV = getOutputValue(); - size_t spatialDim = dataDim / channels_; - - Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); - Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); - Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); - - inV->square2(*dataBuffer_); - for (size_t i = 0; i < batchSize; i++) { - const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); - const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); - MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim); - MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); - - // compute norm. - spatialBuffer_->sumCols(*dataTmp, 1, 0); - // add eps to avoid overflow - spatialBuffer_->add(1e-6); - spatialBuffer_->sqrt2(*spatialBuffer_); - normTmp->copyFrom(*spatialBuffer_); - outVTmp->copyFrom(*inVTmp); - outVTmp->divRowVector(*spatialBuffer_); - // scale the layer. 
- outVTmp->mulColVector(*scale_->getW()); - } -} - -void CrossChannelNormLayer::backward(const UpdateCallback& callback) { - MatrixPtr inG = getInputGrad(0); - MatrixPtr inV = getInputValue(0); - MatrixPtr outG = getOutputGrad(); - MatrixPtr outV = getOutputValue(); - - size_t batchSize = inG->getHeight(); - size_t dataDim = inG->getWidth(); - size_t spatialDim = dataDim / channels_; - - MatrixPtr inGBuffer; - Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_); - - dataBuffer_->dotMul(*outG, *outV); - Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_); - Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_); - Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_); - scaleDiff_->zeroMem(); - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim); - const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); - const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); - const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim); - const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); - - channelBuffer_->sumRows(*dataTmp, 1, 0); - channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW())); - // store a / scale[i] in scaleDiff_ temporary - scaleDiff_->add(*channelBuffer_, 1.); - - sampleBuffer_->dotMul(*inVTmp, *outGTmp); - spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.); - // scale the grad - inGBuffer->copyFrom(*inVTmp); - inGBuffer->mulRowVector(*spatialBuffer_); - // divide by square of norm - spatialBuffer_->dotMul(*normTmp, *normTmp); - inGBuffer->divRowVector(*spatialBuffer_); - // subtract - inGBuffer->add(*outGTmp, -1, 1); - // divide by norm - inGBuffer->divRowVector(*normTmp); - // scale the diff - inGBuffer->mulColVector(*scale_->getW()); - - inGTmp->add(*inGBuffer); - } - // updata scale - if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_); - scale_->getParameterPtr()->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp deleted file mode 100644 index 9a29e6a55e95334def2b83dc4a794e07a7fd5154..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "CudnnBatchNormLayer.h" -#include "Layer.h" -#include "paddle/cuda/include/hl_batch_norm.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer); - -bool CudnnBatchNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false; - CHECK(useGpu_) << "CudnnBatchNorm only support GPU"; - - hl_create_tensor_descriptor(&ioDesc_); - hl_create_tensor_descriptor(&bnParamDesc_); - hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1); - - return true; -} - -void CudnnBatchNormLayer::reshape(int batchSize) { - hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_); -} - -void CudnnBatchNormLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInputValue(0)->getHeight(); - calFeatureMapSize(); - reshape(batchSize); - resetOutput(batchSize, getInputValue(0)->getWidth()); - - // for testing in training peroid. - useGlobalStats_ = (passType == PASS_TEST); - if (passType == PASS_TEST && config_.has_use_global_stats()) { - useGlobalStats_ = config_.use_global_stats(); - } - - real* input = getInputValue(0)->getData(); - real* output = getOutputValue()->getData(); - real* gamma = weight_->getW()->getData(); - real* beta = biases_->getW()->getData(); - real* movingMean = movingMean_->getW()->getData(); - real* movingVar = movingVar_->getW()->getData(); - - // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. - eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); - - if (!useGlobalStats_) { - REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); - real* savedMean = savedMean_->getData(); - real* savedInvVar = savedInvVar_->getData(); - hl_batch_norm_forward_training(ioDesc_, - input, - ioDesc_, - output, - bnParamDesc_, - gamma, - beta, - 1.0 - movingAvgFraction_, - movingMean, - movingVar, - eps_, - savedMean, - savedInvVar); - } else { - // used movingMean and movingVar in testing - if (batchSize <= 1024) { - hl_batch_norm_forward_inference(ioDesc_, - input, - ioDesc_, - output, - bnParamDesc_, - gamma, - beta, - movingMean, - movingVar, - eps_); - } else { - // There is a limitation in cudnn library. - // When the batch size is larger than 1024 in cuDNN v5.1, - // the cudnnBatchNormalizationForwardInference will fail. - hl_batch_norm_cuda_inference(input, - output, - gamma, - beta, - movingMean, - movingVar, - eps_, - batchSize, - channels_, - imageH_ * imageD_, - imageW_); - } - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - real* input = getInputValue(0)->getData(); - real* outGrad = getOutputGrad()->getData(); - real* inGrad = getInputGrad(0)->getData(); - real* gamma = weight_->getW()->getData(); - real* savedMean = savedMean_->getData(); - real* savedInvVar = savedInvVar_->getData(); - - // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. 
- eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); - - auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) { - Matrix::resizeOrCreate(m, h, w, false, true); - m->zeroMem(); - *p = m->getData(); - }; - - real* gammaGrad = nullptr; - real* betaGrad = nullptr; - if (weight_->getWGrad()) { - gammaGrad = weight_->getWGrad()->getData(); - } else { - create(tmpWGrad_, 1, channels_, &gammaGrad); - } - if (biases_ && biases_->getWGrad()) { - betaGrad = biases_->getWGrad()->getData(); - } else { - create(tmpBiasGrad_, 1, channels_, &betaGrad); - } - - hl_batch_norm_backward(ioDesc_, - input, - ioDesc_, - outGrad, - ioDesc_, - inGrad, - bnParamDesc_, - gamma, - gammaGrad, - betaGrad, - eps_, - savedMean, - savedInvVar); - - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - biases_->getParameterPtr()->incUpdate(callback); - weight_->getParameterPtr()->incUpdate(callback); - } -} - -CudnnBatchNormLayer::~CudnnBatchNormLayer() { - hl_destroy_tensor_descriptor(ioDesc_); - hl_destroy_tensor_descriptor(bnParamDesc_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h deleted file mode 100644 index 1bb4eff8d2372660caa4ec4a4a20a27f365bebd0..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnBatchNormLayer.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "BatchNormBaseLayer.h" -#include "Layer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment. - * @note Cudnn version must >= v4.0, and better to use the latest version - * (v5.1). - * - * The config file api is batch_norm_layer. - */ - -class CudnnBatchNormLayer : public BatchNormBaseLayer { - public: - explicit CudnnBatchNormLayer(const LayerConfig& config) - : BatchNormBaseLayer(config) {} - - ~CudnnBatchNormLayer(); - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - /** - * reshape tensor of ioDesc_. - */ - void reshape(int batchSize); - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - protected: - /// Epsilon value used in the batch normalization formula. - /// Same epsilon value should be used in forward and backward functions. - double eps_; - - /// Input/output tensor descriptor desc - hl_tensor_descriptor ioDesc_; - /// Shared tensor descriptor desc for the 6 tenros: - /// bnScale, bnBias, running mean/var, save_mean/var - hl_tensor_descriptor bnParamDesc_; - - /** - * @brief The gradient of weight and bias in cudnn api can not be empty. - * If set is_static for weight or bias, it will not allocate memory for them, - * and the gradient is NULL. In this case, will use two matrix. 
- */ - MatrixPtr tmpWGrad_, tmpBiasGrad_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp deleted file mode 100644 index 6d0a40a60710603900a9b89980d38b2d7638ad60..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CudnnConvBaseLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { -REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer); -REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer); - -bool CudnnConvBaseLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; - CHECK(useGpu_) << "CudnnConvLayer only support gpu"; - - CHECK_EQ(inputLayers_.size(), parameters_.size()); - projections_.reserve(inputLayers_.size()); - projConf_.reserve(inputLayers_.size()); - - numFilters_ = config_.num_filters(); - CHECK(config_.shared_biases()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - ProjectionConfig *conf = new ProjectionConfig(); - if (isDeconv_) { - conf->set_type("convt"); - } else { - conf->set_type("conv"); - } - conf->set_num_filters(numFilters_); - ConvConfig *convConf = conf->mutable_conv_conf(); - *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); - conf->set_input_size(getPrev(i)->getSize()); - conf->set_output_size(getSize()); - projConf_.emplace_back(conf); - projections_.emplace_back( - Projection::create(*projConf_[i], parameters_[i], useGpu_)); - - // create a new weight - size_t height, width; - height = filterPixels_[i] * filterChannels_[i]; - width = (!isDeconv_) ? 
numFilters_ : channels_[i]; - CHECK_EQ(parameters_[i]->getSize(), width * height); - Weight *w = new Weight(height, width, parameters_[i]); - weights_.emplace_back(w); - } - - if (biasParameter_.get()) { - if (sharedBiases_) { - CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); - biases_ = - std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); - } else { - biases_ = - std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); - } - } - if (biases_.get() && sharedBiases_) { - hl_create_tensor_descriptor(&biasDesc_); - hl_create_tensor_descriptor(&outputDesc_); - hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1); - } - - return true; -} - -void CudnnConvBaseLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInput(0).getBatchSize(); - resetOutput(batchSize, calOutputSize()); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - projections_[i]->forward(&getInput(i), &getOutput(), passType); - } - - if (biases_) { - REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - int outH = outputH_[0]; - int outW = outputW_[0]; - - hl_tensor_reshape(outputDesc_, - batchSize, - numFilters_, - outH, - outW, - numFilters_ * outH * outW, - outH * outW, - outW, - 1); - real *outData = getOutputValue()->getData(); - real *biasData = biases_->getW()->getData(); - hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData); - } - - forwardActivation(); -} - -void CudnnConvBaseLayer::backward(const UpdateCallback &callback) { - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); - real *biasGrad = biases_->getWGrad()->getData(); - real *outGrad = getOutputGrad()->getData(); - hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); - - biases_->getParameterPtr()->incUpdate(callback); - } - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - projections_[i]->backward(callback); - } -} - -CudnnConvBaseLayer::~CudnnConvBaseLayer() { - if (biases_) { - hl_destroy_tensor_descriptor(biasDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h deleted file mode 100644 index 1ee1aa100d8adaed04ce24ee12b5b9af52c14b13..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnConvBaseLayer.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "ConvBaseLayer.h" -#include "Projection.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A 2-dimension conv layer implemented by cuDNN. It only - * supports GPU mode. We automatic select CudnnConvLayer for GPU - * mode and ExpandConvLayer for CPU mode if you set type of "conv". - * User also can specfiy type of "exconv" or "cudnn_conv" for - * particular type. 
- * - * The config file api is img_conv_layer. - */ -class CudnnConvBaseLayer : public ConvBaseLayer { - protected: - std::vector> projConf_; - std::vector> projections_; - - hl_tensor_descriptor biasDesc_; - hl_tensor_descriptor outputDesc_; - - public: - explicit CudnnConvBaseLayer(const LayerConfig& config) - : ConvBaseLayer(config) {} - - ~CudnnConvBaseLayer(); - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp deleted file mode 100644 index ac6d2168f43590a6acd70f6641ff729327894ea0..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/CudnnPoolLayer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "CudnnPoolLayer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -bool CudnnPoolLayer::typeCheck(const std::string &poolType, - hl_pooling_mode_t *mode) { - if (poolType == "cudnn-max-pool") { - if (mode) { - *mode = HL_POOLING_MAX; - } - } else if (poolType == "cudnn-avg-pool") { - if (mode) { - *mode = HL_POOLING_AVERAGE; - } - } else if (poolType == "cudnn-avg-incl-pad-pool") { - if (mode) { - *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING; - } - } else { - return false; - } - - return true; -} - -CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) { - const std::string &pool_type = config.inputs(0).pool_conf().pool_type(); - CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true); -} - -bool CudnnPoolLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - PoolLayer::init(layerMap, parameterMap); - - CHECK(useGpu_) << "CudnnPoolLayer only support gpu"; - - hl_create_tensor_descriptor(&inputDesc_); - hl_create_tensor_descriptor(&outputDesc_); - - windowHeight = sizeY_; - windowWidth = sizeX_; - heightPadding = confPaddingY_; - widthPadding = confPadding_; - strideHeight = strideY_; - strideWidth = stride_; - - hl_create_pooling_descriptor(&poolingDesc_, - mode_, - windowHeight, - windowWidth, - heightPadding, - widthPadding, - strideHeight, - strideWidth); - - return true; -} - -void CudnnPoolLayer::reshape(int batchSize) { - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imageH_ == 0) { - imageH_ = imgSizeY_; - } - if (imageW_ == 0) { - imageW_ = imgSize_; - } - CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(), - channels_ * imageH_ * imageW_); - outputH_ = outputSize(imageH_, - sizeY_, - confPaddingY_, - strideY_, - /* caffeMode */ false); - outputW_ = - outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false); - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - - 
hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_); - hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_); -} - -void CudnnPoolLayer::forward(PassType passType) { - Layer::forward(passType); - - CHECK(inputLayers_[0]->getOutputValue()->useGpu()); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - reshape(batchSize); - resetOutput(batchSize, outputH_ * outputW_ * channels_); - - real *inputData = getInputValue(0)->getData(); - real *outData = getOutputValue()->getData(); - hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_); -} - -void CudnnPoolLayer::backward(const UpdateCallback &callback) { - (void)callback; - if (NULL == getInputGrad(0)) { - return; - } - - real *inputData = getInputValue(0)->getData(); - real *inputGrad = getInputGrad(0)->getData(); - real *outData = getOutputValue()->getData(); - real *outGrad = getOutputGrad()->getData(); - hl_pooling_backward(inputDesc_, - inputData, - inputGrad, - outputDesc_, - outData, - outGrad, - poolingDesc_); -} - -CudnnPoolLayer::~CudnnPoolLayer() { - hl_destroy_tensor_descriptor(inputDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_pooling_descriptor(poolingDesc_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/DataNormLayer.cpp b/paddle/gserver/layers/DataNormLayer.cpp deleted file mode 100644 index 86da4d6f957e2ce0afc53d69f9d57c234f8f178f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DataNormLayer.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
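A minimal sketch of the output-size arithmetic that CudnnPoolLayer::reshape above relies on. The rounding conventions (caffeMode=false rounds up, caffeMode=true rounds down) are an assumption about paddle's outputSize helper in paddle/math/MathUtils, not a verbatim copy of it.

#include <iostream>

// Pooling output size along one spatial axis.
int poolOutputSize(int imageSize, int windowSize, int padding, int stride,
                   bool caffeMode) {
  if (caffeMode) {
    return (imageSize - windowSize + 2 * padding) / stride + 1;            // floor
  }
  return (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1;  // ceil
}

int main() {
  // A 6-wide feature map, 3-wide window, stride 2, no padding.
  std::cout << poolOutputSize(6, 3, 0, 2, /*caffeMode=*/true) << "\n";   // 2
  std::cout << poolOutputSize(6, 3, 0, 2, /*caffeMode=*/false) << "\n";  // 3 (ceil mode)
  return 0;
}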
*/ - -#include "DataNormLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(data_norm, DataNormLayer); - -bool DataNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize the weight */ - CHECK(!biasParameter_) << "DataNormLayer does not need bias"; - CHECK(inputLayers_.size() == 1 && inputLayers_[0]->getType() == "data") - << "DataNormLayer accepts one and only one DataLayer as its input layer"; - CHECK_EQ(inputLayers_.size(), parameters_.size()); - CHECK_EQ(inputLayers_[0]->getSize(), getSize()); - CHECK_EQ(parameters_[0]->getSize(), 5 * getSize()); - CHECK(parameters_[0]->isStatic()) - << "The parameter of DataNormLayer must be static"; - - weight_ = std::unique_ptr(new Weight(5, getSize(), parameters_[0])); - min_ = Matrix::create( - nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - rangeReciprocal_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - mean_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - stdReciprocal_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - decimalReciprocal_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - min_->setData(weight_->getW()->getData()); - rangeReciprocal_->setData(weight_->getW()->getData() + getSize()); - mean_->setData(weight_->getW()->getData() + 2 * getSize()); - stdReciprocal_->setData(weight_->getW()->getData() + 3 * getSize()); - decimalReciprocal_->setData(weight_->getW()->getData() + 4 * getSize()); - - /* normalization strategy */ - if (config_.data_norm_strategy() == "z-score") { - mode_ = kZScore; - } else if (config_.data_norm_strategy() == "min-max") { - mode_ = kMinMax; - } else if (config_.data_norm_strategy() == "decimal-scaling") { - mode_ = kDecimalScaling; - } else { - LOG(FATAL) << "Unknown data normalization strategy: " - << config_.data_norm_strategy(); - } - - return true; -} - -void DataNormLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - reserveOutput(batchSize, size); - - const MatrixPtr inValue = getInputValue(0); - MatrixPtr outValue = getOutputValue(); - outValue->copyFrom(*inValue); - switch (mode_) { - case kZScore: { - outValue->addBias(*mean_, -1.0); - outValue->colScale(0, *outValue, *stdReciprocal_); - break; - } - case kMinMax: { - outValue->addBias(*min_, -1.0); - outValue->colScale(0, *outValue, *rangeReciprocal_); - break; - } - case kDecimalScaling: { - outValue->colScale(0, *outValue, *decimalReciprocal_); - break; - } - default: - LOG(FATAL) << "should not reach here"; - } -} - -void DataNormLayer::backward(const UpdateCallback& callback) { - // The parameter for DataNormLayer is static, and does not need to be updated - (void)callback; - - /* Calculate the input layers error */ - const MatrixPtr& outGrad = getOutputGrad(); - MatrixPtr inGrad = getInputGrad(0); - if (inGrad) { - switch (mode_) { - case kZScore: { - inGrad->addColScale(0, *outGrad, *stdReciprocal_); - break; - } - case kMinMax: { - inGrad->addColScale(0, *outGrad, *rangeReciprocal_); - break; - } - case kDecimalScaling: { - inGrad->addColScale(0, *outGrad, *decimalReciprocal_); - break; - } - default: { LOG(FATAL) << "should 
not reach here"; } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h deleted file mode 100644 index 7ae67a877b488c8d197896b8b1e3e90057fbe1c9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DataNormLayer.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief A layer for data normalization - * - Input: One and only one input layer is accepted. The input layer must - * be DataLayer with dense data type. - * - Output: The normalization of the input data - * - * Reference: - * LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine - * - * Three data normalization methoeds are considered - * - z-score: y = (x-mean)/std - * - min-max: y = (x-min)/(max-min) - * - decimal-scaling: y = x/10^j, where j is the smallest integer such that - *max(|y|)<1 - */ - -class DataNormLayer : public Layer { - public: - enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 }; - - explicit DataNormLayer(const LayerConfig& config) : Layer(config) {} - - ~DataNormLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - protected: - int mode_; - std::unique_ptr weight_; - MatrixPtr min_; - MatrixPtr rangeReciprocal_; // 1/(max-min) - MatrixPtr mean_; - MatrixPtr stdReciprocal_; // 1/std - MatrixPtr decimalReciprocal_; // 1/10^j -}; -} // namespace paddle diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp deleted file mode 100644 index db6d6e073c08c35c5a71b2b18ab0103d42ccd318..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DeConv3DLayer.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
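A minimal sketch of the three per-feature strategies listed in the DataNormLayer comment above (z-score, min-max, decimal-scaling), written against plain std::vector instead of the Matrix/Weight API; each input vector stands for one feature column over a batch, and the statistics correspond to the five static parameter rows the layer reads. Function names here are illustrative.

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// y = (x - mean) / std
std::vector<double> zScore(const std::vector<double>& x, double mean, double stddev) {
  std::vector<double> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) y[i] = (x[i] - mean) / stddev;
  return y;
}

// y = (x - min) / (max - min)
std::vector<double> minMax(const std::vector<double>& x, double mn, double mx) {
  std::vector<double> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) y[i] = (x[i] - mn) / (mx - mn);
  return y;
}

// y = x / 10^j, where j is the smallest integer such that max(|y|) < 1.
std::vector<double> decimalScaling(const std::vector<double>& x, int j) {
  std::vector<double> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) y[i] = x[i] / std::pow(10.0, j);
  return y;
}

int main() {
  std::vector<double> x = {120.0, 80.0, 100.0};
  for (double v : zScore(x, 100.0, 20.0)) std::cout << v << " ";  // 1 -1 0
  std::cout << "\n";
  for (double v : minMax(x, 80.0, 120.0)) std::cout << v << " ";  // 1 0 0.5
  std::cout << "\n";
  for (double v : decimalScaling(x, 3)) std::cout << v << " ";    // 0.12 0.08 0.1
  std::cout << "\n";
  return 0;
}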
*/ - -#include "DeConv3DLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(deconv3d, DeConv3DLayer); - -bool DeConv3DLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; - // for Deconv, the dimension of Kernel is - // channel * output * depth * height * weigth - // Matrix storage format: (output * depth * height * weigth) x channel - for (int index = 0; index < config_.inputs().size(); ++index) { - M_.push_back(filterChannels_[index]); - K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index])); - - // create a new weight - size_t height, width; - height = filterPixels_[index] * numFilters_; - width = filterChannels_[index]; - CHECK_EQ(parameters_[index]->getSize(), width * height); - Weight *w = new Weight(height, width, parameters_[index]); - weights_.emplace_back(w); - } - if (biasParameter_.get()) { - if (sharedBiases_) { - CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); - biases_ = - std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); - } else { - biases_ = - std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); - } - } - return true; -} - -size_t DeConv3DLayer::getSize() { - CHECK_NE(inputLayers_.size(), 0UL); - imgSizeW_.clear(); - imgSizeH_.clear(); - imgSizeD_.clear(); - N_.clear(); - NOut_.clear(); - size_t layerSize = 0; - for (size_t i = 0; i < inputLayers_.size(); ++i) { - imgSizeW_.push_back( - imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true)); - imgSizeH_.push_back(imageSize( - outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true)); - imgSizeD_.push_back(imageSize( - outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true)); - NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]); - N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]); - CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize); - layerSize += NOut_[i] * numFilters_; - } - getOutput().setFrameHeight(imgSizeH_[0]); - getOutput().setFrameWidth(imgSizeW_[0]); - getOutput().setFrameDepth(imgSizeD_[0]); - return layerSize; -} - -void DeConv3DLayer::forward(PassType passType) { - Layer::forward(passType); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - int outWidth = getSize(); - resetOutput(batchSize, outWidth); - const MatrixPtr outMat = getOutputValue(); - - REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str()); - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const MatrixPtr &inMat = getInputValue(i); - int M = M_[i]; - int N = N_[i]; - int K = K_[i]; - MatrixPtr wMat = weights_[i]->getW(); - Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); - for (int n = 0; n < batchSize; ++n) { - real *inData = inMat->getData() + n * inMat->getStride(); - for (int g = 0; g < groups_[i]; ++g) { - MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_); - MatrixPtr wMatSub = wMat->subMatrix(g * K, K); - MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K); - colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0); - inData += M * N; - } - colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(), - numFilters_, - imgSizeD_[i], - imgSizeH_[i], - imgSizeW_[i], - filterSizeZ_[i], - filterSizeY_[i], - filterSize_[i], - strideZ_[i], - strideY_[i], - stride_[i], - paddingZ_[i], - paddingY_[i], - padding_[i], - 1.0, - 1.0); - } - } - if (nullptr != this->biasParameter_) { - this->addBias(); - } - forwardActivation(); -} - -void 
DeConv3DLayer::backward(const UpdateCallback &callback) { - backwardActivation(); - int batchSize = getOutputGrad()->getHeight(); - if (biases_ && biases_->getWGrad()) { - bpropBiases(); - biases_->getParameterPtr()->incUpdate(callback); - } - REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str()); - for (size_t i = 0; i < inputLayers_.size(); ++i) { - if (weights_[i]->getWGrad() || this->needGradient_) { - int M = M_[i]; - int N = N_[i]; - int K = K_[i]; - Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); - const MatrixPtr &inMat = getInputValue(i); - for (int n = 0; n < batchSize; ++n) { - colBuf_->vol2Col( - getOutputGrad()->getData() + n * getOutputGrad()->getStride(), - numFilters_, - imgSizeD_[i], - imgSizeH_[i], - imgSizeW_[i], - filterSizeZ_[i], - filterSizeY_[i], - filterSize_[i], - strideZ_[i], - strideY_[i], - stride_[i], - paddingZ_[i], - paddingY_[i], - padding_[i]); - if (weights_[i]->getWGrad()) { - real *inData = inMat->getData() + n * inMat->getStride(); - for (int g = 0; g < groups_[i]; ++g) { - MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K); - MatrixPtr wGradMatSub = - weights_[i]->getWGrad()->subMatrix(g * K, K); - MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_); - wGradMatSub->mul( - *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0); - inData += M * N; - } - } - if (getInputGrad(i)) { - real *preGrad = - getInputGrad(i)->getData() + n * getInputGrad(i)->getStride(); - for (int g = 0; g < groups_[i]; ++g) { - MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K); - MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K); - MatrixPtr inGradMatSub = - Matrix::create(preGrad, M, N, false, useGpu_); - inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0); - preGrad += M * N; - } - } - } - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} -void DeConv3DLayer::bpropWeights(int i) {} -void DeConv3DLayer::bpropData(int i) {} - -void DeConv3DLayer::bpropBiases() { - MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), - 1, - biases_->getWGrad()->getElementCnt(), - false, - useGpu_); - const MatrixPtr &outGradMat = getOutputGrad(); - - if (this->sharedBiases_) { - biases->collectSharedBias(*outGradMat, 1.0f); - } else { - biases->collectBias(*outGradMat, 1.0f); - } -} - -void DeConv3DLayer::addBias() { - MatrixPtr outMat = getOutputValue(); - MatrixPtr bias = Matrix::create(biases_->getW()->getData(), - 1, - biases_->getW()->getElementCnt(), - false, - useGpu_); - if (this->sharedBiases_) { - outMat->addSharedBias(*(bias), 1.0f); - } else { - outMat->addBias(*(bias), 1.0f); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h deleted file mode 100644 index 13d1d07cf5cc6e2a6ea89768e29b1fe8cda5e81c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DeConv3DLayer.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
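A sketch of the transposed-convolution size arithmetic behind DeConv3DLayer::getSize above: the "image" size is the inverse of the usual convolution output formula. The exact rounding of paddle's imageSize helper (called with caffeMode=true) is assumed here rather than copied.

#include <iostream>

// One spatial axis of the deconvolution: how wide the expanded image is.
int deconvImageSize(int outputSize, int filterSize, int padding, int stride) {
  return (outputSize - 1) * stride + filterSize - 2 * padding;
}

int main() {
  // 4 columns, 3-wide filter, stride 2, padding 1 expand back to 7 columns.
  std::cout << deconvImageSize(4, 3, 1, 2) << "\n";  // 7
  // Sanity check in the other direction: a 7-wide image convolved with the
  // same filter/stride/padding yields (7 - 3 + 2*1) / 2 + 1 = 4 columns.
  return 0;
}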
*/ - -#pragma once - -#include -#include "ConvBaseLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of deconvolution3D layer. - * This layer expands input and use matrix multiplication to - * calculate deconvolution3D operation. - */ -class DeConv3DLayer : public ConvBaseLayer { - public: - explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {} - ~DeConv3DLayer() {} - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - void forward(PassType passType); - void addBias(); - void backward(const UpdateCallback& callback); - void bpropBiases(); - void bpropData(int i); - void bpropWeights(int i); - size_t getSize(); - - protected: - // Figure out the dimensions for individual gemms. - IntV M_; /// numFilters_ / filter_group_; - IntV N_; /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_ - IntV K_; /// outputD_ * outputH_ * outputW_ - IntV NOut_; - MatrixPtr colBuf_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h deleted file mode 100644 index d6502fcf8fb12a434632876c25ac3ca23b87e60e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DetectionUtil.h +++ /dev/null @@ -1,307 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include "paddle/math/Matrix.h" - -using std::vector; -using std::pair; -using std::map; - -namespace paddle { - -template -struct BBoxBase { - BBoxBase(T xMin, T yMin, T xMax, T yMax) - : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {} - - BBoxBase() {} - - T getWidth() const { return xMax - xMin; } - - T getHeight() const { return yMax - yMin; } - - T getCenterX() const { return (xMin + xMax) / 2; } - - T getCenterY() const { return (yMin + yMax) / 2; } - - T getArea() const { return getWidth() * getHeight(); } - - // coordinate of bounding box - T xMin; - T yMin; - T xMax; - T yMax; - // whether difficult object (e.g. object with heavy occlusion is difficult) - bool isDifficult; -}; - -struct NormalizedBBox : BBoxBase { - NormalizedBBox() : BBoxBase() {} -}; - -enum PermMode { kNCHWToNHWC, kNHWCToNCHW }; - -/** - * @brief First permute input maxtrix then append to output matrix - */ -size_t appendWithPermute(const Matrix& inMatrix, - size_t height, - size_t width, - size_t outTotalSize, - size_t outOffset, - size_t batchSize, - Matrix& outMatrix, - PermMode permMode); - -/** - * @brief First permute input maxtrix then decompose to output - */ -size_t decomposeWithPermute(const Matrix& inMatrix, - size_t height, - size_t width, - size_t totalSize, - size_t offset, - size_t batchSize, - Matrix& outMatrix, - PermMode permMode); - -/** - * @brief Compute jaccard overlap between two bboxes. 
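A small stand-alone version of the jaccard (IoU) overlap just declared for NormalizedBBox; the Box struct below is a simplified stand-in for BBoxBase, with boxes given as (xMin, yMin, xMax, yMax).

#include <algorithm>
#include <iostream>

struct Box { double xMin, yMin, xMax, yMax; };

// intersection area / union area, clamped to 0 when the boxes do not overlap.
double jaccardOverlap(const Box& a, const Box& b) {
  const double ix = std::max(0.0, std::min(a.xMax, b.xMax) - std::max(a.xMin, b.xMin));
  const double iy = std::max(0.0, std::min(a.yMax, b.yMax) - std::max(a.yMin, b.yMin));
  const double inter = ix * iy;
  const double areaA = (a.xMax - a.xMin) * (a.yMax - a.yMin);
  const double areaB = (b.xMax - b.xMin) * (b.yMax - b.yMin);
  const double uni = areaA + areaB - inter;
  return uni > 0.0 ? inter / uni : 0.0;
}

int main() {
  Box a{0.0, 0.0, 0.5, 0.5};
  Box b{0.25, 0.25, 0.75, 0.75};
  std::cout << jaccardOverlap(a, b) << "\n";  // 0.0625 / 0.4375 ~= 0.1429
  return 0;
}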
- * @param bbox1 The first bbox - * @param bbox2 The second bbox - */ -real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); - -/** - * @brief Compute offset parameters between prior bbox and ground truth bbox - * and variances of prior bbox are considered - * @param priorBBox Input prior bbox - * @param priorBBoxVar Variance parameters of prior bbox - * @param gtBBox Groundtruth bbox - * @param outVec Output vector - */ -void encodeBBoxWithVar(const NormalizedBBox& priorBBox, - const vector& priorBBoxVar, - const NormalizedBBox& gtBBox, - vector& outVec); - -/** - * @brief Decode prior bbox with offset parameters - * and variances of prior bbox are considered - * @param priorBBox Prior bbox to be decoded - * @param priorBBoxVar Variance parameters of prior bbox - * @param locPredData Offset parameters - */ -NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox, - const vector& priorBBoxVar, - const vector& locPredData); - -/** - * @brief Extract bboxes from prior matrix, the layout is - * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... - * @param priorData Matrix of prior value - * @param numBBoxes Number of bbox to be extracted - * @param bboxVec Append to the vector - */ -void getBBoxFromPriorData(const real* priorData, - const size_t numBBoxes, - vector& bboxVec); - -/** - * @brief Extract labels, scores and bboxes from detection matrix, the layout is - * imageId | label | score | xmin | ymin | xmax | ymax - * @param detectData Matrix of detection value - * @param numBBoxes Number of bbox to be extracted - * @param labelVec Label of bbox - * @param scoreVec Score of bbox - * @param bboxVec Append to the vector - */ -void getBBoxFromDetectData(const real* detectData, - const size_t numBBoxes, - vector& labelVec, - vector& scoreVec, - vector& bboxVec); - -/** - * @brief Extract variances from prior matrix, the layout is - * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... - * @param priorData Matrix of prior value - * @param num Number to be extracted - * @param varVec Append to the vector - */ -void getBBoxVarFromPriorData(const real* priorData, - const size_t num, - vector>& varVec); - -/** - * @brief Extract bboxes from label matrix, the layout is - * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ... - * @param labelData Matrix of label value - * @param numBBoxes Number to be extracted - * @param bboxVec Append to the vector - */ -void getBBoxFromLabelData(const real* labelData, - const size_t numBBoxes, - vector& bboxVec); - -/** -* @brief Match prior bbox to groundtruth bbox, the strategy is: -1. Find the most overlaped bbox pair (prior and groundtruth) -2. 
For rest of prior bboxes find the most overlaped groundtruth bbox -* @param priorBBoxes prior bbox -* @param gtBBoxes groundtruth bbox -* @param overlapThreshold Low boundary of overlap (judge whether matched) -* @param matchIndices For each prior bbox, groundtruth bbox index if matched -otherwise -1 -* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes -*/ -void matchBBox(const vector& priorBBoxes, - const vector& gtBBoxes, - real overlapThreshold, - vector* matchIndices, - vector* matchOverlaps); - -/** -* @brief Generate positive bboxes and negative bboxes, -|positive bboxes|/|negative bboxes| is negPosRatio -* @param priorValue Prior value -* @param numPriorBBoxes Number of prior bbox -* @param gtValue Groundtruth value -* @param gtStartPosPtr Since groundtruth value stored as sequence type, -this parameter indicates start position of each record -* @param seqNum Number of sequence -* @param maxConfScore Classification score for prior bbox, used to mine -negative examples -* @param batchSize Image number -* @param overlapThreshold Low boundary of overap -* @param negOverlapThreshold Upper boundary of overap (judge negative example) -* @param negPosRatio Control number of negative bboxes -* @param matchIndicesVecPtr Save indices of matched prior bbox -* @param negIndicesVecPtr Save indices of negative prior bbox -*/ -pair generateMatchIndices( - const Matrix& priorValue, - const size_t numPriorBBoxes, - const Matrix& gtValue, - const int* gtStartPosPtr, - const size_t seqNum, - const vector>& maxConfScore, - const size_t batchSize, - const real overlapThreshold, - const real negOverlapThreshold, - const size_t negPosRatio, - vector>* matchIndicesVecPtr, - vector>* negIndicesVecPtr); - -/** - * @brief Get max confidence score for each prior bbox - * @param confData Confidence scores, layout is - * class1 score | class2 score | ... | classN score ... 
- * @param batchSize Image number - * @param numPriorBBoxes Prior bbox number - * @param numClasses Classes number - * @param backgroundId Background id - * @param maxConfScoreVecPtr Ouput - */ -void getMaxConfidenceScores(const real* confData, - const size_t batchSize, - const size_t numPriorBBoxes, - const size_t numClasses, - const size_t backgroundId, - vector>* maxConfScoreVecPtr); - -template -bool sortScorePairDescend(const pair& pair1, - const pair& pair2); - -template <> -bool sortScorePairDescend(const pair& pair1, - const pair& pair2); - -/** - * @brief Do NMS for bboxes to remove duplicated bboxes - * @param bboxes BBoxes to apply NMS - * @param confScoreData Confidence scores - * @param classIdx Class to do NMS - * @param topK Number to keep - * @param confThreshold Low boundary of confidence score - * @param nmsThreshold Threshold of overlap - * @param numPriorBBoxes Total number of prior bboxes - * @param numClasses Total class number - * @param indices Indices of high quality bboxes - */ -void applyNMSFast(const vector& bboxes, - const real* confScoreData, - size_t classIdx, - size_t topK, - real confThreshold, - real nmsThreshold, - size_t numPriorBBoxes, - size_t numClasses, - vector* indices); - -/** - * @brief Get detection results which satify requirements - * @param numPriorBBoxes Prior bbox number - * @param numClasses Class number - * @param backgroundId Background class - * @param batchSize Image number - * @param confThreshold Threshold of class confidence - * @param nmsTopK Used in NMS operation to keep top k bbox - * @param nmsThreshold Used in NMS, threshold of overlap - * @param keepTopK How many bboxes keeped in an image - * @param allDecodedBBoxes Decoded bboxes for all images - * @param allDetectionIndices Save detection bbox indices - */ -size_t getDetectionIndices( - const real* confData, - const size_t numPriorBBoxes, - const size_t numClasses, - const size_t backgroundId, - const size_t batchSize, - const real confThreshold, - const size_t nmsTopK, - const real nmsThreshold, - const size_t keepTopK, - const vector>& allDecodedBBoxes, - vector>>* allDetectionIndices); - -/** - * @brief Get detection results - * @param confData Confidence scores - * @param numPriorBBoxes Prior bbox number - * @param numClasses Class number - * @param batchSize Image number - * @param allIndices Indices of predicted bboxes - * @param allDecodedBBoxes BBoxes decoded - * @param out Output matrix - * image number | label | confidence score | xMin | yMin | xMax | yMax - */ -void getDetectionOutput(const real* confData, - const size_t numKept, - const size_t numPriorBBoxes, - const size_t numClasses, - const size_t batchSize, - const vector>>& allIndices, - const vector>& allDecodedBBoxes, - Matrix& out); - -NormalizedBBox clipBBox(const NormalizedBBox& bbox); - -} // namespace paddle diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp deleted file mode 100644 index 72b0c707b2131dc275ba604cd20ae0007c34a9a9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/DotProdLayer.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
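A minimal greedy sketch in the spirit of applyNMSFast declared above: sort by score, keep at most topK boxes above confThreshold, and drop any box whose overlap with an already-kept box exceeds nmsThreshold. The overlap() helper below is the simplified IoU stand-in, not the real paddle function.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct ScoredBox { double score, xMin, yMin, xMax, yMax; };

double overlap(const ScoredBox& a, const ScoredBox& b) {
  double ix = std::max(0.0, std::min(a.xMax, b.xMax) - std::max(a.xMin, b.xMin));
  double iy = std::max(0.0, std::min(a.yMax, b.yMax) - std::max(a.yMin, b.yMin));
  double inter = ix * iy;
  double uni = (a.xMax - a.xMin) * (a.yMax - a.yMin) +
               (b.xMax - b.xMin) * (b.yMax - b.yMin) - inter;
  return uni > 0.0 ? inter / uni : 0.0;
}

std::vector<std::size_t> nms(const std::vector<ScoredBox>& boxes,
                             double confThreshold, double nmsThreshold,
                             std::size_t topK) {
  std::vector<std::size_t> order(boxes.size()), kept;
  for (std::size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::sort(order.begin(), order.end(), [&](std::size_t a, std::size_t b) {
    return boxes[a].score > boxes[b].score;  // highest score first
  });
  if (order.size() > topK) order.resize(topK);
  for (std::size_t idx : order) {
    if (boxes[idx].score < confThreshold) continue;
    bool keep = true;
    for (std::size_t k : kept) {
      if (overlap(boxes[idx], boxes[k]) > nmsThreshold) { keep = false; break; }
    }
    if (keep) kept.push_back(idx);
  }
  return kept;
}

int main() {
  std::vector<ScoredBox> boxes = {
      {0.9, 0, 0, 1, 1}, {0.8, 0.05, 0.05, 1, 1}, {0.7, 2, 2, 3, 3}};
  for (std::size_t i : nms(boxes, 0.5, 0.45, 100)) std::cout << i << " ";  // 0 2
  std::cout << "\n";
  return 0;
}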
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for computing the dot product of two vectors. - * Input1: vector (batchSize * dim) - * Input2: vector (batchSize * dim) - * Output: a matrix: (batchSize * 1) - */ - -class DotProdLayer : public Layer { - public: - explicit DotProdLayer(const LayerConfig& config) : Layer(config) {} - - ~DotProdLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(dot_prod, DotProdLayer); - -bool DotProdLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - CHECK_EQ(1UL, getSize()) - << "The output dimensionality of this layer should be fixed to 1."; - - return true; -} - -void DotProdLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - CHECK_EQ(inV1->getHeight(), batchSize); - CHECK_EQ(inV0->getWidth(), inV1->getWidth()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, 1); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str()); - outV->sumOfProducts(*inV0, *inV1, 1, 0); - } -} - -void DotProdLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr outG = getOutputGrad(); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - - { - REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str()); - - if (inG0) { - inG0->addRowScale(0, *inV1, *outG); - } - - if (inG1) { - inG1->addRowScale(0, *inV0, *outG); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp deleted file mode 100644 index 04400f2836581179849a4dd1c256bbddcc82530f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/EosIdCheckLayer.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/utils/Logging.h" - -namespace paddle { -/** - * A layer for checking EOS for each sample: - * - output_id = (input_id == conf.eos_id) - * - * The result is stored in output_.ids. 
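A sketch of what DotProdLayer above computes per batch row, and of the gradients its backward pass accumulates (the addRowScale calls amount to d(in0) += outGrad * in1 and d(in1) += outGrad * in0). Names are illustrative; one row of each input is shown as a plain vector.

#include <cstddef>
#include <iostream>
#include <vector>

// Forward: y = sum_i a_i * b_i for one row.
double dotForward(const std::vector<double>& a, const std::vector<double>& b) {
  double y = 0.0;
  for (std::size_t i = 0; i < a.size(); ++i) y += a[i] * b[i];
  return y;
}

// Backward: accumulate gradients of both inputs for one row.
void dotBackward(const std::vector<double>& a, const std::vector<double>& b,
                 double outGrad, std::vector<double>* gradA,
                 std::vector<double>* gradB) {
  for (std::size_t i = 0; i < a.size(); ++i) {
    (*gradA)[i] += outGrad * b[i];
    (*gradB)[i] += outGrad * a[i];
  }
}

int main() {
  std::vector<double> a = {1, 2, 3}, b = {4, 5, 6};
  std::vector<double> ga(3, 0), gb(3, 0);
  std::cout << dotForward(a, b) << "\n";  // 32
  dotBackward(a, b, /*outGrad=*/1.0, &ga, &gb);
  std::cout << ga[0] << " " << gb[0] << "\n";  // 4 1
  return 0;
}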
- * It is used by recurrent layer group. - */ -class EosIdCheckLayer : public Layer { - public: - explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - bool ret = Layer::init(layerMap, parameterMap); - CHECK_EQ(1UL, inputLayers_.size()); - return ret; - } - - void forward(PassType passType) override { - Layer::forward(passType); - - const Argument& input = getInput(0); - IVector::resizeOrCreate(output_.ids, input.ids->getSize(), useGpu_); - output_.ids->isEqualTo(*input.ids, config_.eos_id()); - } - - void backward(const UpdateCallback& callback) override {} -}; - -REGISTER_LAYER(eos_id, EosIdCheckLayer); - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp deleted file mode 100644 index 3a8478658249bfb0886e904aec43e50fe3618f79..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ /dev/null @@ -1,248 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ExpandConvLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -DEFINE_bool(use_nnpack, - false, - "Whether to use nnpack for convolution calculation."); - -namespace paddle { - -/* - * The calculation of the exconvt(convolution transpose (deconv) operation) - * is a swap of forward and backward of the calculation of exconv. - * */ -REGISTER_LAYER(exconv, ExpandConvLayer); -REGISTER_LAYER(exconvt, ExpandConvLayer); - -inline bool isDepthwiseConv(int channels, int groups) { - return channels == groups; -} - -bool ExpandConvLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - /* Initialize the basic convolutional parent class */ - ConvBaseLayer::init(layerMap, parameterMap); - - int index = 0; - for (auto &inputConfig : config_.inputs()) { - const ConvConfig &conf = inputConfig.conv_conf(); - /* Consistent caffe mode for multiple input */ - caffeMode_ = conf.caffe_mode(); - - // create a new weight - size_t height, width; - height = filterPixels_[index] * filterChannels_[index]; - width = (!isDeconv_) ? 
numFilters_ : channels_[index]; - CHECK_EQ(parameters_[index]->getSize(), width * height); - Weight *w = new Weight(height, width, parameters_[index]); - weights_.emplace_back(w); - index++; - } - - if (biasParameter_.get()) { - if (sharedBiases_) { - CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); - biases_ = std::unique_ptr( - new Weight(1, numFilters_, biasParameter_, 0)); - } else { - biases_ = - std::unique_ptr(new Weight(1, getSize(), biasParameter_, 0)); - } - } - - getOutputSize(); - - size_t numInputs = config_.inputs_size(); - inputShape_.resize(numInputs); - filterShape_.resize(numInputs); - outputShape_.resize(numInputs); - - std::string convType; - std::string convGradInputType; - std::string convGradFilterType; - - for (int i = 0; i < config_.inputs_size(); i++) { - std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; - std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - std::vector dilations = {(size_t)dilationY_[i], - (size_t)dilation_[i]}; - - bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1); - - // Convolution Layer uses the GemmConv function by default. - convType = "GemmConv"; - convGradInputType = "GemmConvGradInput"; - convGradFilterType = "GemmConvGradFilter"; - - // If depth wise convolution and useGpu == true - if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { - convType = "DepthwiseConv"; - convGradInputType = "DepthwiseConvGradInput"; - convGradFilterType = "DepthwiseConvGradFilter"; - } - - // If depth wise convolution and useGpu == false and ARM-NEON - if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - if ((filterSize_[i] == filterSizeY_[i]) && - (filterSize_[i] == 3 || filterSize_[i] == 4) && - (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) && - !useDilation) { - convType = "NeonDepthwiseConv"; - } -#endif - } - - if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) { - createFunction(forward_, - "NNPACKConv", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i]) - .set("algo", std::string("auto"))); - } else { - createFunction(forward_, - !isDeconv_ ? convType : convGradInputType, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("dilations", dilations) - .set("groups", (size_t)groups_[i])); - - createFunction(backward_, - !isDeconv_ ? convGradInputType : convType, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("dilations", dilations) - .set("groups", (size_t)groups_[i])); - - createFunction(backward_, - convGradFilterType, - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("dilations", dilations) - .set("groups", (size_t)groups_[i])); - } - } - return true; -} - -size_t ExpandConvLayer::getOutputSize() { - CHECK_NE(inputLayers_.size(), 0UL); - size_t layerSize = ConvBaseLayer::calOutputSize(); - return layerSize; -} - -// i is the index of input layers -#define BACKWARD_INPUT(i, inputs, outputs) \ - backward_[2 * i]->calc(inputs, outputs) -#define BACKWARD_FILTER(i, inputs, outputs) \ - backward_[2 * i + 1]->calc(inputs, outputs) - -void ExpandConvLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - resetOutput(batchSize, getOutputSize()); - - // Calculate the shape of the input, output, and filter. 
- for (size_t i = 0; i < inputLayers_.size(); ++i) { - inputShape_[i] = TensorShape({(size_t)batchSize, - (size_t)channels_[i], - (size_t)imgSizeH_[i], - (size_t)imgSizeW_[i]}); - filterShape_[i] = - TensorShape({(size_t)groups_[i], - !isDeconv_ ? (size_t)numFilters_ / groups_[i] - : (size_t)channels_[i] / groups_[i], - !isDeconv_ ? (size_t)channels_[i] / groups_[i] - : (size_t)numFilters_ / groups_[i], - (size_t)filterSizeY_[i], - (size_t)filterSize_[i]}); - outputShape_[i] = TensorShape({(size_t)batchSize, - (size_t)numFilters_, - (size_t)outputH_[i], - (size_t)outputW_[i]}); - } - - // Calculate the output value. - for (size_t i = 0; i < inputLayers_.size(); ++i) { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(i), inputShape_[i]); - inputs.addArg(*weights_[i]->getW(), filterShape_[i]); - outputs.addArg(*getOutputValue(), - outputShape_[i], - !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO); - - forward_[i]->calc(inputs, outputs); - } - - /* add the bias-vector */ - if (biases_.get()) { - output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_); - } - - /* activation */ - forwardActivation(); -} - -void ExpandConvLayer::backward(const UpdateCallback &callback) { - backwardActivation(); - - MatrixPtr outGrad = getOutputGrad(); - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_); - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - // Calculate the input grad and filter grad. - for (size_t i = 0; i < inputLayers_.size(); ++i) { - if (getInputGrad(i)) { - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outputShape_[i]); - inputs.addArg(*weights_[i]->getW(), filterShape_[i]); - outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO); - BACKWARD_INPUT(i, inputs, outputs); - } - - if (weights_[i]->getWGrad()) { - BufferArgs inputs; - BufferArgs outputs; - if (!isDeconv_) { - inputs.addArg(*getOutputGrad(), outputShape_[i]); - inputs.addArg(*getInputValue(i), inputShape_[i]); - } else { - inputs.addArg(*getInputValue(i), inputShape_[i]); - inputs.addArg(*getOutputGrad(), outputShape_[i]); - } - outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO); - BACKWARD_FILTER(i, inputs, outputs); - - /* Increasing the number of gradient */ - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h deleted file mode 100644 index 6919ef71355a4c660b9ddd60bff75fee399cfaa9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "ConvBaseLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief A subclass of convolution layer. 
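A condensed restatement of the function-selection logic in ExpandConvLayer::init above, for the non-transposed (exconv) case only: GemmConv by default, DepthwiseConv on GPU when channels == groups, NeonDepthwiseConv on ARM CPUs for small undilated filters. One spatial dimension is used for brevity, the NNPACK branch is omitted, and the function name is illustrative.

#include <iostream>
#include <string>

std::string chooseConvFunction(bool useGpu, int channels, int groups,
                               int filterSize, int stride, bool useDilation,
                               bool armNeonAvailable) {
  const bool depthwise = (channels == groups);
  if (!depthwise) return "GemmConv";  // the default path
  if (useGpu) return "DepthwiseConv";
  if (armNeonAvailable && (filterSize == 3 || filterSize == 4) &&
      (stride == 1 || stride == 2) && !useDilation) {
    return "NeonDepthwiseConv";
  }
  return "GemmConv";
}

int main() {
  std::cout << chooseConvFunction(true, 32, 32, 3, 1, false, false) << "\n";   // DepthwiseConv
  std::cout << chooseConvFunction(false, 32, 32, 3, 1, false, true) << "\n";   // NeonDepthwiseConv
  std::cout << chooseConvFunction(true, 32, 1, 3, 1, false, false) << "\n";    // GemmConv
  return 0;
}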
- * This layer expands input and use matrix multiplication to - * calculate convolution operation. - * - * The config file api is img_conv_layer. - */ - -class ExpandConvLayer : public ConvBaseLayer { - public: - explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} - - ~ExpandConvLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - size_t getOutputSize(); - - protected: - std::vector inputShape_; - std::vector filterShape_; - std::vector outputShape_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp deleted file mode 100644 index 6b5776754017bca8f8c14170ecfb4faa4109e0b5..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ExpandLayer.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ExpandLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(expand, ExpandLayer); - -bool ExpandLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - CHECK_EQ(inputLayers_.size(), 2UL); - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - // which sequence type of input[0] - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); - return true; -} - -void ExpandLayer::forward(PassType passType) { - Layer::forward(passType); - // Expand layer should have exactly 2 input, one for data, one for size - CHECK_EQ(2U, inputLayers_.size()); - - // using two input: - // * first one for data; - // * second one only for sequence info - const Argument& shapeInput = getInput(1); - const Argument& dataInput = getInput(0); - size_t outputBatchSize = shapeInput.getBatchSize(); - auto startPositions = type_ ? 
shapeInput.subSequenceStartPositions - : shapeInput.sequenceStartPositions; - size_t numSequences = startPositions->getSize() - 1; - const int* starts = startPositions->getData(false); - - CHECK_EQ(starts[numSequences], shapeInput.getBatchSize()); - if (type_) { - // when trans_type = seq, input[1] must hasSubseq - CHECK_EQ(shapeInput.hasSubseq(), 1UL); - CHECK_EQ(dataInput.getNumSequences(), shapeInput.getNumSequences()); - } else { - CHECK_EQ(dataInput.getBatchSize(), shapeInput.getNumSequences()); - } - - // set output sequence info as shape sequence - output_.sequenceStartPositions = shapeInput.sequenceStartPositions; - if (shapeInput.hasSubseq()) { - output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions; - } - - // reserve output: Expand output to batchsize of sequence data. - reserveOutput(outputBatchSize, dataInput.value->getWidth()); - - MatrixPtr inputValue = getInputValue(0); - MatrixPtr outputValue = getOutputValue(); - - ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false); - int* expandStarts = expandStartsPos_->getMutableData(false); - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; - for (int j = 0; j < sequenceLength; j++) { - expandStarts[starts[sequenceId] + j] = sequenceId; - } - } - - outputValue->copyByRowIndex(*inputValue, - *expandStartsPos_->getVector(useGpu_)); - - if (biases_.get() != NULL) { - outputValue->addBias(*(biases_->getW()), 1); - } -} - -void ExpandLayer::backward(const UpdateCallback& callback) { - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - if (!getInputGrad(0)) return; - MatrixPtr inputGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions - : getInput(1).sequenceStartPositions; - size_t numSequences = cpuSeqStartPos->getSize() - 1; - const int* starts = cpuSeqStartPos->getData(false); - - CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth()); - CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]); - - AsyncGpuBlock asyncGpuBlock; - - // sum to get the grad - real scale = 1; - for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) { - // TODO(Dangqingqing) optimization for GPU - int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; - if (sequenceLength == 0) { - // empty sequence - continue; - } - MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1); - copyData->collectBias( - *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h deleted file mode 100644 index 06bd4ef05ee206628d981fee8e7eec3c91b18b7a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ExpandLayer.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * A layer for "Expand Dense data or (sequence data where the length of each - * sequence is one) to sequence data." - * - * It should have exactly 2 input, one for data, one for size: - * - first one for data - * - If ExpandLevel = kNonSeq: dense data - * - If ExpandLevel = kSeq: sequence data where the length of each sequence is - * one - * - second one only for sequence info - * - should be sequence data with or without sub-sequence. - * - * And the output size is the batch size(not instances) of second input. - * - * The config file api is expand_layer. - */ - -class ExpandLayer : public Layer { - protected: - std::unique_ptr biases_; - /// if input[0] is dense data, ExpandLevel=kNonSeq; - /// if input[0] is sequence data, ExpandLevel=kSeq - enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; - /// store the ExpandLevel - int type_; - /// expanded sequenceStartPositions or subSequenceStartPositions - /// of input[1] - ICpuGpuVectorPtr expandStartsPos_; - - public: - explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} - - ~ExpandLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp deleted file mode 100644 index 1744faada2ebd9f2c88ba9a3952b6b2646729e3b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
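A sketch of what ExpandLayer::forward above does with its two inputs: each row of the data input is copied once for every position of the matching sequence in the shape input, using the sequence start positions to build a row index (the expandStartsPos_ buffer in the layer). Vectors stand in for the Matrix API; the helper name is illustrative.

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<std::vector<double>> expandRows(
    const std::vector<std::vector<double>>& data,
    const std::vector<int>& seqStarts) {  // e.g. {0, 3, 5}: two sequences of lengths 3 and 2
  std::vector<std::vector<double>> out;
  const std::size_t numSequences = seqStarts.size() - 1;
  for (std::size_t seq = 0; seq < numSequences; ++seq) {
    const int length = seqStarts[seq + 1] - seqStarts[seq];
    for (int j = 0; j < length; ++j) out.push_back(data[seq]);  // replicate row `seq`
  }
  return out;
}

int main() {
  std::vector<std::vector<double>> data = {{1.0, 2.0}, {3.0, 4.0}};
  auto out = expandRows(data, {0, 3, 5});
  std::cout << out.size() << "\n";                      // 5 rows: 3 copies of row 0, 2 of row 1
  std::cout << out[3][0] << " " << out[3][1] << "\n";   // 3 4
  return 0;
}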
*/ - -#include "FactorizationMachineLayer.h" -#include -#include -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); - -bool FactorizationMachineLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - factorSize_ = config_.factor_size(); - - /* initialize the latentVectors_ */ - CHECK_EQ(inputLayers_.size(), 1UL); - size_t inputSize = inputLayers_[0]->getSize(); - CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); - latentVectors_ = std::unique_ptr( - new Weight(inputSize, factorSize_, parameters_[0])); - - return true; -} - -void FactorizationMachineLayer::forward(PassType passType) { - Layer::forward(passType); - - const MatrixPtr& inputV = getInputValue(0); - - size_t batchSize = inputV->getHeight(); - size_t outputSize = getSize(); - size_t inputSize = inputLayers_[0]->getSize(); - reserveOutput(batchSize, outputSize); - - MatrixPtr outV = getOutputValue(); - - Matrix::resizeOrCreate( - latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); - Matrix::resizeOrCreate( - inputMulFactor_, batchSize, factorSize_, false, useGpu_); - Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); - - REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); - inputMulFactor_->mul(*inputV, *latentVectors_->getW()); - inputMulFactor_->square2(*tmpOut_); - outV->sumRows(*tmpOut_, 0.5, 0); - - if (dynamic_cast(inputV.get())) { - Matrix::resizeOrCreateSparseMatrix(inputSquare_, - inputV->getHeight(), - inputV->getWidth(), - inputV->getElementCnt(), - inputV->getValueType()); - inputSquare_->copyFrom(*inputV); - (dynamic_cast(inputSquare_.get()))->square2(); - } else { - Matrix::resizeOrCreate( - inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); - inputV->square2(*inputSquare_); - } - latentVectors_->getW()->square2(*latentVectorsSquare_); - tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); - outV->sumRows(*tmpOut_, -0.5, 1.0); - - /* activation */ { - REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void FactorizationMachineLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { backwardActivation(); } - - const MatrixPtr& inputV = getInputValue(0); - const MatrixPtr& oGrad = getOutputGrad(); - - Matrix::resizeOrCreate( - tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); - MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), - latentVectors_->getW()->getHeight(), - 1, - false, - useGpu_); - - /* Calculate the gradients of the latentVectors_ matrix */ - if (latentVectors_->getWGrad()) { - if (dynamic_cast(inputV.get())) { - Matrix::resizeOrCreateSparseMatrix(tmpInput_, - inputV->getHeight(), - inputV->getWidth(), - inputV->getElementCnt()); - - CpuSparseMatrix* sparseInputV = - dynamic_cast(inputV.get()); - CpuSparseMatrix* sparseInputSquare = - dynamic_cast(inputSquare_.get()); - CpuSparseMatrix* sparseTmpInput = - dynamic_cast(tmpInput_.get()); - sparseTmpInput->copyFrom(*sparseInputV); - - sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); - latentVectors_->getWGrad()->mul( - *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); - sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); - - Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); - negOnes_->zeroMem(); - negOnes_->add(-1); 
- tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); - } else { - Matrix::resizeOrCreate( - tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); - - tmpInput_->rowScale(0, *inputV, *oGrad); - latentVectors_->getWGrad()->mul( - *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); - tmpInput_->rowScale(0, *inputSquare_, *oGrad); - - tmpSum_->sumCols(*tmpInput_, -1, 0); - } - - latentVectors_->getWGrad()->addRowScale( - 0, *latentVectors_->getW(), *tmpSumTrans); - - /* Increasing the number of gradient */ - latentVectors_->getParameterPtr()->incUpdate(callback); - } - - /* Calculate the input layers gradient */ - MatrixPtr inGrad = getInputGrad(0); - if (inGrad != NULL) { - inGrad->mul( - *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); - tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); - inGrad->addColScale(0, *inputV, *tmpSum_); - inGrad->rowScale(0, *inGrad, *oGrad); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h deleted file mode 100644 index 148abe238173dd44cd0fcf3f5cda732f70078706..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { -/** - * @brief The Factorization Machine models pairwise (order-2) feature - * interactions as inner product of the learned latent vectors corresponding - * to each input feature. - * - * The Factorization Machine can effectively capture feature interactions - * especially when the input is sparse. While in principle FM can model higher - * order feature interaction, in practice usually only order-2 feature - * interactions are considered. The Factorization Machine Layer here only - * computes the order-2 interations with the formula: - * - * \f[ - * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j - * \f] - * - * The detailed calculation for forward and backward can be found at this paper: - * - * Factorization machines. - * - * The config file api is factorization_machine. 
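A dense, stand-alone sketch of the order-2 interaction term described above, using the same identity the layer exploits: sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f [ (sum_i v_{if} x_i)^2 - sum_i v_{if}^2 x_i^2 ], which is what the inputMulFactor_ / inputSquare_ computations in FactorizationMachineLayer::forward implement. The sparse-input path is omitted and names are illustrative.

#include <cstddef>
#include <iostream>
#include <vector>

double fmOrder2(const std::vector<double>& x,
                const std::vector<std::vector<double>>& v) {  // v[i][f]: latent vectors
  const std::size_t factorSize = v[0].size();
  double y = 0.0;
  for (std::size_t f = 0; f < factorSize; ++f) {
    double sum = 0.0, sumSq = 0.0;
    for (std::size_t i = 0; i < x.size(); ++i) {
      sum += v[i][f] * x[i];                       // (x * V)_f
      sumSq += v[i][f] * v[i][f] * x[i] * x[i];    // (x^2 * V^2)_f
    }
    y += 0.5 * (sum * sum - sumSq);
  }
  return y;
}

int main() {
  // Two features with factor size 1: the result is simply <v_0, v_1> * x_0 * x_1.
  std::vector<double> x = {2.0, 3.0};
  std::vector<std::vector<double>> v = {{0.5}, {4.0}};
  std::cout << fmOrder2(x, v) << "\n";  // 0.5 * 4.0 * 2 * 3 = 12
  return 0;
}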
- */ - -class FactorizationMachineLayer : public Layer { - protected: - // The latent vectors, shape: (size, factorSize_) - // Each row of the latentVectors_ matrix is the latent vector - // corresponding to one input feature dimension - std::unique_ptr latentVectors_; - // The hyperparameter that defines the dimensionality of the factorization - size_t factorSize_; - - private: - // Store the square values of the letent vectors matrix - MatrixPtr latentVectorsSquare_; - // Store the square values of input matrix - MatrixPtr inputSquare_; - // The result of input matrix * latent vector matrix that will be used in - // both forward and backward step - MatrixPtr inputMulFactor_; - // Store temporary calculation result - MatrixPtr tmpOut_; - MatrixPtr tmpSum_; - MatrixPtr tmpInput_; - // Negative identity matrix - MatrixPtr negOnes_; - - public: - explicit FactorizationMachineLayer(const LayerConfig& config) - : Layer(config) {} - ~FactorizationMachineLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp deleted file mode 100644 index d95f0b9b3d13e8bff635373cb4d5705c2351bd97..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for expanding a batch of images to feature maps. - * Each data of the input is a 2 dimensional matrix. Each element of the matrix - * is replicated num_filters times to create a feature map with num_filters - * channels. - * - Input: Input one should be dense image data. - * - Output: expanded fature maps. 
- * \f[ - * y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1) - * \f] - * For example, num_filters = 4: - * @code - * x = [a1,a2; - * b1,b2] - * y = [a1, a2, a1, a2, a1, a2, a1, a2; - * b1, b2, b1, b2, b1, b2, b1, b2;] - * @endcode - */ - -class FeatureMapExpandLayer : public Layer { - private: - int numFilters_; - bool asRowVector_; - - public: - explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {} - - ~FeatureMapExpandLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer); - -bool FeatureMapExpandLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 1UL); - numFilters_ = config_.num_filters(); - asRowVector_ = config_.user_arg() != "as_col_vec"; - return true; -} - -void FeatureMapExpandLayer::forward(PassType passType) { - Layer::forward(passType); - MatrixPtr inputV = getInputValue(0); - size_t batchSize = getInput(0).getBatchSize(); - int imgSize = inputV->getWidth(); - resetOutput(batchSize, imgSize * numFilters_); - - MatrixPtr outputV = getOutputValue(); - - { - AsyncGpuBlock asyncGpuBlock; - if (asRowVector_) { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outVTmp = - Matrix::create(outputV->getData() + i * imgSize * numFilters_, - numFilters_, - imgSize, - false, - useGpu_); - MatrixPtr inVTmp = Matrix::create( - inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_); - outVTmp->addRowVector(*inVTmp); - } - } else { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outVTmp = - Matrix::create(outputV->getData() + i * imgSize * numFilters_, - imgSize, - numFilters_, - false, - useGpu_); - MatrixPtr inVTmp = Matrix::create( - inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_); - outVTmp->addColVector(*inVTmp); - } - } - } - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void FeatureMapExpandLayer::backward(const UpdateCallback& callback) { - MatrixPtr inGrad = getInputGrad(0); - if (NULL == inGrad) { - return; - } - MatrixPtr outGrad = getOutputGrad(); - size_t batchSize = getInput(0).getBatchSize(); - int imgSize = inGrad->getWidth(); - /* Do activation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - { - AsyncGpuBlock asyncGpuBlock; - if (asRowVector_) { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outGradTmp = - Matrix::create(outGrad->getData() + i * imgSize * numFilters_, - numFilters_, - imgSize, - false, - useGpu_); - MatrixPtr inGradTmp = Matrix::create( - inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_); - inGradTmp->collectBias(*outGradTmp, 1); - } - } else { - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr outGradTmp = - Matrix::create(outGrad->getData() + i * imgSize * numFilters_, - imgSize, - numFilters_, - false, - useGpu_); - MatrixPtr inGradTmp = Matrix::create( - inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_); - inGradTmp->sumRows(*outGradTmp, 1, 1); - } - } - } -} - -} // namespace paddle. 
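For reference, the per-sample tiling performed by the deleted featmap_expand layer in its "as row vector" mode can be sketched as standalone code (illustrative values, no Paddle types):

```cpp
#include <iostream>
#include <vector>

// Sketch of featmap_expand per sample: the input row of width imgSize is
// tiled numFilters times, so the output row has width imgSize * numFilters.
std::vector<double> expandAsRowVector(const std::vector<double>& row,
                                      int numFilters) {
  std::vector<double> out;
  out.reserve(row.size() * numFilters);
  for (int f = 0; f < numFilters; ++f) {
    out.insert(out.end(), row.begin(), row.end());
  }
  return out;
}

int main() {
  // x = [a1, a2], numFilters = 4  ->  y = [a1, a2, a1, a2, a1, a2, a1, a2]
  std::vector<double> x = {1.0, 2.0};
  for (double v : expandAsRowVector(x, 4)) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}
```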
diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h deleted file mode 100644 index a27aa4a12327ac39ec3418a849b1230e13f759ee..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FullMatrixProjection.h +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/utils/Stat.h" - -#include "Projection.h" - -namespace paddle { - -/** - * FullMatrixProjection performs full matrix multiplication: - * \f[ - * out.row[i] += in.row[i] * weight - * \f] - * - * The config file api is full_matrix_projection. - */ -class FullMatrixProjection : public Projection { - public: - FullMatrixProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, - bool useGpu); - virtual void forward(); - virtual void backward(const UpdateCallback& callback); - - protected: - std::unique_ptr weight_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp deleted file mode 100644 index 21ffa01d95a460b4b6edc2b02d63c19b32d0b070..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FullyConnectedLayer.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "FullyConnectedLayer.h" -#include -#include -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(fc, FullyConnectedLayer); - -bool FullyConnectedLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize the weightList */ - CHECK(inputLayers_.size() == parameters_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - // Option the parameters - size_t height = inputLayers_[i]->getSize(); - size_t width = getSize(); - - // create a new weight - if (parameters_[i]->isSparse()) { - CHECK_LE(parameters_[i]->getSize(), width * height); - } else { - CHECK_EQ(parameters_[i]->getSize(), width * height); - } - Weight* w = new Weight(height, width, parameters_[i]); - - // append the new weight to the list - weights_.emplace_back(w); - } - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - return true; -} - -void FullyConnectedLayer::prefetch() { - for (size_t i = 0; i != inputLayers_.size(); ++i) { - auto* sparseParam = - dynamic_cast(weights_[i]->getW().get()); - if (sparseParam) { - MatrixPtr input = getInputValue(i); - sparseParam->addRows(input); - } - } -} - -void FullyConnectedLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - auto input = getInput(i); - CHECK(input.value) << "The input of 'fc' layer must be matrix"; - REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0) - : outV->mul(*input.value, *weights_[i]->getW(), 1, 1); - } - - /* add the bias-vector */ - if (biases_.get() != NULL) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void FullyConnectedLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - bool syncFlag = hl_get_sync_flag(); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - /* Calculate the W-gradient for the current layer */ - if (weights_[i]->getWGrad()) { - MatrixPtr input_T = getInputValue(i)->getTranspose(); - MatrixPtr oGrad = getOutputGrad(); - { - REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1); - } - } - - // If callback does not change value, backprop error asynchronously so that - // we can do the callback concurrently. 
- hl_set_sync_flag(false); - - /* Calculate the input layers error */ - MatrixPtr preGrad = getInputGrad(i); - if (NULL != preGrad) { - MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); - REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - preGrad->mul(*getOutputGrad(), *weights_T, 1, 1); - } - - hl_set_sync_flag(syncFlag); - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h deleted file mode 100644 index e0f9d6ce55fbdf73e5507032c108c735bf04597b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/FullyConnectedLayer.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { -/** - * A layer has full connections to all neurons in the previous layer. - * It computes an inner product with a set of learned weights, and - * (optionally) adds biases. - * - * The config file api is fc_layer. - */ - -class FullyConnectedLayer : public Layer { - protected: - WeightList weights_; - std::unique_ptr biases_; - - public: - explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {} - ~FullyConnectedLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - Weight& getWeight(int idx) { return *weights_[idx]; } - - void prefetch() override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp deleted file mode 100644 index 9d38849fdf97e6099e39384dd7e6546de9180462..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GatedRecurrentLayer.cpp +++ /dev/null @@ -1,414 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
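The fc layer deleted above reduces, per input, to a plain affine map before the activation. Below is a minimal dense sketch with made-up dimensions; the real layer additionally handles multiple inputs, sparse weights, prefetching, and GPU matrices.

```cpp
#include <iostream>
#include <vector>

// Minimal dense sketch of the fc forward pass:
//   out[b][j] = sum_i in[b][i] * W[i][j] + bias[j]   (activation omitted)
using Mat = std::vector<std::vector<double>>;

Mat fcForward(const Mat& in, const Mat& W, const std::vector<double>& bias) {
  Mat out(in.size(), std::vector<double>(bias.size(), 0.0));
  for (std::size_t b = 0; b < in.size(); ++b) {
    for (std::size_t j = 0; j < bias.size(); ++j) {
      double acc = bias[j];
      for (std::size_t i = 0; i < W.size(); ++i) acc += in[b][i] * W[i][j];
      out[b][j] = acc;
    }
  }
  return out;
}

int main() {
  Mat in = {{1.0, 2.0}};               // batchSize = 1, inputSize = 2
  Mat W = {{0.5, -1.0}, {0.25, 2.0}};  // inputSize x outputSize
  std::vector<double> bias = {0.1, 0.2};
  Mat out = fcForward(in, W, bias);
  std::cout << out[0][0] << " " << out[0][1] << "\n";  // 1.1 3.2
  return 0;
}
```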
*/ - -#include "GatedRecurrentLayer.h" -#include "Layer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(gated_recurrent, GatedRecurrentLayer); - -bool GatedRecurrentLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize()); - CHECK_EQ(getSize() * 3, biasParameter_->getSize()); - weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); - gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0)); - stateWeight_.reset(new Weight( - getSize(), getSize(), parameters_[0], 2 * getSize() * getSize())); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, getSize() * 3, biasParameter_)); - } - - reversed_ = config_.reversed(); - activationGate_.reset(ActivationFunction::create(config_.active_gate_type())); - - GruCompute::init(config_); - useBatch_ = true; - - return true; -} - -void GatedRecurrentLayer::resetState() { - CHECK(!reversed_) << "state is not allowed for reversed gated " - "recurrent layer"; - Matrix::resizeOrCreate( - prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); - prevOutput_->zeroMem(); - - // TODO(hedaoyuan): support prev_batch_state - CHECK(!FLAGS_prev_batch_state) << "Not supported"; - - useBatch_ = false; -} - -void GatedRecurrentLayer::setState(LayerStatePtr state) { - CHECK(state->value.size() == 1) - << "one matrix is expected for GatedRecurrentLayer state"; - prevOutput_->copyFrom(*(state->value[0])); -} - -LayerStatePtr GatedRecurrentLayer::getState() { - LayerStatePtr res = std::make_shared(); - res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); - res->value[0]->copyFrom(*prevOutput_); - return res; -} - -void GatedRecurrentLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("GruFwTimer", getName().c_str()); - Layer::forward(passType); - - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - resetOutput(batchSize, getSize()); - CHECK_EQ(getSize() * 3, input.value->getWidth()); - const int* starts = input.sequenceStartPositions->getData(false); - // batchSize = length of total frames in a batch (NOT size of mini-batch) - CHECK_EQ(starts[numSequences], batchSize); - - Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, - getSize() * 3, - /* trans= */ false, - useGpu_); - Matrix::resizeOrCreate(resetOutput_.value, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - - if (useBatch_) { - forwardBatch(batchSize, numSequences, starts, input.value); - } else { - forwardSequence(batchSize, numSequences, starts, input.value); - } -} - -void GatedRecurrentLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("GruBwTimer", getName().c_str()); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - const int* starts = input.sequenceStartPositions->getData(false); - size_t numSequences = input.getNumSequences(); - - Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, - getSize() * 3, - /* trans= */ false, - useGpu_); - Matrix::resizeOrCreate(resetOutput_.grad, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - - if (useBatch_) { - backwardBatch(batchSize, input.grad); - } else { - backwardSequence(batchSize, numSequences, 
starts, input.grad); - } - - if (bias_) { - bias_->getParameterPtr()->incUpdate(callback); - } - - weight_->getParameterPtr()->incUpdate(callback); -} - -void GatedRecurrentLayer::forwardSequence(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str()); - gate_.value->assign(*inputValue); - if (bias_) { - gate_.value->addBias(*(bias_->getW()), 1); - } - - hl_gru_value gruValue; - gruValue.gateWeight = (gateWeight_->getW())->getData(); - gruValue.stateWeight = (stateWeight_->getW())->getData(); - gruValue.gateValue = gate_.value->getData(); - gruValue.resetOutputValue = resetOutput_.value->getData(); - gruValue.outputValue = output_.value->getData(); - gruValue.prevOutValue = nullptr; - - if (reversed_) { - gruValue.gateValue += (batchSize - 1) * getSize() * 3; - gruValue.resetOutputValue += (batchSize - 1) * getSize(); - gruValue.outputValue += (batchSize - 1) * getSize(); - } - - auto nextFrame = [&gruValue](bool reversed, int frameSize) { - gruValue.prevOutValue = gruValue.outputValue; - if (!reversed) { - gruValue.gateValue += frameSize * 3; - gruValue.resetOutputValue += frameSize; - gruValue.outputValue += frameSize; - } else { - gruValue.gateValue -= frameSize * 3; - gruValue.resetOutputValue -= frameSize; - gruValue.outputValue -= frameSize; - } - }; - - if (!reversed_) { - if (prevOutput_) { - gruValue.prevOutValue = prevOutput_->getData(); - } - } - AsyncGpuBlock asyncGpuBlock; - for (size_t n = 0; n < numSequences; ++n) { - int length; - if (!reversed_) { - length = starts[n + 1] - starts[n]; - } else { - length = starts[numSequences - n] - starts[numSequences - n - 1]; - } - for (int l = 0; l < length; ++l) { - if (useGpu_) { - GruCompute::forward<1>(gruValue, getSize()); - } else { - GruCompute::forward<0>(gruValue, getSize()); - } - - nextFrame(reversed_, getSize()); - } - if (!reversed_) { - if (!prevOutput_) gruValue.prevOutValue = nullptr; - } else { - gruValue.prevOutValue = nullptr; - } - } - - if (!reversed_) { - if (prevOutput_) { - prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1)); - } - } -} - -void GatedRecurrentLayer::backwardSequence(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str()); - - hl_gru_value gruValue; - gruValue.gateWeight = (gateWeight_->getW())->getData(); - gruValue.stateWeight = (stateWeight_->getW())->getData(); - gruValue.gateValue = gate_.value->getData(); - gruValue.resetOutputValue = resetOutput_.value->getData(); - gruValue.outputValue = output_.value->getData(); - - hl_gru_grad gruGrad; - gruGrad.gateWeightGrad = - (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); - gruGrad.stateWeightGrad = - (stateWeight_->getWGrad() ? 
stateWeight_->getWGrad()->getData() - : nullptr); - gruGrad.gateGrad = gate_.grad->getData(); - gruGrad.resetOutputGrad = resetOutput_.grad->getData(); - gruGrad.outputGrad = output_.grad->getData(); - - if (!reversed_) { - gruValue.gateValue += (batchSize - 1) * getSize() * 3; - gruValue.resetOutputValue += (batchSize - 1) * getSize(); - gruValue.outputValue += (batchSize - 1) * getSize(); - gruGrad.gateGrad += (batchSize - 1) * getSize() * 3; - gruGrad.resetOutputGrad += (batchSize - 1) * getSize(); - gruGrad.outputGrad += (batchSize - 1) * getSize(); - gruValue.prevOutValue = gruValue.outputValue - getSize(); - gruGrad.prevOutGrad = gruGrad.outputGrad - getSize(); - } else { - gruValue.prevOutValue = gruValue.outputValue + getSize(); - gruGrad.prevOutGrad = gruGrad.outputGrad + getSize(); - } - - auto nextFrame = [&gruValue, &gruGrad](bool reversed, int frameSize) { - if (reversed) { - gruValue.gateValue += frameSize * 3; - gruValue.resetOutputValue += frameSize; - gruValue.outputValue += frameSize; - gruGrad.gateGrad += frameSize * 3; - gruGrad.resetOutputGrad += frameSize; - gruGrad.outputGrad += frameSize; - gruValue.prevOutValue = gruValue.outputValue + frameSize; - gruGrad.prevOutGrad = gruGrad.outputGrad + frameSize; - } else { - gruValue.gateValue -= frameSize * 3; - gruValue.resetOutputValue -= frameSize; - gruValue.outputValue -= frameSize; - gruGrad.gateGrad -= frameSize * 3; - gruGrad.resetOutputGrad -= frameSize; - gruGrad.outputGrad -= frameSize; - gruValue.prevOutValue = gruValue.outputValue - frameSize; - gruGrad.prevOutGrad = gruGrad.outputGrad - frameSize; - } - }; - - { - AsyncGpuBlock asyncGpuBlock; - for (size_t n = 0; n < numSequences; ++n) { - int length; - if (reversed_) { - length = starts[n + 1] - starts[n]; - } else { - length = starts[numSequences - n] - starts[numSequences - n - 1]; - } - for (int l = 0; l < length; ++l) { - if (l == length - 1) { - gruValue.prevOutValue = nullptr; - gruGrad.prevOutGrad = nullptr; - } - if (useGpu_) { - GruCompute::backward<1>(gruValue, gruGrad, getSize()); - } else { - GruCompute::backward<0>(gruValue, gruGrad, getSize()); - } - nextFrame(reversed_, getSize()); - } - } - } - - if (inputGrad) { - inputGrad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - bias_->getWGrad()->collectBias(*gate_.grad, 1); - } -} - -void GatedRecurrentLayer::forwardBatch(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str()); - hl_gru_value gruValue; - gruValue.gateWeight = (gateWeight_->getW())->getData(); - gruValue.stateWeight = (stateWeight_->getW())->getData(); - - if (!batchValue_) { - batchValue_.reset(new SequenceToBatch(useGpu_)); - } - batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); - - batchValue_->resizeOrCreate(*output_.value); - batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); - if (bias_) { - gate_.value->addBias(*(bias_->getW()), 1); - } - - { - int numBatch = batchValue_->getNumBatch(); - int curBatchSize = 0; - AsyncGpuBlock asyncGpuBlock; - for (int n = 0; n < numBatch; n++) { - MatrixPtr outputValueTmp = batchValue_->getBatchValue(n); - gruValue.outputValue = outputValueTmp->getData(); - gruValue.gateValue = - (batchValue_->getBatchValue(*gate_.value, n))->getData(); - gruValue.resetOutputValue = - (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); - - curBatchSize = outputValueTmp->getHeight(); - gruValue.prevOutValue = - (n == 0 - ? 
nullptr - : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData()); - - { - if (useGpu_) { - GruCompute::forward<1>(gruValue, getSize(), curBatchSize); - } else { - GruCompute::forward<0>(gruValue, getSize(), curBatchSize); - } - } - } - } - { batchValue_->copyBackSeq(*output_.value); } -} - -void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str()); - hl_gru_value gruValue; - gruValue.gateWeight = (gateWeight_->getW())->getData(); - gruValue.stateWeight = (stateWeight_->getW())->getData(); - - hl_gru_grad gruGrad; - gruGrad.gateWeightGrad = - (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); - gruGrad.stateWeightGrad = - (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() - : nullptr); - - if (!batchGrad_) { - batchGrad_.reset(new SequenceToBatch(useGpu_)); - } - batchGrad_->shareIndexWith(*batchValue_); - - { batchGrad_->copyFromSeq(*output_.grad); } - - { - int numBatch = batchGrad_->getNumBatch(); - int batchSize = 0; - AsyncGpuBlock asyncGpuBlock; - for (int n = (int)numBatch - 1; n >= 0; n--) { - gruValue.gateValue = - (batchGrad_->getBatchValue(*gate_.value, n))->getData(); - gruValue.resetOutputValue = - (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); - - MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); - gruGrad.outputGrad = outputGradTmp->getData(); - gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData(); - gruGrad.resetOutputGrad = - (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData(); - - { - batchSize = outputGradTmp->getHeight(); - gruValue.prevOutValue = - (n == 0 - ? nullptr - : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); - gruGrad.prevOutGrad = - (n == 0 ? nullptr - : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); - - if (useGpu_) { - GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); - } else { - GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); - } - } - } - } - - if (inputGrad) { - batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); - } - if (bias_ && bias_->getWGrad()) { - bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h deleted file mode 100644 index 46508dc977bf1a6fd33dc1fb024bd1aed36a0ff3..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GatedRecurrentLayer.h +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "GruCompute.h" -#include "Layer.h" -#include "SequenceToBatch.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief Please refer to "Junyoung Chung, Empirical Evaluation - * of Gated Recurrent Neural Networks on Sequence Modeling". - * - * GatedRecurrentLayer takes 1 input layer with size * 3. 
- * Input layer is diveded into 3 equal parts: (xz_t, xr_t, xi_t). - * parameter and biasParameter is also diveded into 3 equal parts: - * - parameter consists of (U_z, U_r, U) - * - baisParameter consists of (bias_z, bias_r, bias_o) - * - * \f[ - * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\ - * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\ - * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\ - * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\ - * \f] - * - * @note - * - dot denotes "element-wise multiplication". - * - actNode is defined by config active_type - * - actGate is defined by config actvie_gate_type - * - * The config file is grumemory. - */ - -class GatedRecurrentLayer : public Layer, public GruCompute { - public: - explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - - protected: - void forwardSequence(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputValue); - void backwardSequence(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputGrad); - - void forwardBatch(int batchSize, - size_t numSequences, - const int* starts, - MatrixPtr inputValue); - void backwardBatch(int batchSize, MatrixPtr inputGrad); - - protected: - std::unique_ptr weight_; - std::unique_ptr gateWeight_; - std::unique_ptr stateWeight_; - std::unique_ptr bias_; - - Argument gate_; - Argument resetOutput_; - - bool reversed_; - bool useBatch_; - std::unique_ptr batchValue_; - std::unique_ptr batchGrad_; - std::unique_ptr activationGate_; - - MatrixPtr prevOutput_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp deleted file mode 100644 index 48ddbc413e6c915be6e86704f96e919932ca2970..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GruCompute.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
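The recurrence described in the deleted GatedRecurrentLayer header can be followed with a single scalar GRU step, sigmoid gates and tanh candidate, as in the standalone sketch below (frame size 1, all numbers illustrative):

```cpp
#include <cmath>
#include <iostream>

// Standalone sketch of one GRU step following the equations in the deleted
// header comment. xz, xr, xi are the three slices of the projected input;
// Uz, Ur, U are the recurrent weights; bz, br, bo the biases.
struct GruStep {
  static double sigmoid(double v) { return 1.0 / (1.0 + std::exp(-v)); }

  static double step(double xz, double xr, double xi,
                     double Uz, double Ur, double U,
                     double bz, double br, double bo,
                     double hPrev) {
    double z = sigmoid(xz + Uz * hPrev + bz);             // update gate
    double r = sigmoid(xr + Ur * hPrev + br);             // reset gate
    double hCand = std::tanh(xi + U * (r * hPrev) + bo);  // output candidate
    return (1.0 - z) * hPrev + z * hCand;                 // new hidden state
  }
};

int main() {
  double h = 0.0;
  for (int t = 0; t < 3; ++t) {
    h = GruStep::step(/*xz=*/0.5, /*xr=*/0.1, /*xi=*/1.0,
                      /*Uz=*/0.2, /*Ur=*/0.3, /*U=*/0.4,
                      /*bz=*/0.0, /*br=*/0.0, /*bo=*/0.0, h);
    std::cout << "h[" << t << "] = " << h << "\n";
  }
  return 0;
}
```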
*/ - -#include "GruCompute.h" -#include "hl_recurrent_apply.cuh" -#include "paddle/function/GruFunctor.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -void GruCompute::init(LayerConfig &config) { - activeNode_ = hlActiveType(config.active_type()); - activeGate_ = hlActiveType(config.active_gate_type()); -} - -template <> -void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { - GruFunctor::compute(hppl::forward::gru_resetOutput(), - hppl::forward::gru_finalOutput(), - value, - frameSize, - batchSize, - activeNode_, - activeGate_); -} - -template <> -void GruCompute::backward<0>(hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize) { - GruGradFunctor::compute( - hppl::backward::gru_stateGrad(), - hppl::backward::gru_resetGrad(), - value, - grad, - frameSize, - batchSize, - activeNode_, - activeGate_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h deleted file mode 100644 index 50006325ce9969c4941aaf28604260f0aeb9b97a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GruCompute.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ModelConfig.pb.h" -#include "hl_gpu.h" -#include "paddle/utils/Common.h" - -namespace paddle { - -class GruCompute { - public: - void init(LayerConfig &config); - - template - void forward(hl_gru_value value, int frameSize, int batchSize = 1); - - template - void backward(hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize = 1); - - public: - hl_activation_mode_t activeNode_; - hl_activation_mode_t activeGate_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp deleted file mode 100644 index 114f287411c2fccbc08b7da4c05462967c81b268..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/GruStepLayer.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "GruCompute.h" -#include "Layer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief GruStepLayer is like GatedRecurrentLayer, but used in recurrent - * layer group. GruStepLayer takes 2 input layer. - * - input[0] with size * 3 and diveded into 3 equal parts: (xz_t, xr_t, xi_t). - * - input[1] with size: {prev_out}. 
- * - * parameter and biasParameter is also diveded into 3 equal parts: - * - parameter consists of (U_z, U_r, U) - * - baisParameter consists of (bias_z, bias_r, bias_o) - * - * \f[ - * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\ - * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r) \\ - * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o) - * \\ - * output: h_t = dot((1-z_t), prev_out) + dot(z_t, prev_out) - * \f] - * - * @note - * - dot denotes "element-wise multiplication". - * - actNode is defined by config active_type - * - actGate is defined by config actvie_gate_type - * - * The config file api if gru_step_layer. - */ -class GruStepLayer : public Layer, public GruCompute { - protected: - Argument gate_; - Argument resetOutput_; - std::unique_ptr weight_; - std::unique_ptr bias_; - - public: - explicit GruStepLayer(const LayerConfig& config) : Layer(config) {} - - ~GruStepLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(gru_step, GruStepLayer); - -bool GruStepLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(2U, inputLayers_.size()); - - CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize()); - weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); - - if (biasParameter_.get() != NULL) { - CHECK_EQ(getSize() * 3, biasParameter_->getSize()); - bias_.reset(new Weight(1, getSize() * 3, biasParameter_)); - } - - GruCompute::init(config_); - return true; -} - -void GruStepLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("GruStepFwTime", getName().c_str()); - Layer::forward(passType); - - const Argument& input = getInput(0); - const Argument& prevOutput = getInput(1); - CHECK_EQ(getSize() * 3, input.value->getWidth()); - CHECK_EQ(getSize(), prevOutput.value->getWidth()); - - int batchSize = input.getBatchSize(); - resetOutput(batchSize, getSize()); - resetSpecifyOutput(gate_, - batchSize, - getSize() * 3, - /* isValueClean */ false, - /* isGradClean */ false); - resetSpecifyOutput(resetOutput_, - batchSize, - getSize(), - /* isValueClean */ false, - /* isGradClean */ false); - gate_.value->assign(*input.value); - if (bias_) { - gate_.value->addBias(*(bias_->getW()), 1); - } - - hl_gru_value gruValue; - gruValue.gateWeight = weight_->getW()->getData(); - gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; - gruValue.gateValue = gate_.value->getData(); - gruValue.resetOutputValue = resetOutput_.value->getData(); - gruValue.outputValue = output_.value->getData(); - gruValue.prevOutValue = prevOutput.value->getData(); - - if (useGpu_) { - GruCompute::forward<1>(gruValue, getSize(), batchSize); - } else { - GruCompute::forward<0>(gruValue, getSize(), batchSize); - } -} - -void GruStepLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("GruStepBwTime", getName().c_str()); - - const Argument& input = getInput(0); - const Argument& prevOutput = getInput(1); - int batchSize = input.getBatchSize(); - - hl_gru_value gruValue; - gruValue.gateWeight = weight_->getW()->getData(); - gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; - gruValue.gateValue = gate_.value->getData(); - gruValue.resetOutputValue = resetOutput_.value->getData(); - gruValue.outputValue = 
output_.value->getData(); - gruValue.prevOutValue = prevOutput.value->getData(); - - hl_gru_grad gruGrad; - gruGrad.gateWeightGrad = - (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr); - gruGrad.stateWeightGrad = - (weight_->getWGrad() - ? weight_->getWGrad()->getData() + getSize() * getSize() * 2 - : nullptr); - - gruGrad.gateGrad = gate_.grad->getData(); - gruGrad.resetOutputGrad = resetOutput_.grad->getData(); - gruGrad.outputGrad = output_.grad->getData(); - if (prevOutput.grad) { - gruGrad.prevOutGrad = prevOutput.grad->getData(); - } else { - gruGrad.prevOutGrad = nullptr; - } - - if (useGpu_) { - GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); - } else { - GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); - } - - if (input.grad) { - input.grad->add(*gate_.grad); - } - - if (bias_ && bias_->getWGrad()) { - bias_->getWGrad()->collectBias(*gate_.grad, 1); - } - - if (bias_) { - bias_->getParameterPtr()->incUpdate(callback); - } - weight_->getParameterPtr()->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp deleted file mode 100644 index 3e720f179ee66baa73f40b8f5f19bfb4090831c0..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ /dev/null @@ -1,240 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "HierarchicalSigmoidLayer.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -REGISTER_LAYER(hsigmoid, HierarchicalSigmoidLayer); - -bool HierarchicalSigmoidLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK(config_.has_num_classes()) << "num_classes must be specifed in config"; - numClasses_ = config_.num_classes(); - CHECK_GE(numClasses_, (size_t)2); - codeLength_ = findLastSet(numClasses_ - 1); - - size_t height = numClasses_ - 1; - - /* initialize the weightList */ - // The last input layer is for label - CHECK(!parameters_.back()); - for (size_t i = 0; i < inputLayers_.size() - 1; i++) { - size_t width = inputLayers_[i]->getSize(); - // create a new weight - CHECK_EQ(parameters_[i]->getSize(), width * height); - Weight* w = new Weight(height, width, parameters_[i]); - - // append the new weight to the list - weights_.emplace_back(w); - } - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - CHECK_EQ(biasParameter_->getSize(), numClasses_ - 1); - biases_.reset(new Weight(1, numClasses_ - 1, biasParameter_)); - } - - return true; -} - -void HierarchicalSigmoidLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(0)->getHeight(); - int size = getSize(); - reserveOutput(batchSize, size); - Matrix::resizeOrCreate(preOutput_.value, - batchSize, - codeLength_, - /* trans */ false, - false); - Matrix::resizeOrCreate(preOutput_.grad, - batchSize, - codeLength_, - /* trans */ false, - false); - IVectorPtr label = getInput(*getLabelLayer()).ids; - preOutput_.value->zeroMem(); - - if (useGpu_) { - Matrix::resizeOrCreate(cpuOutput_, - output_.value->getHeight(), - output_.value->getWidth(), - /* trans */ false, - false); - IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); - cpuLabel_->copyFrom(*label); - cpuOutput_->copyFrom(*output_.value); - } else { - cpuOutput_ = output_.value; - cpuLabel_ = label; - } - /* add the bias-vector */ - if (biases_.get() != NULL) { - if (useGpu_) { - Matrix::resizeOrCreate(cpuBias_, - 1, - numClasses_ - 1, - /* trans */ false, - false); - cpuBias_->copyFrom(*biases_->getW()); - } else { - cpuBias_ = biases_->getW(); - } - preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_); - } - for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { - MatrixPtr input = getInputValue(i); - if (useGpu_) { - Matrix::resizeOrCreate(cpuInput_, - input->getHeight(), - input->getWidth(), - /* trans */ false, - false); - Matrix::resizeOrCreate(cpuWeight_, - weights_[i]->getW()->getHeight(), - weights_[i]->getW()->getWidth(), - /* trans */ false, - false); - cpuInput_->copyFrom(*input); - cpuWeight_->copyFrom(*weights_[i]->getW()); - } else { - cpuInput_ = input; - cpuWeight_ = weights_[i]->getW(); - } - preOutput_.value->mulByBitCode( - numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_); - } - // keep consistent with the clipping in the following softrelu - preOutput_.value->clip(-40.0, 40.0); - preOutput_.value->sumByBitCode(numClasses_, - *cpuLabel_, - *cpuOutput_, - -1); // scaleSum - preOutput_.value->softrelu(*preOutput_.value); - MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false); - preOutput_.value->rowSum(*sum); - cpuOutput_->add(*sum); - if (useGpu_) { - output_.value->copyFrom(*cpuOutput_); - } else { - output_.value = cpuOutput_; - } -} - -void HierarchicalSigmoidLayer::backward(const 
UpdateCallback& callback) { - IVectorPtr label = getInput(*getLabelLayer()).ids; - if (useGpu_) { - IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); - cpuLabel_->copyFrom(*label); - } else { - cpuLabel_ = label; - } - preOutput_.grad->one(); - preOutput_.grad->softreluDerivative(*preOutput_.value); - preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_); - - if (biases_ && biases_->getWGrad()) { - MatrixPtr biases_grad = biases_->getWGrad(); - if (useGpu_) { - Matrix::resizeOrCreate(cpuBias_, - 1, - numClasses_ - 1, - /* trans */ false, - false); - cpuBias_->copyFrom(*biases_grad); - } else { - cpuBias_ = biases_grad; - } - preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_); - if (useGpu_) { - biases_grad->copyFrom(*cpuBias_); - } else { - biases_grad = cpuBias_; - } - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { - /* Calculate the W-gradient for the current layer */ - MatrixPtr input = getInputValue(i); - if (weights_[i]->getWGrad()) { - MatrixPtr weights_grad = weights_[i]->getWGrad(); - if (useGpu_) { - Matrix::resizeOrCreate(cpuInput_, - input->getHeight(), - input->getWidth(), - /* trans */ false, - false); - Matrix::resizeOrCreate(cpuWeightGrad_, - weights_grad->getHeight(), - weights_grad->getWidth(), - /* trans */ false, - false); - cpuInput_->copyFrom(*input); - cpuWeightGrad_->copyFrom(*weights_grad); - } else { - cpuInput_ = input; - cpuWeightGrad_ = weights_grad; - } - preOutput_.grad->mulByBitCodeBackwardWeight( - numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_); - if (useGpu_) { - weights_grad->copyFrom(*cpuWeightGrad_); - } else { - weights_grad = cpuWeightGrad_; - } - /* Increasing the number of gradient */ - weights_[i]->getParameterPtr()->incUpdate(callback); - } - - /* Calculate the input layers error */ - MatrixPtr inputGrad = getInputGrad(i); - if (inputGrad) { - if (useGpu_) { - Matrix::resizeOrCreate(cpuInputGrad_, - inputGrad->getHeight(), - inputGrad->getWidth(), - /* trans */ false, - false); - Matrix::resizeOrCreate(cpuWeight_, - weights_[i]->getW()->getHeight(), - weights_[i]->getW()->getWidth(), - /* trans */ false, - false); - cpuInputGrad_->copyFrom(*inputGrad); - cpuWeight_->copyFrom(*weights_[i]->getW()); - } else { - cpuInputGrad_ = inputGrad; - cpuWeight_ = weights_[i]->getW(); - } - preOutput_.grad->mulByBitCodeBackwardError( - numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_); - if (useGpu_) { - inputGrad->copyFrom(*cpuInputGrad_); - } else { - inputGrad = cpuInputGrad_; - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp deleted file mode 100644 index 34e9eb90161f7942c528b70f177e30f301a8f53f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/IdentityProjection.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
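The deleted hsigmoid layer scores a label with findLastSet(numClasses - 1) binary decisions instead of a full numClasses-way softmax, which is where its speed-up comes from. The sketch below only illustrates that code length and the per-sample cost; it does not reproduce the exact bit-code layout of the removed implementation.

```cpp
#include <iostream>

// Position of the highest set bit, mirroring how the deleted init() derives
// codeLength_ from numClasses_ - 1.
int findLastSet(unsigned v) {
  int n = 0;
  while (v) { ++n; v >>= 1; }
  return n;
}

int main() {
  unsigned numClasses = 10;
  int codeLength = findLastSet(numClasses - 1);  // 4 bits for 10 classes
  std::cout << "codeLength = " << codeLength << "\n";

  // Each sample touches only codeLength rows of the
  // (numClasses - 1) x inputSize weight matrix.
  std::cout << "per-sample work: " << codeLength << " of "
            << (numClasses - 1) << " internal nodes\n";
  return 0;
}
```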
*/ - -#include "Projection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * IdentityProjection performs addition: - * \f[ - * out.row[i] += in.row[i] - * \f] - * - * The config file api is identity_projection. - */ -class IdentityProjection : public Projection { - public: - IdentityProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, - bool useGpu); - virtual void forward(); - virtual void backward(const UpdateCallback& callback); -}; - -REGISTER_PROJECTION(identity, IdentityProjection); - -/** - * Constructed function. - * @note IdentityProjection should not have any parameter. - */ -IdentityProjection::IdentityProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, - bool useGpu) - : Projection(config, parameter, useGpu) { - CHECK(!parameter) << "'identity' projection should not have any parameter"; -} - -void IdentityProjection::forward() { out_->value->add(*in_->value); } - -void IdentityProjection::backward(const UpdateCallback& callback) { - if (in_->grad) { - in_->grad->add(*out_->grad); - } -} - -/** - * IdentityOffsetProjection likes IdentityProjection, but layer size may be - * smaller - * than input size. It selects dimensions [offset, offset+layer_size) from input - * to - * perform addition: - * \f[ - * out.row[i] += in.row[i + \textrm{offset}] - * \f] - * - * The config file api is identity_projection. - */ -class IdentityOffsetProjection : public Projection { - public: - IdentityOffsetProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, - bool useGpu); - virtual void forward(); - virtual void backward(const UpdateCallback& callback); -}; - -REGISTER_PROJECTION(identity_offset, IdentityOffsetProjection); - -/** - * Constructed function. - * @note IdentityOffsetProjection should not have any parameter. - */ -IdentityOffsetProjection::IdentityOffsetProjection( - const ProjectionConfig& config, const ParameterPtr& parameter, bool useGpu) - : Projection(config, parameter, useGpu) { - CHECK(!parameter) << "'identity_offset' projection " - "should not have any parameter"; - CHECK_LE(config.output_size() + config.offset(), config.input_size()); -} - -void IdentityOffsetProjection::forward() { - out_->value->addAtOffset(*in_->value, config_.offset()); -} - -void IdentityOffsetProjection::backward(const UpdateCallback& callback) { - if (in_->grad) { - in_->grad->addAtOffset(*out_->grad, config_.offset()); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp deleted file mode 100644 index 509c07cf22c9bcbe9283241b38540162b3dbe26b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/InterpolationLayer.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for linear interpolation with two inputs, - * which is used in NEURAL TURING MACHINE. - * \f[ - * y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i] - * \f] - * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, - * \f$w\f$ is (batchSize x 1) weight vector, - * and \f$y\f$ is (batchSize x dataDim) output. - * - * The config file api is interpolation_layer. - */ - -class InterpolationLayer : public Layer { - protected: - /// weightLast = 1 - weight - MatrixPtr weightLast_; - MatrixPtr tmpMatrix; - - public: - explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {} - - ~InterpolationLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(interpolation, InterpolationLayer); - -bool InterpolationLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(3U, inputLayers_.size()); - - return true; -} - -void InterpolationLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inV2 = getInputValue(2); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - CHECK_EQ(dataDim, getSize()); - CHECK_EQ(dataDim, inV2->getWidth()); - CHECK_EQ(batchSize, inV1->getHeight()); - CHECK_EQ(batchSize, inV2->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_); - weightLast_->one(); - weightLast_->sub(*weightV); - - REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str()); - // outV = inV1 * weight + inV2 * weightLast - outV->addRowScale(0, *inV1, *weightV); - outV->addRowScale(0, *inV2, *weightLast_); -} - -void InterpolationLayer::backward(const UpdateCallback& callback) { - MatrixPtr outG = getOutputGrad(); - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inV2 = getInputValue(2); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - MatrixPtr inG2 = getInputGrad(2); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str()); - - if (inG0) { - Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_); - - // inG0 += outG .* (inV1 - inV2) - tmpMatrix->sub(*inV1, *inV2); - inG0->rowDotMul(0, *outG, *tmpMatrix); - } - - if (inG1) { - // inG1 += outG * weight - inG1->addRowScale(0, *outG, *weightV); - } - - if (inG2) { - // inG2 += outG * weightLast - inG2->addRowScale(0, *outG, *weightLast_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/L2DistanceLayer.cpp b/paddle/gserver/layers/L2DistanceLayer.cpp deleted file mode 100644 index c8cca3762cc3ecd6c04d7d2b804bc588c281bfb4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/L2DistanceLayer.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "L2DistanceLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(l2_distance, L2DistanceLayer); - -bool L2DistanceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and " - << "only two inputs."; - CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer " - << "is fixed to be 1."; - - return true; -} - -void L2DistanceLayer::forward(PassType passType) { - Layer::forward(passType); - - const auto inV1 = getInputValue(0); - const auto inV2 = getInputValue(1); - - CHECK(inV1 && inV2); - CHECK_EQ(inV1->getHeight(), inV2->getHeight()) - << "The height of two inputs of this layer must be the same."; - CHECK_EQ(inV1->getWidth(), inV2->getWidth()) - << "The width of two inputs of this layer must be the same."; - - int batchSize = inV1->getHeight(); - int output_dim = getSize(); - { - REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str()); - reserveOutput(batchSize, output_dim); - auto outV = getOutputValue(); - CHECK(outV) << "The output matrix should not be null."; - - Matrix::resizeOrCreate( - inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_); - - inputSub_->assign(*inV1); - inputSub_->sub(*inV2); - outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0); - outV->sqrt2(*outV); - } -} - -void L2DistanceLayer::backward(const UpdateCallback& callback) { - const auto outG = getOutputGrad(); - const auto outV = getOutputValue(); - CHECK(outG && outV); - - auto inGrad1 = getInputGrad(0); - auto inGrad2 = getInputGrad(1); - - { - REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str()); - - if (inGrad1 || inGrad2) { - outV->scalarDiv(*outV, 1.); - outV->dotMul(*outG, *outV); - } - - if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV); - - if (inGrad2) { - inputSub_->mulScalar(-1.); - inGrad2->addRowScale(0, *inputSub_, *outV); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h deleted file mode 100644 index 44e688e1377145845033d9d5cc3f31f5594a11f6..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/L2DistanceLayer.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief The layer calculates the l2 distance between two input vectors. - * \f[ - * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)} - * \f] - * - * - Input1: A vector (batchSize * dataDim) - * - Input2: A vector (batchSize * dataDim) - * - Output: A vector (batchSize * 1) - * - * The configuration api is: l2_distance_layer. - */ - -class L2DistanceLayer : public Layer { - public: - explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {} - ~L2DistanceLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - // Store the result of subtracting Input2 from Input1 in forward computation, - // which will be reused in backward computation. - MatrixPtr inputSub_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp deleted file mode 100644 index 32e2f4c9dd06e0ef7314b24719235c0be297961f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Layer.cpp +++ /dev/null @@ -1,410 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
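Note that the doc comment above drops the square on the per-dimension differences: as forward() is written (sumOfProducts of the cached difference with itself, then sqrt2), the output is the Euclidean distance d(x, y) = sqrt(sum_i (x_i - y_i)^2). Its gradient is (x - y) / d with respect to x and the negation with respect to y, which is why backward() first scales the incoming gradient by 1/d and then reuses inputSub_ with opposite signs for the two inputs. A standalone sketch of both directions, independent of the Paddle Matrix API:

#include <cmath>
#include <cstddef>
#include <vector>

// d(x, y) = sqrt(sum_i (x_i - y_i)^2), as computed by forward() above.
float l2Distance(const std::vector<float>& x, const std::vector<float>& y) {
  float sum = 0.0f;
  for (std::size_t i = 0; i < x.size(); ++i) {
    float diff = x[i] - y[i];
    sum += diff * diff;
  }
  return std::sqrt(sum);
}

// Gradients w.r.t. x and y given the upstream gradient outG (a scalar per
// sample, since the layer's output size is fixed to 1):
//   dL/dx_i =  outG * (x_i - y_i) / d
//   dL/dy_i = -outG * (x_i - y_i) / d
void l2DistanceGrad(const std::vector<float>& x, const std::vector<float>& y,
                    float outG, std::vector<float>& gx,
                    std::vector<float>& gy) {
  float d = l2Distance(x, y);
  if (d == 0.0f) return;  // gradient is undefined at d == 0; guard and skip
  for (std::size_t i = 0; i < x.size(); ++i) {
    float scaled = outG * (x[i] - y[i]) / d;
    gx[i] += scaled;
    gy[i] -= scaled;
  }
}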
*/ - -#include "paddle/utils/Util.h" - -#include "CostLayer.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Error.h" -#include "paddle/utils/Logging.h" - -#ifndef PADDLE_MOBILE_INFERENCE -#include "ValidationLayer.h" -#endif - -DEFINE_bool(log_error_clipping, false, "enable log error clipping or not"); - -namespace paddle { - -Layer::Layer(const LayerConfig& config, bool useGpu) - : config_(config), - useGpu_(useGpu), - deviceId_(CPU_DEVICE), - needSequenceInfo_(true) {} - -bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - if (useGpu_ && FLAGS_parallel_nn) { - /* gpu environment is specified by device property */ - deviceId_ = config_.device(); - if (deviceId_ < 0) { - useGpu_ = false; - } - } - - output_.deviceId = deviceId_; - - for (auto& inputConfig : config_.inputs()) { - std::string inputName = inputConfig.input_layer_name(); - LayerPtr inputLayer; - CHECK(mapGet(inputName, layerMap, &inputLayer)) - << "Cannot find input layer " << inputName << " for layer " - << getName(); - this->addPrev(inputLayer); - - inputLayer->addOutputArgument(deviceId_); - - if (inputConfig.has_input_parameter_name()) { - ParameterPtr parameter; - CHECK( - mapGet(inputConfig.input_parameter_name(), parameterMap, ¶meter)) - << "Cannot find input parameter " - << inputConfig.input_parameter_name() << " for layer " << getName(); - parameter->incShared(); - CHECK_EQ(parameter->getDeviceId(), getDeviceId()); - parameters_.push_back(parameter); - } else { - parameters_.push_back(nullptr); - } - - if (inputConfig.has_input_layer_argument()) { - inputArgument_.push_back(inputConfig.input_layer_argument()); - } else { - inputArgument_.push_back(""); - } - } - - if (config_.has_bias_parameter_name()) { - CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_)) - << "Cannot find bias parameter " << config_.bias_parameter_name() - << " for layer " << getName(); - biasParameter_->incShared(); - CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId()); - } - - /* specify the activation function according to the configuration */ - std::string action_type = config_.active_type(); - activation_.reset(ActivationFunction::create(action_type)); - CHECK(activation_); - - initNeedFlags(); - markInBackward_.assign(inputLayers_.size(), false); - - return true; -} - -ClassRegistrar Layer::registrar_; - -LayerPtr Layer::create(const LayerConfig& config) { - std::string type = config.type(); - -#ifndef PADDLE_MOBILE_INFERENCE - // NOTE: As following types have illegal character '-', - // they can not use REGISTER_LAYER to registrar. - // Besides, to fit with old training models, - // they can not use '_' instead. 
- if (type == "multi-class-cross-entropy") - return LayerPtr(new MultiClassCrossEntropy(config)); - else if (type == "rank-cost") - return LayerPtr(new RankingCost(config)); - else if (type == "auc-validation") - return LayerPtr(new AucValidation(config)); - else if (type == "pnpair-validation") - return LayerPtr(new PnpairValidation(config)); -#endif - - return LayerPtr(registrar_.createByType(config.type(), config)); -} - -void Layer::resetSpecifyOutput(Argument& output, - size_t height, - size_t width, - bool isValueClean, - bool isGradClean) { - SetDevice device(output.deviceId); - - Matrix::resizeOrCreate( - output.value, height, width, /* trans */ false, useGpu(output.deviceId)); - if (isValueClean) { - output.value->zeroMem(); - } - - if (passType_ != PASS_TEST && needGradient()) { - Matrix::resizeOrCreate( - output.grad, height, width, /* trans */ false, useGpu(output.deviceId)); - if (isGradClean) { - output.grad->zeroMem(); - } - } -} - -void Layer::resizeOutput(size_t height, size_t width) { - resetSpecifyOutput(output_, height, width, false, false); - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false); - } -} - -void Layer::reserveOutput(size_t height, size_t width) { - resetSpecifyOutput(output_, height, width, false, true); - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true); - } -} - -void Layer::resetOutput(size_t height, size_t width) { - resetSpecifyOutput(output_, height, width, true, true); - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true); - } -} - -void Layer::addOutputArgument(int deviceId) { - if (deviceId == deviceId_) { - output_.countIncrement(); - return; - } else { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == deviceId) { - outputOtherDevice_[i].countIncrement(); - return; - } - } - } - - Argument argu; - argu.deviceId = deviceId; - outputOtherDevice_.push_back(argu); - outputOtherDevice_.back().countIncrement(); -} - -void Layer::copyOutputToOtherDevice() { - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - SetDevice device(outputOtherDevice_[i].deviceId); - // If outputOtherDevice_[i].value is a CpuMatrix, - // the copyFrom is a synchronous interface. - // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent - // calculations are all on HPPL_STREAM_DEFAULT, - // copyFrom can be an asynchronous interface. 
- outputOtherDevice_[i].value->copyFrom(*getOutputValue(), - HPPL_STREAM_DEFAULT); - outputOtherDevice_[i].sequenceStartPositions = - output_.sequenceStartPositions; - outputOtherDevice_[i].subSequenceStartPositions = - output_.subSequenceStartPositions; - outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; - - outputOtherDevice_[i].notifyValueReady(); - } -} - -void Layer::waitInputValue() { - for (size_t i = 0; i != inputLayers_.size(); i++) { - if (inputLayers_[i]->getDeviceId() != deviceId_) { - getInput(i).waitValueReady(); - } - } -} - -void Layer::waitAndMergeOutputGrad() { - if (!output_.grad || !outputOtherDevice_.size()) { - return; - } - - for (size_t i = 0; i != outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].waitGradReady(); - } - - /* merge output grad */ - size_t i = 0; - if (!output_.getAllCount()) { - output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1); - hl_stream_synchronize(HPPL_STREAM_1); - - i++; - if (outputOtherDevice_.size() == 1) return; - } - - Matrix::resizeOrCreate(tmpGrad_, - output_.grad->getHeight(), - output_.grad->getWidth(), - /* trans */ false, - useGpu(output_.deviceId)); - - for (; i != outputOtherDevice_.size(); i++) { - tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1); - hl_stream_synchronize(HPPL_STREAM_1); - output_.grad->add(*tmpGrad_); - } -} - -void Layer::markAllInputGrad() { - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (!markInBackward_[i]) { - inputLayers_[i]->getOutput(deviceId_).notifyGradReady(); - } - markInBackward_[i] = false; - } -} - -void Layer::markInputGrad(int inputIndex) { - inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady(); - markInBackward_[inputIndex] = true; -} - -void Layer::zeroGrad() { - CHECK(output_.grad.get() != NULL); - output_.grad->zeroMem(); -} - -void Layer::initNeedFlags() { - auto initFlag = [this]( - bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) { - flag = false; - if (biasParameter_ && biasParameter_->hasType(type)) { - flag = true; - } - if (!flag) { - for (auto& para : parameters_) { - if (para && para->hasType(type)) { - flag = true; - break; - } - } - } - if (!flag) { - for (auto& layer : inputLayers_) { - if ((layer.get()->*flagQueryFunc)()) { - flag = true; - } - } - } - }; - initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT); -} - -void Layer::showOutputStats() { - MatrixPtr out = getOutputValue(); - if (!out) return; - if (!out->getElementCnt()) { - LOG(INFO) << "The number of output of " << config_.name() - << " is 0, skip to show the statistics"; - return; - } - MatrixPtr outSquare; - if (dynamic_cast(out.get())) { - GpuSparseMatrix* tmp = dynamic_cast(out.get()); - outSquare = std::make_shared(tmp->getHeight(), - tmp->getWidth(), - tmp->getElementCnt(), - tmp->getValueType(), - tmp->getFormat()); - } else { - outSquare = out->clone(); - } - outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - - real mean = outSquare->getSum() / out->getElementCnt(); - real min; - real max; - if (dynamic_cast(outSquare.get())) { - auto tmpMat = dynamic_cast(outSquare.get()); - min = tmpMat->getMin(); - max = tmpMat->getMax(); - tmpMat->square2(); - LOG(INFO) << "show statistics of [none zero values] in sparse matrix"; - } else { - min = outSquare->getMin(); - max = outSquare->getMax(); - outSquare->square2(); - } - real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean; - std = std > 0 ? 
std : 0; - LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean - << ", " - << "std=" << std << ", " - << "min=" << min << ", " - << "max=" << max; -} - -void Layer::forwardActivation() { - /* activation */ - auto status = activation_->forward(output_); - status.check(); - - /* dropout */ - if (config_.drop_rate() > 0) { - forwardDropOut(); - CHECK_NE(activation_->getName(), "softmax") - << "Softmax activation cannot be used with Dropout"; - } - - if (FLAGS_show_layer_stat) { - showOutputStats(); - } -} - -void Layer::backwardActivation() { - /* Do error clipping */ - if (config_.error_clipping_threshold() > 0.0f) { - if (FLAGS_log_error_clipping) { - VectorPtr outGradVec = Vector::create( - output_.grad->getData(), output_.grad->getElementCnt(), useGpu_); - real maxAbsGrad = outGradVec->getAbsMax(); - if (maxAbsGrad > config_.error_clipping_threshold()) { - real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize(); - LOG(INFO) << " layer=" << config_.name() << " need clipping," - << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad; - } - } - output_.grad->clip(-config_.error_clipping_threshold(), - config_.error_clipping_threshold()); - } - - /* Do dropout for delta*/ - if (config_.drop_rate() > 0 && passType_ != PASS_TEST) { - MatrixPtr oGrad = getOutputGrad(); - oGrad->dotMul(*oGrad, *dropOutMask_); - } - - auto status = activation_->backward(output_); - status.check(); -} - -void Layer::forwardDropOut() { - auto& outV = getOutputValue(); - - if (passType_ == PASS_TRAIN) { - // new dropOutMask_ if dropOutMask_ is null ptr - Matrix::resizeOrCreate(dropOutMask_, - outV->getHeight(), - outV->getWidth(), - false, - useGpu(deviceId_)); - dropOutMask_->randomizeUniform(); // generate a uniform random matrix - dropOutMask_->biggerThanScalar(config_.drop_rate()); // random mask - outV->dotMul(*outV, *dropOutMask_); // dropout - } else if (passType_ == PASS_GC) { - // only initialize once - if (!dropOutMask_) { - dropOutMask_ = Matrix::create( - outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_)); - // We use cpu matrix to generate mask so that the mask - // will be same for both gpu version and cpu version. - // This will help unittest to make sure they have same result. - MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth()); - tmpMask->randomizeUniform(); // generate a uniform random matrix - tmpMask->biggerThanScalar(config_.drop_rate()); // random mask - dropOutMask_->copyFrom(*tmpMask); - } - outV->dotMul(*outV, *dropOutMask_); - } else { // passType == PASS_TEST - outV->mulScalar(1.0 - config_.drop_rate()); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h deleted file mode 100644 index 13e20e8316323f9082a9615041584685853aa395..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Layer.h +++ /dev/null @@ -1,512 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
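forwardDropOut() above implements classic (non-inverted) dropout: during training each activation is kept with probability 1 - drop_rate and zeroed otherwise, while at test time activations are scaled by 1 - drop_rate so their expectation matches the training-time value (the PASS_GC branch only pins the mask to a CPU-generated one so gradient checks are reproducible on CPU and GPU). A minimal standalone sketch of the scheme, using std::mt19937 instead of the Matrix RNG:

#include <random>
#include <vector>

// Classic (non-inverted) dropout, mirroring forwardDropOut() above.
void dropout(std::vector<float>& activations, float dropRate, bool isTraining,
             std::mt19937& rng) {
  if (isTraining) {
    std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
    for (float& a : activations) {
      // Keep the unit with probability (1 - dropRate), zero it otherwise.
      a = (uniform(rng) > dropRate) ? a : 0.0f;
    }
  } else {
    // At test time, scale by the keep probability so the expected
    // activation matches the training-time expectation.
    for (float& a : activations) {
      a *= (1.0f - dropRate);
    }
  }
}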
*/ - -#pragma once - -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/function/Function.h" -#include "paddle/gserver/activations/ActivationFunction.h" -#include "paddle/math/CpuSparseMatrix.h" -#include "paddle/parameter/Argument.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/Weight.h" -#include "paddle/utils/ClassRegistrar.h" -#include "paddle/utils/Util.h" - -/// Macro for registering a layer type. -/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer); -#define REGISTER_LAYER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name( \ - []() { Layer::registrar_.registerClass<__class_name>(#__type_name); }) - -#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \ - static InitFunction __reg_type_##__type_name( \ - []() { Layer::registrar_.registerClass(#__type_name, createFunction); }) - -namespace paddle { - -class Layer; -typedef std::shared_ptr LayerPtr; -typedef std::map LayerMap; -class NeuralNetwork; - -/// layer state, used for RNN and LSTM layers -struct LayerState { - std::vector value; -}; -typedef std::shared_ptr LayerStatePtr; - -/// Paddle device ID, MKLDNN is -2, CPU is -1 -enum PADDLE_DEVICE_ID { - MKLDNN_DEVICE = -2, - CPU_DEVICE = -1, -}; - -/** - * @brief Base class for layer. - * Define necessary variables and functions for every layer. - */ -class Layer { - protected: - /// Layer config - LayerConfig config_; - /// whether to use GPU - bool useGpu_; - /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ... - int deviceId_; - /// Input layers - std::vector inputLayers_; - /// Argument of input layers - std::vector inputArgument_; - - /// Parameter for each input layer. - /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter. - std::vector parameters_; - - /// nullptr if bias is not needed. - ParameterPtr biasParameter_; - - /// Output - Argument output_; - /// Several outputs stored on different devices, used in 'parallel_nn' case, - /// and record them by deviceId_. - /// Also used in 'use_mkldnn' case. - std::vector outputOtherDevice_; - /// If there are several outputs, map them by each name. - /// MKLDNNLayer use it only to merge output grad - std::map outputMap_; - /// Used to merge grad on different devices. - MatrixPtr tmpGrad_; - - std::unique_ptr activation_; - - /// Current passType, PASS_TRAIN or PASS_TEST - PassType passType_; - - /// Random 0-1 matrix for dropOut - MatrixPtr dropOutMask_; - - /// Whether the layer need to compute gradient - bool needGradient_; - /// Whether the layer need to compute re-sequence information - bool needSequenceInfo_; - - /// Mark input grad in(true) or out(false) of backward function. - std::vector markInBackward_; - - /// Layer forward function - std::vector> forward_; - /// Layer backward function - std::vector> backward_; - - public: - /** - * Wait until all input value ready. - * Called before Layer::forward() function. - */ - virtual void waitInputValue(); - - /** - * Copy layer's output_ to other device. - * If output layer is in other device, called after Layer::forward() function. - */ - virtual void copyOutputToOtherDevice(); - - /** - * Wait until all output grad ready and merge them to output_.grad. - * Called before Layer::backward() function. - */ - virtual void waitAndMergeOutputGrad(); - - /** - * Notify previous layer the output grad ready. - * Called after Layer::backward() function. - */ - virtual void markAllInputGrad(); - - protected: - /** - * Create layer function. 
Function is called in forward or backward. - * \param function, Layer::forward_ or Layer::backward_ - * \param name, function name - * \param config, initialization configuration for the function - */ - void createFunction(std::vector>& function, - const std::string& name, - const FuncConfig& config) { - if (useGpu_) { - function.emplace_back( - FunctionBase::funcRegistrar_.createByType(name + "-GPU")); - } else { - function.emplace_back( - FunctionBase::funcRegistrar_.createByType(name + "-CPU")); - } - auto& func = function.back(); - func->init(config); - } - - /** - * Notify specified layer the output grad ready. - * Called in the backward function. - * If do mark input grad in the backward function, you should to ensure - * that all input grad will be marked in the backward function. - */ - void markInputGrad(int inputIndex); - - /** - * Get the argument of input layer. - */ - const Argument& getInput(size_t inputIndex) const { - return inputLayers_[inputIndex]->getOutput(deviceId_); - } - - /** - * Get the argument of input layer. - */ - const Argument& getInput(const Layer& inputLayer) const { - return inputLayer.getOutput(deviceId_); - } - - /** - * Get the argument of input layer with deviceId. - */ - const Argument& getInput(size_t inputIndex, int deviceId) const { - return inputLayers_[inputIndex]->getOutput(deviceId); - } - - /** - * Get the forward-input value. - */ - const MatrixPtr& getInputValue(int inputIndex) { - return inputLayers_[inputIndex]->getOutput(deviceId_).value; - } - - /** - * Get the forward-input value. - */ - const MatrixPtr& getInputValue(const Layer& inputLayer) { - return inputLayer.getOutput(deviceId_).value; - } - - /** - * Get the forward-input value with deviceId. - */ - const MatrixPtr& getInputValue(int inputIndex, int deviceId) { - return inputLayers_[inputIndex]->getOutput(deviceId).value; - } - - /** - * Get the forward-input grad. - */ - const MatrixPtr& getInputGrad(int inputIndex) { - return inputLayers_[inputIndex]->getOutput(deviceId_).grad; - } - - /** - * Get the forward-input grad. - */ - const MatrixPtr& getInputGrad(const Layer& inputLayer) { - return inputLayer.getOutput(deviceId_).grad; - } - - /** - * Get the forward-input grad. - */ - const MatrixPtr& getInputGrad(int inputIndex, int deviceId) { - return inputLayers_[inputIndex]->getOutput(deviceId).grad; - } - - /** - * Get the forward-input label. - */ - const IVectorPtr& getInputLabel(const Layer& inputLayer) { - return inputLayer.getOutput(deviceId_).ids; - } - - /** - * Change the size of output (value, grad). - * Reset to value zero if isValueClean = true, - * Reset to grad zero if isGradClean = true. - */ - void resetSpecifyOutput(Argument& output, - size_t height, - size_t width, - bool isValueClean, - bool isGradClean); - - /** - * Add output argument to other devices. - */ - void addOutputArgument(int deviceId); - - public: - explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu); - virtual ~Layer() {} - - /// Register a Layer - static ClassRegistrar registrar_; - - /** - * Get the flag whether layer need to compute gradient. - */ - bool needGradient() const { return needGradient_; } - - /** - * Set the flag whether layer need to compute gradient. - */ - void setNeedGradient(bool need) { needGradient_ = need; } - - /** - * Set the flag whether layer need to re-compute sequence information, - * which includes sequenceStartPositions or subSequenceStartPositions. 
- */ - void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; } - - /** - * Get layer's name. - */ - const std::string& getName() const { return config_.name(); } - - /** - * Get layer's type. - */ - const std::string& getType() const { return config_.type(); } - - /** - * Get layer's size. - */ - size_t getSize() const { return config_.size(); } - - /** - * Get layer's deviceId. - */ - int getDeviceId() const { return deviceId_; } - - /** - * Add the inputLayer. - */ - void addPrev(LayerPtr l) { inputLayers_.push_back(l); } - - /** - * Get the size of inputLayer[i]. - */ - const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; } - - /** - * Get the forward-output value. - */ - const MatrixPtr& getOutputValue() { return output_.value; } - - /** - * Get the forward-output label. - */ - const IVectorPtr& getOutputLabel() { return output_.ids; } - - /** - * Get the backward-Loss value. - */ - const MatrixPtr& getOutputGrad() { return output_.grad; } - /** - * If layer has multi-output, set output into outputMap_. - */ - void setOutput(const std::string& name, Argument* output) { - outputMap_[name] = output; - } - - /** - * Get the output map size, if layer has multi-output. - */ - size_t getOutputMapSize() { return outputMap_.size(); } - - /** - * Get the output based on layer's name. - */ - Argument& getOutput(const std::string& str = "") { - if (str == "") { - return output_; - } else { - auto output = outputMap_.find(str); - if (output != outputMap_.end()) { - return *output->second; - } else { - LOG(FATAL) << "No specific output " << str; - return *((Argument*)nullptr); - } - } - } - - /** - * Get the output based on deviceId. - */ - const Argument& getOutput(int deviceId) const { - if (deviceId == getDeviceId()) { - return output_; - } else { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == deviceId) { - return outputOtherDevice_[i]; - } - } - - LOG(FATAL) << "No specific device output "; - return *((Argument*)nullptr); - } - } - - /** - * Get layer's parameters. - */ - const std::vector& getParameters() { return parameters_; } - - /** - * Get layer's bias-parameters. - */ - const ParameterPtr& getBiasParameter() { return biasParameter_; } - - /** - * Create pointer of layer. - */ - static LayerPtr create(const LayerConfig& config); - - /** - * Resize the output matrix size. - */ - void resizeOutput(size_t height, size_t width); - - /** - * Resize the output matrix size, - * and reset value to zero. - */ - void reserveOutput(size_t height, size_t width); - - /** - * Resize the output matrix size, - * and reset value and grad to zero. - */ - void resetOutput(size_t height, size_t width); - - /** - * Clear the gradient of output. - */ - void zeroGrad(); - - /** - * Intialization. - * For example, adding input layers from layerMap and parameterMap. - */ - virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - /** - * Intialization for sub network if there has sub network. - * @param rootNetwork root network - * @param config model config - * @param parameterTypes parameter's type - * @param useGpu whether to use gpu or not - */ - virtual void initSubNetwork(NeuralNetwork* rootNetwork, - const ModelConfig& config, - const std::vector& parameterTypes, - bool useGpu) {} - - /** - * @brief Access SubNetwork Object. - * If subnetwork exists, then invoke callback with subnetwrk. - * @param callback if sub-network is exist, the callback is invoked. 
- */ - virtual void accessSubNetwork( - const std::function& callback) {} - - /** - * If use sparse row matrix as parameter, - * prefetch feature ids in input label. - */ - virtual void prefetch() {} - - /** - * Forward propagation. - * All inherited implementation should call Layer::foward() function. - */ - virtual void forward(PassType passType) { - passType_ = passType; - if (!inputLayers_.empty() && needSequenceInfo_) { - const Argument& input = getInput(0); - output_.sequenceStartPositions = input.sequenceStartPositions; - output_.subSequenceStartPositions = input.subSequenceStartPositions; - output_.cpuSequenceDims = input.cpuSequenceDims; - } - } - - /** - * Reset the internal state variables. - * Allocate them if they have not been allocated. - * This function need to called before Layer::forward() for generating - * sequence. - * - * This is used for sequence generation. When generating sequence, the - * calculation at current timestamp depends on the state from previous - * timestamp. The model needs to keep the information about the previous - * timestamp in the state variables. Layers such as RecurrentLayer, - * LstmLayer and ContextLayer have state variables. - */ - virtual void resetState() {} - - /** - * Set layer state. - */ - virtual void setState(LayerStatePtr state) {} - - /** - * Get layer state. - * @return A copy of internal state. - */ - virtual LayerStatePtr getState() { return nullptr; } - - /** - * Show output state. - */ - void showOutputStats(); - - /** - * Backward propagation. - * Should only be called after Layer::forward() function. - */ - virtual void backward(const UpdateCallback& callback = nullptr) = 0; - - /** - * One pass is finished. - */ - virtual void onPassEnd() {} - - protected: - /** - * Forward of activation function. - */ - void forwardActivation(); - /** - * Backward of activation function. - */ - void backwardActivation(); - /** - * Forward of dropOut. - */ - void forwardDropOut(); - /** - * Initilize the needGradient_ flag. - */ - void initNeedFlags(); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h deleted file mode 100644 index e802b701d0237bed44adc83273fe53c3e18c92ec..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LinearChainCRF.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/math/Matrix.h" - -namespace paddle { - -class LinearChainCRF { - public: - /** - * The size of para must be \f$(numClasses + 2) * numClasses\f$. - * The first numClasses values of para are for starting weights (\f$a\f$). - * The next numClasses values of para are for ending weights (\f$b\f$), - * The remaning values are for transition weights (\f$w\f$). 
- * - * The probability of a state sequence s of length \f$L\f$ is defined as: - * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} - * + \sum_{l=1}^L x_{s_l} - * + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ - * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over - * all possible - * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF. - */ - LinearChainCRF(int numClasses, real* para); - - /** - * Calculate the negative log likelihood of s given x. - * The size of x must be length * numClasses. Each consecutive numClasses - * values are the features for one time step. - */ - real forward(real* x, int* s, int length); - - /** - * Calculate the gradient with respect to x, a, b, and w. - * backward() can only be called after a corresponding call to forward() with - * the same x, s and length. - * The gradient with respect to a, b, and w will not be calculated if - * needWGrad is false. - * @note Please call getWGrad() and getXGrad() to get the gradient with - * respect to (a, b, w) and x respectively. - */ - void backward(real* x, int* s, int length, bool needWGrad); - - /** - * Find the most probable sequence given x. The result will be stored in s. - */ - void decode(real* x, int* s, int length); - - /* - * Return the gradient with respect to (a, b, w). It can only be called after - * a corresponding call to backward(). - */ - MatrixPtr getWGrad() { return matWGrad_; } - - /* - * Return the gradient with respect to x. It can only be called after a - * corresponding call to backward(). - */ - MatrixPtr getXGrad() { return matGrad_; } - - protected: - int numClasses_; - MatrixPtr a_; - MatrixPtr b_; - MatrixPtr w_; - MatrixPtr matWGrad_; - MatrixPtr da_; - MatrixPtr db_; - MatrixPtr dw_; - MatrixPtr ones_; - - MatrixPtr expX_; - MatrixPtr matGrad_; - MatrixPtr alpha_; - MatrixPtr beta_; - MatrixPtr maxX_; - MatrixPtr expW_; - - // track_(k,i) = j means that the best sequence at time k for class i comes - // from the sequence at time k-1 for class j - IVectorPtr track_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h deleted file mode 100644 index 5b325a0deb0e9d8df241175159321e52f527f6c4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LinearChainCTC.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
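The constructor contract above fixes the layout of the parameter block: for numClasses = C it holds C starting weights a, then C ending weights b, then the C x C transition weights w, i.e. (C + 2) * C values in total. The sketch below slices such a block and evaluates the unnormalized log-score of one label sequence under the documented probability formula; it assumes w is stored row-major and indexed as w[from][to], which the header does not spell out:

#include <cassert>
#include <vector>

// Unnormalized log-score of a label sequence s under the layout above:
// para = [a (C values), b (C values), w (C x C, assumed row-major)].
// x is the L x C emission matrix passed to forward(), flattened row-major.
float crfSequenceScore(const std::vector<float>& para,
                       const std::vector<float>& x,
                       const std::vector<int>& s, int numClasses) {
  const int C = numClasses;
  assert(static_cast<int>(para.size()) == (C + 2) * C);
  const float* a = para.data();          // starting weights
  const float* b = para.data() + C;      // ending weights
  const float* w = para.data() + 2 * C;  // transitions, w[from * C + to]

  const int L = static_cast<int>(s.size());
  float score = a[s[0]] + b[s[L - 1]];
  for (int l = 0; l < L; ++l) {
    score += x[l * C + s[l]];            // emission term x_{l, s_l}
    if (l > 0) {
      score += w[s[l - 1] * C + s[l]];   // transition term w_{s_{l-1}, s_l}
    }
  }
  // forward() returns the negative log-likelihood, i.e. -(score - log Z),
  // where Z sums exp(score) over all C^L possible label sequences.
  return score;
}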
*/ - -#pragma once - -#include -#include "paddle/math/Matrix.h" - -namespace paddle { - -class LinearChainCTC { - public: - LinearChainCTC(int numClasses, bool normByTimes); - - // Calculate the negative log probability as loss - real forward(real* softmaxSeq, - int softmaxSeqLen, - int* labelSeq, - int labelSeqLen); - - // calculate the gradient - void backward(real* softmaxSeq, - real* softmaxSeqGrad, - int* labelSeq, - int labelSeqLen); - - protected: - int numClasses_, blank_, totalSegments_, totalTime_; - bool normByTimes_; - bool isInvalid_; - - MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_; - - real logProb_; - - void segmentRange(int& start, int& end, int time); -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/gserver/layers/LstmCompute.cpp deleted file mode 100644 index ea30f6d6b1b8586569407af6baac2c14034e709c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmCompute.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "LstmCompute.h" -#include "hl_recurrent_apply.cuh" -#include "paddle/utils/Util.h" - -namespace paddle { - -void LstmCompute::init(LayerConfig &config) { - activeNode_ = hlActiveType(config.active_type()); - activeGate_ = hlActiveType(config.active_gate_type()); - activeState_ = hlActiveType(config.active_state_type()); -} - -template <> -void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) { - hl_cpu_lstm_forward(hppl::forward::lstm(), - value, - frameSize, - activeNode_, - activeGate_, - activeState_); -} - -template <> -void LstmCompute::backwardOneSequence<0>(hl_lstm_value value, - hl_lstm_grad grad, - int frameSize) { - hl_cpu_lstm_backward(hppl::backward::lstm(), - value, - grad, - frameSize, - activeNode_, - activeGate_, - activeState_); -} - -template <> -void LstmCompute::forwardBatch<0>(hl_lstm_value value, - int frameSize, - int batchSize) { - for (int b = 0; b < batchSize; b++) { - forwardOneSequence<0>(value, frameSize); - - value.gateValue += frameSize * 4; - value.stateValue += frameSize; - value.stateActiveValue += frameSize; - value.outputValue += frameSize; - if (value.prevStateValue) { - value.prevStateValue += frameSize; - } - } -} - -template <> -void LstmCompute::backwardBatch<0>(hl_lstm_value value, - hl_lstm_grad grad, - int frameSize, - int batchSize) { - for (int b = 0; b < batchSize; b++) { - backwardOneSequence<0>(value, grad, frameSize); - - value.gateValue += frameSize * 4; - value.stateValue += frameSize; - value.stateActiveValue += frameSize; - value.outputValue += frameSize; - if (value.prevStateValue) { - value.prevStateValue += frameSize; - } - - grad.gateGrad += frameSize * 4; - grad.stateGrad += frameSize; - grad.stateActiveGrad += frameSize; - grad.outputGrad += frameSize; - if (grad.prevStateGrad) { - grad.prevStateGrad += frameSize; - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmCompute.h 
b/paddle/gserver/layers/LstmCompute.h deleted file mode 100644 index 80fb01cd1885151c8d62a4b5dfdb4ba08327926d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmCompute.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ModelConfig.pb.h" -#include "hl_gpu.h" -#include "paddle/utils/Common.h" - -namespace paddle { - -class LstmCompute { - public: - void init(LayerConfig &config); - - /** - * LstmLayer batch compute API (forwardBatch, backwardBatch). - * If use batch compute api, lstm value(and grad) need to be batch structure. - * Compute order: - * forwardBatch: for 0 <= id < numBatch - * backwardBatch: for numBatch > id >= 0 - */ - template - void forwardBatch(hl_lstm_value value, int frameSize, int batchSize); - - template - void backwardBatch(hl_lstm_value value, - hl_lstm_grad grad, - int frameSize, - int batchSize); - - /** - * LstmLayer sequence compute API (forwardOneSequence, backwardOneSequence). - * Compute order(for each sequence): - * forwardOneSequence: - * if (!reversed) for 0 <= seqId < seqLength - * if (reversed) for seqLength > seqId >= 0 - * backwardOneSequence: - * if (!reversed) for seqLength > seqId >= 0 - * if (reversed) for 0 <= seqId < seqLength - */ - template - void forwardOneSequence(hl_lstm_value value, int frameSize); - template - void backwardOneSequence(hl_lstm_value value, - hl_lstm_grad grad, - int frameSize); - - public: - hl_activation_mode_t activeNode_; - hl_activation_mode_t activeGate_; - hl_activation_mode_t activeState_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp deleted file mode 100644 index f65ae6a3e69cb5f0a7e6073d17bfd0beae91cd5d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmLayer.cpp +++ /dev/null @@ -1,805 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
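In the CPU specializations above, each frame owns frameSize * 4 gate values (the four concatenated gate blocks) plus frameSize state, state-activation and output values, so batch traversal reduces to pointer arithmetic over flat buffers. A simplified sketch of that stride pattern; FrameView is a hypothetical helper, not a Paddle type, and the gate block order follows the (input, input gate, forget gate, output gate) layout described for LstmLayer below:

#include <cstddef>

// Hypothetical, simplified view of one LSTM frame inside the flat buffers
// used by LstmCompute: gates hold 4 * frameSize values per frame,
// state and output hold frameSize values per frame.
struct FrameView {
  float* gate;    // [input, inputGate, forgetGate, outputGate] blocks
  float* state;
  float* output;
};

// Advance the view to the next frame, mirroring how forwardBatch<0>() above
// bumps value.gateValue by frameSize * 4 and the other pointers by frameSize.
inline void nextFrame(FrameView& v, std::size_t frameSize) {
  v.gate += frameSize * 4;
  v.state += frameSize;
  v.output += frameSize;
}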
*/ - -#include "LstmLayer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Stat.h" - -DECLARE_bool(prev_batch_state); - -namespace paddle { - -REGISTER_LAYER(lstmemory, LstmLayer); - -bool LstmLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize()); - CHECK_EQ(getSize() * 7, biasParameter_->getSize()); - weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, getSize() * 7, biasParameter_)); - if (bias_->getW()) { - localBias_ = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - checkIg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkFg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkOg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - localBias_->setData(bias_->getW()->getData()); - checkIg_->setData(bias_->getW()->getData() + getSize() * 4); - checkFg_->setData(bias_->getW()->getData() + getSize() * 5); - checkOg_->setData(bias_->getW()->getData() + getSize() * 6); - } - - if (bias_->getWGrad()) { - localBiasGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - checkIgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkFgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkOgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - localBiasGrad_->setData(bias_->getWGrad()->getData()); - checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4); - checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5); - checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6); - } - } else { - LOG(FATAL) << "Bias should be here."; - } - reversed_ = config_.reversed(); - - // create IdentityActivation for using drop_rate - activation_.reset(ActivationFunction::create("")); - - LstmCompute::init(config_); - useBatch_ = true; - useSeqParallel_ = false; - if (useGpu_ && (getSize() == 32 || getSize() == 64)) { - useSeqParallel_ = true; - } - - return true; -} - -void LstmLayer::resetState() { - CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer"; - Matrix::resizeOrCreate( - prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); - Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_); - prevOutput_->resize(0, getSize()); - prevState_->resize(0, getSize()); - if (FLAGS_prev_batch_state) { - useBatch_ = true; - } else { - useBatch_ = false; - } -} - -void LstmLayer::setState(LayerStatePtr state) { - CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state"; - prevOutput_->resize(state->value[0]->getHeight(), - state->value[0]->getWidth()); - prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth()); - prevOutput_->copyFrom(*(state->value[0])); - prevState_->copyFrom(*(state->value[1])); -} - -LayerStatePtr LstmLayer::getState() { - LayerStatePtr res = std::make_shared(); - if (prevOutput_->getHeight() && prevOutput_->getWidth()) { - res->value.push_back(prevOutput_->clone(0, 
0, useGpu_)); - res->value[0]->copyFrom(*prevOutput_); - res->value.push_back(prevState_->clone(0, 0, useGpu_)); - res->value[1]->copyFrom(*prevState_); - } else { - MatrixPtr output = - Matrix::create(1, getSize(), /* trans= */ false, useGpu_); - MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_); - output->resize(0, getSize()); - state->resize(0, getSize()); - res->value.push_back(output); - res->value.push_back(state); - } - return res; -} - -void LstmLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str()); - Layer::forward(passType); - - const Argument &input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - resetOutput(batchSize, getSize()); - CHECK_EQ(getSize() * 4, input.value->getWidth()); - size_t numSequences = input.getNumSequences(); - const int *starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, - getSize() * 4, - /* trans= */ false, - useGpu_); - if (prevOutput_) { - size_t prevNumSeq = useBatch_ ? numSequences : 1; - if (prevOutput_->getHeight() == 0) { - prevOutput_->resize(prevNumSeq, getSize()); - prevState_->resize(prevNumSeq, getSize()); - prevOutput_->zeroMem(); - prevState_->zeroMem(); - } else { - CHECK_EQ(prevOutput_->getHeight(), prevNumSeq) - << "the number of sequences must be the same"; - } - Matrix::resizeOrCreate(totalState_, - prevState_->getHeight() + batchSize, - getSize(), - /*trans*/ false, - useGpu_); - state_.value = Matrix::create(nullptr, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - state_.value->setData(totalState_->getData() + - prevState_->getHeight() * getSize()); - } else { - Matrix::resizeOrCreate(state_.value, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - } - Matrix::resizeOrCreate(preOutput_.value, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - - if (!useBatch_) { - forwardSequence(batchSize, numSequences, starts, input.value); - } else { - if (!useSeqParallel_) { - forwardBatch(batchSize, numSequences, starts, input.value); - } else { - const int *starts = input.sequenceStartPositions->getData(useGpu_); - forwardSeqParallel(batchSize, numSequences, starts, input.value); - } - } - /* activation */ { forwardActivation(); } -} - -void LstmLayer::backward(const UpdateCallback &callback) { - REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str()); - /* Do derivation */ { backwardActivation(); } - - const Argument &input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - - Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, - getSize() * 4, - /* trans= */ false, - useGpu_); - Matrix::resizeOrCreate(state_.grad, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - Matrix::resizeOrCreate(preOutput_.grad, - /* height= */ batchSize, - getSize(), - /* trans= */ false, - useGpu_); - state_.grad->zero(); - - const int *starts = input.sequenceStartPositions->getData(false); - if (!useBatch_) { - backwardSequence(batchSize, numSequences, starts, input.grad); - } else { - if (!useSeqParallel_) { - backwardBatch(batchSize, numSequences, starts, input.grad); - } else { - const int *starts = input.sequenceStartPositions->getData(useGpu_); - backwardSeqParallel(batchSize, numSequences, starts, input.grad); - } - } - - if (bias_) { - 
bias_->getParameterPtr()->incUpdate(callback); - } - weight_->getParameterPtr()->incUpdate(callback); -} - -void LstmLayer::forwardSequence(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str()); - gate_.value->assign(*inputValue); - if (bias_) { - gate_.value->addBias(*localBias_, 1); - } - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - lstmValue.gateValue = gate_.value->getData(); - lstmValue.stateValue = state_.value->getData(); - lstmValue.stateActiveValue = preOutput_.value->getData(); - lstmValue.outputValue = output_.value->getData(); - lstmValue.prevStateValue = nullptr; - if (reversed_) { - lstmValue.gateValue += (batchSize - 1) * getSize() * 4; - lstmValue.stateValue += (batchSize - 1) * getSize(); - lstmValue.stateActiveValue += (batchSize - 1) * getSize(); - lstmValue.outputValue += (batchSize - 1) * getSize(); - } - - auto nextFrame = [&lstmValue](bool reversed, int frameSize) { - lstmValue.prevStateValue = lstmValue.stateValue; - if (!reversed) { - lstmValue.gateValue += frameSize * 4; - lstmValue.stateValue += frameSize; - lstmValue.stateActiveValue += frameSize; - lstmValue.outputValue += frameSize; - } else { - lstmValue.gateValue -= frameSize * 4; - lstmValue.stateValue -= frameSize; - lstmValue.stateActiveValue -= frameSize; - lstmValue.outputValue -= frameSize; - } - }; - - MatrixPtr frameGate = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - if (!reversed_) { - if (prevState_) { - lstmValue.prevStateValue = prevState_->getData(); - } - if (prevOutput_) { - frameGate->setData(lstmValue.gateValue); - frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1); - } - } - AsyncGpuBlock asyncGpuBlock; - for (size_t n = 0; n < numSequences; ++n) { - int length; - if (!reversed_) { - length = starts[n + 1] - starts[n]; - } else { - length = starts[numSequences - n] - starts[numSequences - n - 1]; - } - for (int l = 0; l < length; ++l) { - if (useGpu_) { - LstmCompute::forwardOneSequence<1>(lstmValue, getSize()); - } else { - LstmCompute::forwardOneSequence<0>(lstmValue, getSize()); - } - - if (l != length - 1) { - frameOutput->setData(lstmValue.outputValue); - nextFrame(reversed_, getSize()); - frameGate->setData(lstmValue.gateValue); - frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); - } - } - if (n != numSequences - 1) { - frameOutput->setData(lstmValue.outputValue); - nextFrame(reversed_, getSize()); - frameGate->setData(lstmValue.gateValue); - if (!reversed_) { - if (!prevState_) lstmValue.prevStateValue = nullptr; - if (prevOutput_) { - frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); - } - } else { - lstmValue.prevStateValue = nullptr; - } - } - } - - if (!reversed_) { - if (prevState_) { - prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1)); - } - if (prevOutput_) { - prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1)); - } - } -} - -void LstmLayer::backwardSequence(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str()); - MatrixPtr weightT = weight_->getW()->getTranspose(); - - hl_lstm_value lstmValue; - hl_lstm_grad lstmGrad; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg 
= checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - lstmValue.gateValue = gate_.value->getData(); - lstmValue.stateValue = state_.value->getData(); - lstmValue.stateActiveValue = preOutput_.value->getData(); - lstmValue.outputValue = nullptr; - - if (bias_->getWGrad()) { - lstmGrad.checkIgGrad = checkIgGrad_->getData(); - lstmGrad.checkFgGrad = checkFgGrad_->getData(); - lstmGrad.checkOgGrad = checkOgGrad_->getData(); - } else { - lstmGrad.checkIgGrad = nullptr; - lstmGrad.checkFgGrad = nullptr; - lstmGrad.checkOgGrad = nullptr; - } - lstmGrad.gateGrad = gate_.grad->getData(); - lstmGrad.stateGrad = state_.grad->getData(); - lstmGrad.stateActiveGrad = nullptr; - lstmGrad.outputGrad = output_.grad->getData(); - - if (!reversed_) { - lstmValue.gateValue += (batchSize - 1) * getSize() * 4; - lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4; - lstmValue.stateValue += (batchSize - 1) * getSize(); - lstmGrad.stateGrad += (batchSize - 1) * getSize(); - lstmValue.stateActiveValue += (batchSize - 1) * getSize(); - lstmGrad.outputGrad += (batchSize - 1) * getSize(); - lstmValue.prevStateValue = lstmValue.stateValue - getSize(); - lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize(); - } else { - lstmValue.prevStateValue = lstmValue.stateValue + getSize(); - lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize(); - } - - auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) { - if (reversed) { - lstmValue.gateValue += frameSize * 4; - lstmGrad.gateGrad += frameSize * 4; - lstmValue.stateValue += frameSize; - lstmGrad.stateGrad += frameSize; - lstmValue.stateActiveValue += frameSize; - lstmGrad.outputGrad += frameSize; - lstmValue.prevStateValue = lstmValue.stateValue + frameSize; - lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize; - } else { - lstmValue.gateValue -= frameSize * 4; - lstmGrad.gateGrad -= frameSize * 4; - lstmValue.stateValue -= frameSize; - lstmGrad.stateGrad -= frameSize; - lstmValue.stateActiveValue -= frameSize; - lstmGrad.outputGrad -= frameSize; - lstmValue.prevStateValue = lstmValue.stateValue - frameSize; - lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize; - } - }; - - MatrixPtr frameGate = Matrix::create(nullptr, - /* height= */ 1, - getSize() * 4, - /* trans= */ false, - useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - { - AsyncGpuBlock asyncGpuBlock; - for (size_t n = 0; n < numSequences; ++n) { - int length; - int start; - if (reversed_) { - length = starts[n + 1] - starts[n]; - start = starts[n]; - } else { - length = starts[numSequences - n] - starts[numSequences - n - 1]; - start = starts[numSequences - n - 1]; - } - for (int l = 0; l < length; ++l) { - if (l == length - 1) { - lstmValue.prevStateValue = nullptr; - lstmGrad.prevStateGrad = nullptr; - } - if (useGpu_) { - LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize()); - } else { - LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize()); - } - if (l != length - 1) { - frameGate->setData(lstmGrad.gateGrad); - nextFrame(reversed_, getSize()); - frameOutput->setData(lstmGrad.outputGrad); - frameOutput->mul(*frameGate, *weightT, 1, 1); - } else { - nextFrame(reversed_, getSize()); - } - } - - if (weight_->getWGrad()) { - if (!reversed_) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start, length - 1)->getTranspose(), - *gate_.grad->subMatrix(start + 1, length - 1), - 1, - 1); - } else { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start 
+ 1, length - 1)->getTranspose(), - *gate_.grad->subMatrix(start, length - 1), - 1, - 1); - } - } - } - } - - if (inputGrad) { - inputGrad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, 1); - } -} - -void LstmLayer::forwardBatch(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str()); - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - - if (!batchValue_) { - batchValue_.reset(new SequenceToBatch(useGpu_)); - } - batchValue_->resizeOrCreateBatch( - batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false); - - batchValue_->resizeOrCreate(*output_.value); - batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); - if (bias_) { - gate_.value->addBias(*localBias_, 1); - } - - { - int numBatch = batchValue_->getNumBatch(); - int batchSize = 0; - AsyncGpuBlock asyncGpuBlock; - if (prevState_) { - lstmValue.prevStateValue = totalState_->getData(); - } else { - lstmValue.prevStateValue = nullptr; - } - for (int n = 0; n < numBatch; n++) { - MatrixPtr outputValue = batchValue_->getBatchValue(n); - MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n); - batchSize = outputValue->getHeight(); - - if (n != 0) { - MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); - gateValue->mul(*batch1, *weight_->getW(), 1, 1); - } else if (prevOutput_) { - Matrix::resizeOrCreate(prevBatchOutput2_, - gateValue->getHeight(), - getSize(), - false, - useGpu_); - batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); - gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1); - - batchValue_->prevOutput2Batch(*prevState_, - *totalState_->subMatrix(0, numSequences)); - } - - lstmValue.gateValue = gateValue->getData(); - lstmValue.outputValue = outputValue->getData(); - lstmValue.stateValue = - batchValue_->getBatchValue(*state_.value, n)->getData(); - lstmValue.stateActiveValue = - batchValue_->getBatchValue(*preOutput_.value, n)->getData(); - { - if (useGpu_) { - LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize); - } else { - LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize); - } - } - lstmValue.prevStateValue = lstmValue.stateValue; - } - } - { - REGISTER_TIMER_INFO("batchToSeq", getName().c_str()); - batchValue_->copyBackSeq(*output_.value); - } - if (prevOutput_) { - getPrevBatchOutput(numSequences); - getPrevBatchState(numSequences); - } -} - -void LstmLayer::getPrevBatchOutput(size_t numSequences) { - prevOutput_->resize(numSequences, getSize()); - batchValue_->getSeqOutputFromBatch(*prevOutput_, - *batchValue_->getBatchValue()); -} - -void LstmLayer::getPrevBatchState(size_t numSequences) { - prevState_->resize(numSequences, getSize()); - batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value); -} - -void LstmLayer::backwardBatch(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str()); - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - - hl_lstm_grad lstmGrad; - lstmGrad.stateActiveGrad = preOutput_.grad->getData(); - - if (bias_->getWGrad()) { - lstmGrad.checkIgGrad = checkIgGrad_->getData(); - lstmGrad.checkFgGrad = checkFgGrad_->getData(); - lstmGrad.checkOgGrad = 
checkOgGrad_->getData(); - } else { - lstmGrad.checkIgGrad = nullptr; - lstmGrad.checkFgGrad = nullptr; - lstmGrad.checkOgGrad = nullptr; - } - - if (!batchGrad_) { - batchGrad_.reset(new SequenceToBatch(useGpu_)); - } - batchGrad_->shareIndexWith(*batchValue_); - - { - REGISTER_TIMER_INFO("seqToBatch", getName().c_str()); - batchGrad_->copyFromSeq(*output_.grad); - } - - { - MatrixPtr weightT = weight_->getW()->getTranspose(); - int numBatch = batchGrad_->getNumBatch(); - int batchSize = 0; - AsyncGpuBlock asyncGpuBlock; - for (int n = (int)numBatch - 1; n >= 0; n--) { - MatrixPtr outputGrad = batchGrad_->getBatchValue(n); - MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n); - - lstmValue.gateValue = - batchGrad_->getBatchValue(*gate_.value, n)->getData(); - lstmValue.stateValue = - batchGrad_->getBatchValue(*state_.value, n)->getData(); - lstmValue.stateActiveValue = - batchGrad_->getBatchValue(*preOutput_.value, n)->getData(); - lstmGrad.stateGrad = - batchGrad_->getBatchValue(*state_.grad, n)->getData(); - lstmGrad.gateGrad = gateGrad->getData(); - lstmGrad.outputGrad = outputGrad->getData(); - { - batchSize = outputGrad->getHeight(); - if (n != 0) { - lstmValue.prevStateValue = - batchGrad_->getBatchValue(*state_.value, n - 1)->getData(); - lstmGrad.prevStateGrad = - batchGrad_->getBatchValue(*state_.grad, n - 1)->getData(); - } else { - if (prevState_) { - lstmValue.prevStateValue = totalState_->getData(); - lstmGrad.prevStateGrad = nullptr; - } else { - lstmValue.prevStateValue = nullptr; - lstmGrad.prevStateGrad = nullptr; - } - } - if (useGpu_) { - LstmCompute::backwardBatch<1>( - lstmValue, lstmGrad, getSize(), batchSize); - } else { - LstmCompute::backwardBatch<0>( - lstmValue, lstmGrad, getSize(), batchSize); - } - } - - if (n != 0) { - MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize); - tmp->mul(*gateGrad, *weightT, 1, 1); - } - - if (n != 0 && weight_->getWGrad()) { - /* backward weight */ - MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); - weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1); - } else if (prevOutput_ && weight_->getWGrad()) { - weight_->getWGrad()->mul( - *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1); - } - } - } - - if (inputGrad) { - batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1); - } -} - -void LstmLayer::forwardSeqParallel(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue) { - REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str()); - gate_.value->assign(*inputValue); - if (bias_) { - gate_.value->addBias(*localBias_, /* scale */ 1); - } - - real *gateValue = gate_.value->getData(); - real *stateValue = state_.value->getData(); - real *outputValue = output_.value->getData(); - real *preOutputValue = preOutput_.value->getData(); - real *checkIg = checkIg_->getData(); - real *checkFg = checkFg_->getData(); - real *checkOg = checkOg_->getData(); - real *weight = weight_->getW()->getData(); - hl_lstm_parallel_forward(gateValue, - stateValue, - preOutputValue, - outputValue, - checkIg, - checkFg, - checkOg, - weight, - starts, - getSize(), - numSequences, - reversed_, - activeNode_, - activeGate_, - activeState_); -} - -void LstmLayer::backwardSeqParallel(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad) { - REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str()); - real *gateValue = 
gate_.value->getData(); - real *gateGrad = gate_.grad->getData(); - real *stateValue = state_.value->getData(); - real *stateGrad = state_.grad->getData(); - real *preOutputValue = preOutput_.value->getData(); - real *preOutputGrad = preOutput_.grad->getData(); - real *checkIg = checkIg_->getData(); - real *checkFg = checkFg_->getData(); - real *checkOg = checkOg_->getData(); - real *outputGrad = output_.grad->getData(); - real *weight = weight_->getW()->getData(); - - real *checkIgGrad; - real *checkFgGrad; - real *checkOgGrad; - if (bias_->getWGrad()) { - checkIgGrad = checkIgGrad_->getData(); - checkFgGrad = checkFgGrad_->getData(); - checkOgGrad = checkOgGrad_->getData(); - } else { - checkIgGrad = nullptr; - checkFgGrad = nullptr; - checkOgGrad = nullptr; - } - - hl_lstm_parallel_backward_data(gateValue, - gateGrad, - stateValue, - stateGrad, - preOutputValue, - preOutputGrad, - outputGrad, - checkIg, - checkIgGrad, - checkFg, - checkFgGrad, - checkOg, - checkOgGrad, - weight, - starts, - getSize(), - numSequences, - reversed_, - activeNode_, - activeGate_, - activeState_); - - if (inputGrad) { - inputGrad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, 1); - } - - real *outputValue = output_.value->getData(); - if (weight_->getWGrad()) { - real *weightGrad = weight_->getWGrad()->getData(); - hl_lstm_parallel_backward_weight(weightGrad, - outputValue, - gateGrad, - starts, - getSize(), - batchSize, - numSequences, - reversed_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h deleted file mode 100644 index 76dfe8146bf67a0b7b4fd4835851fae6ac38d80f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmLayer.h +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "LstmCompute.h" -#include "SequenceToBatch.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" -namespace paddle { - -/** - * @brief LstmLayer takes 1 input layer with size * 4. 
- * Input layer is diveded into 4 equal parts: - * (input_s, input_ig, input_fg, input_og) - * - * For each sequence [start, end] it performs the following computation: - * @code - * output_{i} = actState(state_{i}) * actGate(outputGate_{i}) - * state_{i} = actInput(input_s_{i} + bias_s + - * output_{i-1} * recurrIW) * actGate(inputGate_{i}) + - * actGate(forgetGate_{i}) * state_{i-1} - * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW + - * state_{i-1} * inputCheck - * ouputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW + - * state_{i} * outputCheck - * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW + - * state_{i-1} * forgetCheck - * @endcode - * - * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW) - * - baisParameter consists of - * (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck) - * - * - actInput is defined by config active_type. - * - actState is defined by config active_state_type. - * - actGate is defined by config actvie_gate_type. - * - * There are two ways to compute, namely one sequence by one sequence or - * one batch by one batch. By default and no setting pre_batch_state true, - * it will compute batch by batch. - * - * The formula in the paper is as follows: - * \f[ - * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\ - * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\ - * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\ - * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\ - * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\ - * h_t = o_t tanh(c_t) - * \f] - * - * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ - * operations on the input sequence were NOT included in LstmLayer. So - * users should use fc_layer or mixed_layer before lstm_later. - * - * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$. - * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$. - */ - -class LstmLayer : public Layer, public LstmCompute { - public: - explicit LstmLayer(const LayerConfig &config) : Layer(config) {} - - bool init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback &callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - - protected: - /** - * @brief Compute lstm forward one sequence by one sequence. - * @param batchSize The batchSize is not equal to the batch_size in - * the config file. It is the total words number of all samples - * in this forward batch. - * @param numSequences The sample number. It is equal to the batch_size - * in the config file. - * @param starts Each start position of each samples. - * @param inputValue The input values. - */ - void forwardSequence(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue); - /** - * Compute lstm backward one sequence by one sequence. - */ - void backwardSequence(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad); - - /** - * Compute lstm forward one batch by one batch. The batch value is - * reorganized by SequenceToBatch class. The batch output value will - * be convert into sequence value after finishing forward. Here, one - * batch contains one word of each sample. If the length of each sample - * is not equality, the batch will not pads zero and contains less words. 
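For reference, the recurrence quoted in the header comment above can be written as a minimal, self-contained C++ sketch (illustrative only; lstmStep and its parameter names are hypothetical and are not part of the deleted sources). It computes one step for a single cell and takes the W_x*x_t projections as inputs, since, as noted above, those are expected to come from a preceding fc_layer or mixed_layer:

#include <cmath>

static inline float sigmoidf(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// One step for a single LSTM cell. xNode/xIg/xFg/xOg are the pre-projected
// inputs (the W_x* x_t terms), matching the (input_s, input_ig, input_fg,
// input_og) split described in the comment above.
void lstmStep(float xNode, float xIg, float xFg, float xOg,
              float hPrev, float cPrev,
              float wHc, float wHi, float wHf, float wHo,  // recurrent weights
              float bC, float bI, float bF, float bO,      // biases
              float wCi, float wCf, float wCo,             // peephole weights
              float* hOut, float* cOut) {
  float node = std::tanh(xNode + wHc * hPrev + bC);             // \tilde{c_t}
  float ig   = sigmoidf(xIg + wHi * hPrev + wCi * cPrev + bI);  // i_t
  float fg   = sigmoidf(xFg + wHf * hPrev + wCf * cPrev + bF);  // f_t
  float c    = fg * cPrev + ig * node;                          // c_t
  float og   = sigmoidf(xOg + wHo * hPrev + wCo * c + bO);      // o_t
  *hOut = og * std::tanh(c);                                    // h_t = o_t tanh(c_t)
  *cOut = c;
}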
- * The total batch numbers are the max length of the sequence. The details - * can refer to SequenceToBatch class. On GPU mode, it will launch GPU - * kernel for loop. - * - * @code - * for (int i = 0; i < numBatch(max_sequence_length); ++i) { - * compute one batch. - * } - * @endcode - */ - void forwardBatch(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue); - /** - * Compute lstm backward one batch by one batch. - */ - void backwardBatch(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad); - - /** - * This function only supports GPU. It not need to reorganize input into - * batch value. It will launch one kernel to parallelly compute forward - * propagation in sequence level. - */ - void forwardSeqParallel(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputValue); - /** - * Backward propagation corresponding to forwardSeqParallel. - */ - void backwardSeqParallel(int batchSize, - size_t numSequences, - const int *starts, - MatrixPtr inputGrad); - /** - * This function is used for sequence generation and get output after - * forwardBatch. - */ - void getPrevBatchOutput(size_t numSequences); - /** - * This function is used for sequence generation and get state after - * forwardBatch. - */ - void getPrevBatchState(size_t numSequences); - - protected: - /// Learned parameters, shape: (size, 4*size). - /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$. - std::unique_ptr weight_; - /// Learned bias parameter, shape: (1, 7 * size). - /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, - /// W_{co}\f$. - std::unique_ptr bias_; - /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$. - MatrixPtr localBias_; - /// The peephole connection for input gate. - MatrixPtr checkIg_; - /// The peephole connection for forget gate. - MatrixPtr checkFg_; - /// The peephole connection for output gate. - MatrixPtr checkOg_; - /// The gradient of real bias - MatrixPtr localBiasGrad_; - /// The gradient of peephole connection for input gates. - MatrixPtr checkIgGrad_; - /// The gradient of peephole connection for forget gates. - MatrixPtr checkFgGrad_; - /// The gradient of peephole connection for output gates. - MatrixPtr checkOgGrad_; - - /// Stores the cell state of previous time step, namely \f$c_{t-1}\f$. - Argument state_; - /// Stores the hidden of previous time step, namely \f$h_{t-1}\f$. - Argument preOutput_; - /// Stores the value and gradient of four gates, namely - /// \f$i_t, f_t, o_t, c_t\f$. - Argument gate_; - /// Whether it is reversed lstm. - bool reversed_; - /// Whether to use batch method to compute. - bool useBatch_; - /// Whether to use sequence parallell method to compute. - bool useSeqParallel_; - /// batchValue_ is used in method of batch calculation. It stores the - /// batch value after reorganized input. - std::unique_ptr batchValue_; - /// The gradient of batchValue_. - std::unique_ptr batchGrad_; - - /// Used in generation and stores the state of previous time step. - MatrixPtr prevState_; - /// Used in generation and stores the output of previous time step. - MatrixPtr prevOutput_; - MatrixPtr prevBatchOutput2_; - /// The total state. 
- MatrixPtr totalState_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp deleted file mode 100644 index c44768ddb2b903763288465325899d86176df73a..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/LstmStepLayer.cpp +++ /dev/null @@ -1,194 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "LstmCompute.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/* - * LstmStepLayer used in recurrent layer group. - */ -class LstmStepLayer : public Layer, public LstmCompute { - protected: - Argument state_; - Argument gate_; - Argument stateActive_; - MatrixPtr checkIg_, checkFg_, checkOg_; - MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_; - std::unique_ptr weight_; - - public: - explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {} - - ~LstmStepLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(lstm_step, LstmStepLayer); - -bool LstmStepLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(2U, inputLayers_.size()); - - checkIg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkFg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkOg_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkIgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkFgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - checkOgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - - if (biasParameter_.get() != NULL) { - CHECK_EQ(getSize() * 3, biasParameter_->getSize()); - weight_.reset(new Weight(1, getSize() * 3, biasParameter_)); - if (weight_->getW()) { - real* data = weight_->getW()->getData(); - checkIg_->setData(data); - checkFg_->setData(data + getSize()); - checkOg_->setData(data + getSize() * 2); - } - - if (weight_->getWGrad()) { - real* data = weight_->getWGrad()->getData(); - checkIgGrad_->setData(data); - checkFgGrad_->setData(data + getSize()); - checkOgGrad_->setData(data + getSize() * 2); - } - } - - setOutput("state", &state_); - LstmCompute::init(config_); - return true; -} - -void LstmStepLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("LstmRecurrentFwTime", getName().c_str()); - Layer::forward(passType); - - const Argument& input = getInput(0); - const Argument& prevState = getInput(1); - CHECK_EQ(getSize() * 4, input.value->getWidth()); - CHECK_EQ(getSize(), prevState.value->getWidth()); - int batchSize = input.getBatchSize(); - 
reserveOutput(batchSize, getSize()); - resetSpecifyOutput(state_, - batchSize, - getSize(), - /* isValueClean */ false, - /* isGradClean */ true); - resetSpecifyOutput(gate_, - batchSize, - getSize() * 4, - /* isValueClean */ false, - /* isGradClean */ false); - resetSpecifyOutput(stateActive_, - batchSize, - getSize(), - /* isValueClean */ false, - /* isGradClean */ false); - gate_.value->assign(*input.value); - - hl_lstm_value lstmValue; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - lstmValue.gateValue = gate_.value->getData(); - lstmValue.stateValue = state_.value->getData(); - lstmValue.prevStateValue = prevState.value->getData(); - lstmValue.stateActiveValue = stateActive_.value->getData(); - lstmValue.outputValue = output_.value->getData(); - - if (useGpu_) { - LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize); - } else { - LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize); - } -} - -void LstmStepLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("LstmRecurrentBwTime", getName().c_str()); - const Argument& input = getInput(0); - const Argument& prevState = getInput(1); - int batchSize = input.getBatchSize(); - - hl_lstm_value lstmValue; - hl_lstm_grad lstmGrad; - lstmValue.checkIg = checkIg_->getData(); - lstmValue.checkFg = checkFg_->getData(); - lstmValue.checkOg = checkOg_->getData(); - lstmValue.gateValue = gate_.value->getData(); - lstmValue.prevStateValue = prevState.value->getData(); - lstmValue.stateValue = state_.value->getData(); - lstmValue.stateActiveValue = stateActive_.value->getData(); - - lstmGrad.gateGrad = gate_.grad->getData(); - if (prevState.grad) { - lstmGrad.prevStateGrad = prevState.grad->getData(); - } else { - lstmGrad.prevStateGrad = nullptr; - } - lstmGrad.stateGrad = state_.grad->getData(); - lstmGrad.stateActiveGrad = stateActive_.grad->getData(); - lstmGrad.outputGrad = output_.grad->getData(); - lstmGrad.checkIgGrad = checkIgGrad_->getData(); - lstmGrad.checkFgGrad = checkFgGrad_->getData(); - lstmGrad.checkOgGrad = checkOgGrad_->getData(); - - if (useGpu_) { - LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize); - } else { - LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize); - } - - if (input.grad) { - input.grad->add(*gate_.grad); - } - - if (weight_) { - weight_->getParameterPtr()->incUpdate(callback); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp deleted file mode 100644 index 22c28157c5a5b19aa54b3151a6c9a4cdcfb01765..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MDLstmLayer.cpp +++ /dev/null @@ -1,769 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "LstmLayer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -class CoordIterator { - public: - std::vector dims_; - std::vector directions_; - std::vector curPos_; - bool end_; - - void step(size_t d, bool reversed) { - if (directions_[d] ^ reversed) { - if (curPos_[d] == dims_[d] - 1) { - curPos_[d] = 0; - if (d) { - step(d - 1, reversed); - } else { - end_ = true; - } - } else { - curPos_[d]++; - } - } else { - if (curPos_[d] == 0) { - curPos_[d] = dims_[d] - 1; - if (d) { - step(d - 1, reversed); - } else { - end_ = true; - } - } else { - curPos_[d]--; - } - } - } - - public: - CoordIterator(std::vector dim, std::vector directions) - : dims_(dim), directions_(directions), end_(false) { - CHECK_EQ(dims_.size(), directions_.size()); - for (size_t i = 0; i < dims_.size(); i++) { - curPos_.push_back(-1); - } - } - CoordIterator& operator++() { - step(dims_.size() - 1, false); - return *this; - } - - CoordIterator& operator--() { - step(dims_.size() - 1, true); - return *this; - } - - std::vector& curPos() { return curPos_; } - - int offset() { - int offset = curPos_[0]; - for (size_t i = 1; i < dims_.size(); i++) { - offset = offset * dims_[i] + curPos_[i]; - } - return offset; - } - - int offset(const std::vector& pos) { - int offset = pos[0]; - for (size_t i = 1; i < dims_.size(); i++) { - offset = offset * dims_[i] + pos[i]; - } - return offset; - } - - std::vector& begin() { - for (size_t i = 0; i < dims_.size(); i++) { - curPos_[i] = directions_[i] ? 0 : dims_[i] - 1; - } - end_ = false; - return curPos_; - } - - std::vector& rbegin() { - for (size_t i = 0; i < dims_.size(); i++) { - curPos_[i] = directions_[i] ? dims_[i] - 1 : 0; - } - end_ = false; - return curPos_; - } - - bool end() { return end_; } - - bool getPrePos(const std::vector& delays, - int idx, - std::vector& prePos) { - bool isAvial = true; - prePos.clear(); - prePos.reserve(directions_.size()); - for (size_t i = 0; i < directions_.size(); i++) { - if (int(i) == idx) { - prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1)); - if (prePos[i] < 0) { - prePos[i] = 0; - isAvial = false; - } - if (prePos[i] >= dims_[i]) { - prePos[i] = dims_[i] - 1; - isAvial = false; - } - } else { - prePos.push_back(curPos_[i]); - } - } - return isAvial; - } - - bool getNextPos(const std::vector& delays, - int idx, - std::vector& nextPos) { - bool isAvial = true; - nextPos.clear(); - nextPos.reserve(directions_.size()); - for (size_t i = 0; i < directions_.size(); i++) { - if (int(i) == idx) { - nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1)); - if (nextPos[i] < 0) { - nextPos[i] = 0; - isAvial = false; - } - if (nextPos[i] >= dims_[i]) { - nextPos[i] = dims_[i] - 1; - isAvial = false; - } - } else { - nextPos.push_back(curPos_[i]); - } - } - return isAvial; - } -}; -/* - * MDLstmLayer takes 1 input layer with size * (3+numDims). - * For each sequence [start, end] it performs the following computation: - * out_i = actState(state_i) * actGate(outputGate_i) - * - * For example the image with 2 dims, we take the scanning order from left-top - * to right-bottom, then the 2 previous states of the current pixels are the - * ones located at left and top. And each of them has a independent forget gate. 
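The CoordIterator above walks an N-dimensional grid in the configured scan directions and linearizes the current position with offset(). A hedged, standalone sketch of that row-major linearization (rowMajorOffset is a hypothetical name, not the deleted class):

#include <cassert>
#include <vector>

// Row-major linearization, last dimension fastest: for dims = {H, W} and
// pos = {r, c} the result is r * W + c, which is how the MD-LSTM maps a
// pixel position to an index inside one sequence.
int rowMajorOffset(const std::vector<int>& dims, const std::vector<int>& pos) {
  assert(!dims.empty() && dims.size() == pos.size());
  int offset = pos[0];
  for (size_t i = 1; i < dims.size(); ++i) {
    offset = offset * dims[i] + pos[i];
  }
  return offset;
}

// Example: rowMajorOffset({3, 4}, {2, 1}) == 9 (row 2, column 1 of a 3x4 grid).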
- * - * state_i = actInput(input_i) * actGate(inputGate_i) + - * \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j) - * - * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) + - * \sum{j}(state_prev_i_j * inputCheck_j) - * - * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) + - * state_i * outputCheck - * - * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j * - * recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j) - * - * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize - * */ - -class MDLstmLayer : public LstmLayer { - public: - explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - protected: - void forwardOneSequence(int start, CoordIterator& coordIter); - void backwardOneSequence(int start, CoordIterator& coordIter); - void forwardGate2OutputSequence(int start, CoordIterator& coordIter); - void backwardGate2OutputSequence(int start, CoordIterator& coordIter); - - protected: - std::vector frameInputGate_; - std::vector frameForgetGate_; - std::vector frameOutputGate_; - std::vector frameInputNode_; - std::vector frameGate_; - std::vector frameState_; - std::vector framePreOutput_; - std::vector frameOutput_; - - // Activation - std::unique_ptr activationGate_; - std::unique_ptr activationState_; - - int numDims_; - size_t numBlocks_; - std::vector directions_; - std::vector delays_; - std::vector> dimsV_; -}; - -REGISTER_LAYER(mdlstmemory, MDLstmLayer); - -bool MDLstmLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - - numBlocks_ = getSize(); - numDims_ = config_.directions_size(); - CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize()); - - // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_), - // peepOg(1), then size of localBias_ is 3+numDims_ - CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize()); - weight_.reset( - new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_)); - localBias_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - checkIg_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - checkFg_ = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - checkOg_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - localBiasGrad_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - checkIgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - checkFgGrad_ = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - checkOgGrad_ = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - - localBias_->setData(bias_->getW()->getData()); - checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_)); - checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + 
numDims_)); - checkOg_->setData(bias_->getW()->getData() + - numBlocks_ * (4 + 2 * numDims_)); - - if (bias_->getWGrad()) { - localBiasGrad_->setData(bias_->getWGrad()->getData()); - checkIgGrad_->setData(bias_->getWGrad()->getData() + - numBlocks_ * (3 + numDims_)); - checkFgGrad_->setData(bias_->getWGrad()->getData() + - numBlocks_ * (4 + numDims_)); - checkOgGrad_->setData(bias_->getWGrad()->getData() + - numBlocks_ * (4 + 2 * numDims_)); - } - } else { - LOG(FATAL) << "Bias should be here."; - } - for (int i = 0; i < numDims_; i++) { - directions_.push_back(config_.directions(i)); - } - for (int i = 0; i < numDims_; i++) { - delays_.push_back(-1); - } - activationGate_.reset(ActivationFunction::create(config_.active_gate_type())); - activationState_.reset( - ActivationFunction::create(config_.active_state_type())); - - return true; -} - -void MDLstmLayer::forward(PassType passType) { - Layer::forward(passType); - - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - int numSequences = input.getNumSequences(); - resetOutput(batchSize, numBlocks_); - CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth()); - const int* starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - int* dimsData = input.cpuSequenceDims->getData(); - CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences); - - for (int i = 0; i < numSequences; i++) { - std::vector dims; - for (int j = 0; j < numDims_; j++) { - dims.push_back(dimsData[i * numDims_ + j]); - } - dimsV_.push_back(dims); - } - - frameInputGate_.reserve(batchSize); - frameForgetGate_.reserve(batchSize); - frameOutputGate_.reserve(batchSize); - frameInputNode_.reserve(batchSize); - frameGate_.reserve(batchSize); - frameState_.reserve(batchSize); - framePreOutput_.reserve(batchSize); - frameOutput_.reserve(batchSize); - - Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - - for (int i = frameGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - frameGate_.push_back(arg); - } - for (int i = frameInputGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameInputGate_.push_back(arg); - } - for (int i = frameForgetGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ numDims_, - numBlocks_, - /* trans= */ false, - useGpu_); - frameForgetGate_.push_back(arg); - } - for (int i = frameOutputGate_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameOutputGate_.push_back(arg); - } - for (int i = frameInputNode_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= 
*/ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameInputNode_.push_back(arg); - } - for (int i = frameState_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - frameState_.push_back(arg); - } - for (int i = framePreOutput_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - framePreOutput_.push_back(arg); - } - for (int i = frameOutput_.size(); i < batchSize; i++) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - numBlocks_, - /* trans= */ false, - useGpu_); - frameOutput_.push_back(arg); - } - - for (int i = 0; i < batchSize; i++) { - frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_); - frameGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_)); - frameInputNode_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 0); - frameInputGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 1); - frameForgetGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 2); - frameOutputGate_[i].value->setData(gate_.value->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * (2 + numDims_)); - } - - AsyncGpuBlock asyncGpuBlock; - gate_.value->assign(*input.value); - - if (bias_) { - gate_.value->addBias(*localBias_, 1); - } - - for (int i = 0; i < numSequences; i++) { - CoordIterator coordIter(dimsV_[i], directions_); - forwardOneSequence(starts[i], coordIter); - } -} - -void MDLstmLayer::forwardGate2OutputSequence(int start, - CoordIterator& coordIter) { - int idxCurr = start + coordIter.offset(); - std::vector preOffsetV; - preOffsetV.reserve(numDims_); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - preOffsetV[i] = coordIter.offset(prePos); - } else { - preOffsetV[i] = -1; - } - } - - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - frameInputGate_[idxCurr].value->addDotMul( - *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0); - - MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr checkFgOneDim = - Matrix::create(checkFg_->getData() + i * numBlocks_, - 1.0, - numBlocks_, - false, - useGpu_); - fgGateOneDim->addDotMul( - *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0); - } - } - auto status = activationGate_->forward(frameInputGate_[idxCurr]); - status.check(); - status = activationGate_->forward(frameForgetGate_[idxCurr]); - status.check(); - status = activation_->forward(frameInputNode_[idxCurr]); - status.check(); - - frameState_[idxCurr].value->zeroMem(); - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - frameState_[idxCurr].value->addDotMul( - *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0); - } - } - frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value, - *frameInputGate_[idxCurr].value, - 1.0, - 1.0); - - 
frameOutputGate_[idxCurr].value->addDotMul( - *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0); - status = activationGate_->forward(frameOutputGate_[idxCurr]); - status.check(); - - framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value)); - status = activationState_->forward(framePreOutput_[idxCurr]); - status.check(); - - frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value, - *frameOutputGate_[idxCurr].value); -} - -void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) { - for (coordIter.begin(); !coordIter.end(); ++coordIter) { - int offset = coordIter.offset(); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - int preOffset = coordIter.offset(prePos); - frameGate_[start + offset].value->mul( - *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0); - } - } - forwardGate2OutputSequence(start, coordIter); - } -} - -void MDLstmLayer::backward(const UpdateCallback& callback) { - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - const int* starts = input.sequenceStartPositions->getData(false); - size_t numSequences = input.getNumSequences(); - - Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, - numBlocks_ * (3 + numDims_), - /* trans= */ false, - useGpu_); - - for (int i = 0; i < batchSize; i++) { - if (frameState_[i].grad == NULL) - frameState_[i].grad = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - } - for (int i = 0; i < batchSize; i++) { - if (framePreOutput_[i].grad == NULL) - framePreOutput_[i].grad = Matrix::create( - /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); - } - - for (int i = 0; i < batchSize; i++) { - frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_); - frameGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_)); - frameInputNode_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 0); - frameInputGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 1); - frameForgetGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * 2); - frameOutputGate_[i].grad->setData(gate_.grad->getData() + - i * numBlocks_ * (3 + numDims_) + - numBlocks_ * (2 + numDims_)); - } - - { - AsyncGpuBlock asyncGpuBlock; - - for (size_t i = 0; i < numSequences; i++) { - CoordIterator coordIter(dimsV_[i], directions_); - backwardOneSequence(starts[i], coordIter); - } - } - - if (input.grad) { - input.grad->add(*gate_.grad); - } - if (bias_ && bias_->getWGrad()) { - localBiasGrad_->collectBias(*gate_.grad, 1); - bias_->getParameterPtr()->incUpdate(callback); - } - - weight_->getParameterPtr()->incUpdate(callback); -} - -void MDLstmLayer::backwardGate2OutputSequence(int start, - CoordIterator& coordIter) { - int idxCurr = start + coordIter.offset(); - std::vector preOffsetV; - std::vector nextOffsetV; - preOffsetV.reserve(numDims_); - nextOffsetV.reserve(numDims_); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - preOffsetV[i] = coordIter.offset(prePos); - } else { - preOffsetV[i] = -1; - } - std::vector nextPos; - if (coordIter.getNextPos(delays_, i, nextPos)) { - nextOffsetV[i] = coordIter.offset(nextPos); - } else { - nextOffsetV[i] = -1; - } - } - - framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad, - 
*frameOutputGate_[idxCurr].value); - activationState_->backward(framePreOutput_[idxCurr]).check(); - frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad)); - - frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad, - *framePreOutput_[idxCurr].value); - activationGate_->backward(frameOutputGate_[idxCurr]).check(); - - frameState_[idxCurr].grad->addDotMul( - *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0); - for (int i = 0; i < numDims_; i++) { - if (nextOffsetV[i] >= 0) { - frameState_[idxCurr].grad->addDotMul( - *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0); - - MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[start + nextOffsetV[i]].grad->getData() + - i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr fgGateOneDimVal = Matrix::create( - frameForgetGate_[start + nextOffsetV[i]].value->getData() + - i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr checkFgOneDim = Matrix::create( - checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_); - - frameState_[idxCurr].grad->addDotMul( - *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0); - frameState_[idxCurr].grad->addDotMul( - *frameState_[start + nextOffsetV[i]].grad, - *fgGateOneDimVal, - 1.0, - 1.0); - } - } - - frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad, - *frameInputGate_[idxCurr].value); - frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad, - *frameInputNode_[idxCurr].value); - - frameForgetGate_[idxCurr].grad->zeroMem(); - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad, - *frameState_[start + preOffsetV[i]].value, - 1.0, - 1.0); - } - } - - activationGate_->backward(frameInputGate_[idxCurr]).check(); - activationGate_->backward(frameForgetGate_[idxCurr]).check(); - activation_->backward(frameInputNode_[idxCurr]).check(); - - if (bias_->getWGrad()) { - for (int i = 0; i < numDims_; i++) { - if (preOffsetV[i] >= 0) { - checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad, - *frameState_[start + preOffsetV[i]].value, - 1.0, - 1.0); - - MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - MatrixPtr checkFgOneDimGrad = - Matrix::create(checkFgGrad_->getData() + i * numBlocks_, - 1, - numBlocks_, - false, - useGpu_); - checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad, - *frameState_[start + preOffsetV[i]].value, - 1.0, - 1.0); - } - } - checkOgGrad_->addDotMul( - *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0); - } -} - -void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) { - MatrixPtr weightT = weight_->getW()->getTranspose(); - for (coordIter.rbegin(); !coordIter.end(); --coordIter) { - int offset = coordIter.offset(); - backwardGate2OutputSequence(start, coordIter); - for (int i = 0; i < numDims_; i++) { - std::vector prePos; - if (coordIter.getPrePos(delays_, i, prePos)) { - int preOffset = coordIter.offset(prePos); - frameOutput_[start + preOffset].grad->mul( - *frameGate_[start + offset].grad, *weightT, 1.0, 1.0); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *frameOutput_[start + preOffset].value->getTranspose(), - *frameGate_[start + offset].grad, - 1.0, - 1.0); - } - } - } - } -} - -} // namespace paddle diff 
--git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp deleted file mode 100644 index a442a0a01369f4ceb27ba4a1976df7f6e25b832f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ /dev/null @@ -1,388 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MKLDNNConvLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Logging.h" - -using namespace mkldnn; // NOLINT -typedef memory::format format; - -namespace paddle { - -REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer); - -bool MKLDNNConvLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!MKLDNNLayer::init(layerMap, parameterMap)) { - return false; - } - CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet"; - CHECK_EQ(inputLayers_.size(), parameters_.size()); - CHECK(config_.shared_biases()) << "Only support shared biases yet"; - - oc_ = config_.num_filters(); - const ConvConfig& conf = config_.inputs(0).conv_conf(); - ic_ = conf.channels(); - fw_ = conf.filter_size(); - fh_ = conf.filter_size_y(); - pw_ = conf.padding(); - ph_ = conf.padding_y(); - dw_ = conf.dilation(); - dh_ = conf.dilation_y(); - sw_ = conf.stride(); - sh_ = conf.stride_y(); - gp_ = conf.groups(); - oh_ = conf.output_y(); - ow_ = conf.output_x(); - ih_ = conf.img_size_y(); - iw_ = conf.img_size(); - caffeMode_ = conf.caffe_mode(); - CHECK(caffeMode_) << "Only support caffe mode yet"; - CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet"; - // check group setting - CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc"; - CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic"; - - // create weight - size_t height = oc_ / gp_; - size_t width = ic_ * fh_ * fw_; - CHECK_EQ(parameters_[0]->getSize(), height * width); - weight_ = - std::unique_ptr(new Weight(height, width, parameters_[0], 0)); - - // create biases - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); - } - return true; -} - -void MKLDNNConvLayer::convertWeightsFromPaddle() { - if (hasInitedWgt_) { - return; - } - - CHECK(wgtVal_) << "should have been initialized"; - // the paddle weight format is oihw or goihw - auto targetDim = wgtVal_->getDims(); - auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw; - wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); - hasInitedWgt_ = true; -} - -void MKLDNNConvLayer::convertWeightsToPaddle() { - CHECK(wgtVal_) << "should have been initialized"; - auto targetDim = wgtVal_->getDims(); - auto dstFmt = (gp_ == 1) ? 
memory::format::oihw : memory::format::goihw; - wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); -} - -void MKLDNNConvLayer::reshape( - int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { - reshapeInput(bs, ih, iw); - - // cal output sizes - // oc can not be changed - int fh = (fh_ - 1) * dh_ + 1; - int fw = (fw_ - 1) * dw_ + 1; - oh = outputSize(ih, fh, ph_, sh_, caffeMode_); - ow = outputSize(iw, fw, pw_, sw_, caffeMode_); - - reshapeOutput(oh, ow); - resizeOutput(bs, oc * oh * ow); -} - -void MKLDNNConvLayer::resetFwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - resetFwdPD(fwdPD_); - - resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out); - - resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out); -} - -void MKLDNNConvLayer::resetBwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - std::shared_ptr bwdWgtPD; - std::shared_ptr bwdDataPD; - - resetBwdWgtPD(bwdWgtPD); - - resetBwdDataPD(bwdDataPD); - - resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); - - resetBwdPipeline( - pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); -} - -void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { - weight_->getParameterPtr()->incUpdate(callback); - if (biases_ && biases_->getWGrad()) { - biases_->getParameterPtr()->incUpdate(callback); - } -} - -void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt, - memory::dims& bias, - memory::dims& stride, - memory::dims& dilation, - memory::dims& padL, - memory::dims& padR) { - wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_} - : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_}; - bias = memory::dims{oc_}; - stride = memory::dims{sh_, sw_}; - padL = memory::dims{ph_, pw_}; - padR = getPaddingR(); - // note: mkldnn dilation start from 0 - dilation = memory::dims{dh_ - 1, dw_ - 1}; -} - -void MKLDNNConvLayer::resetFwdPD( - std::shared_ptr& pd) { - // dims for conv - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - memory::dims wgtDims, biasDims, strides, dilations, padL, padR; - loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - - prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring - : prop_kind::forward_training; - algorithm algo = algorithm::convolution_direct; - padding_kind padKind = padding_kind::zero; - conv_fwd::desc fwdDesc = - biases_ && biases_->getW() - ? 
conv_fwd::desc(pk, - algo, - MKLDNNMatrix::createMemoryDesc(inDims), - MKLDNNMatrix::createMemoryDesc(wgtDims), - MKLDNNMatrix::createMemoryDesc(biasDims), - MKLDNNMatrix::createMemoryDesc(outDims), - strides, - dilations, - padL, - padR, - padKind) - : conv_fwd::desc(pk, - algo, - MKLDNNMatrix::createMemoryDesc(inDims), - MKLDNNMatrix::createMemoryDesc(wgtDims), - MKLDNNMatrix::createMemoryDesc(outDims), - strides, - dilations, - padL, - padR, - padKind); - pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_)); -} - -void MKLDNNConvLayer::resetFwdBuffers( - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(pd); - resetInValue( - in, std::make_shared(pd->src_primitive_desc())); - - resetOutValue(out, pd->dst_primitive_desc()); - - resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - - if (biases_ && biases_->getW()) { - resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); - } else { - bias = nullptr; - } -} - -void MKLDNNConvLayer::resetFwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - if (bias) { - fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); - } else { - fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); - } - pipeline.push_back(*fwd_); -} - -void MKLDNNConvLayer::resetBwdWgtPD( - std::shared_ptr& pd) { - memory::dims wgtDims, biasDims, strides, dilations, padL, padR; - loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - - // create backward weight using input, output and weight value memory desc - CHECK(inVals_[0]) << "Should have internal input value"; - CHECK(outVal_) << "Should have internal output value"; - CHECK(wgtVal_) << "Should have weight value"; - algorithm algo = algorithm::convolution_direct; - padding_kind padKind = padding_kind::zero; - auto bwdWgtDesc = biasVal_ != nullptr - ? 
conv_bwdWgt::desc(algo, - inVals_[0]->getMemoryDesc(), - wgtVal_->getMemoryDesc(), - biasVal_->getMemoryDesc(), - outVal_->getMemoryDesc(), - strides, - padL, - padR, - padKind) - : conv_bwdWgt::desc(algo, - inVals_[0]->getMemoryDesc(), - wgtVal_->getMemoryDesc(), - outVal_->getMemoryDesc(), - strides, - padL, - padR, - padKind); - pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_)); - CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc()); - CHECK_PRIMITIVE_DESC_EQ( - outVal_, - pd->diff_dst_primitive_desc(), - "primitive desc of out value and grad should be equal"); - CHECK_PRIMITIVE_DESC_EQ( - wgtVal_, - pd->diff_weights_primitive_desc(), - "primitive desc of weight value and grad should be equal"); -} - -void MKLDNNConvLayer::resetBwdDataPD( - std::shared_ptr& pd) { - pd = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - - memory::dims wgtDims, biasDims, strides, dilations, padL, padR; - loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - CHECK(inVals_[0]) << "Should have internal input value"; - CHECK(outVal_) << "Should have internal output value"; - // create backward data using input and output value memory desc - // but using weight memory desc with any format - auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, - inVals_[0]->getMemoryDesc(), - MKLDNNMatrix::createMemoryDesc(wgtDims), - outVal_->getMemoryDesc(), - strides, - padL, - padR, - padding_kind::zero); - pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_)); - CHECK_PRIMITIVE_DESC_EQ( - inVals_[0], - pd->diff_src_primitive_desc(), - "primitive desc of in value and grad should be equal"); - CHECK_PRIMITIVE_DESC_EQ( - outVal_, - pd->diff_dst_primitive_desc(), - "primitive desc of out value and grad should be equal"); -} - -void MKLDNNConvLayer::resetBwdBuffers( - std::shared_ptr& wgtPD, - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(wgtPD); - resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); - - resetWithMatrix( - wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); - CHECK_PRIMITIVE_DESC_EQ( - wgtVal_, - wgt->getPrimitiveDesc(), - "primitive desc of weight grad and value should be equal"); - - bias = nullptr; - if (biases_ && biases_->getWGrad()) { - resetWithMatrix( - bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); - CHECK(bias); - CHECK_PRIMITIVE_DESC_EQ( - biasVal_, - bias->getPrimitiveDesc(), - "primitive desc of bias grad and value should be equal"); - } - - if (dataPD == nullptr) { - return; - } - resetInGrad(in, dataPD->diff_src_primitive_desc()); - resetWgtValBwdData(dataPD, wgtValBwdData_); -} - -void MKLDNNConvLayer::resetBwdPipeline( - std::vector& pipeline, - std::shared_ptr& wgtPD, - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0]); - // add bwdWgt handle - if (bias) { - bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias)); - } else { - bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt)); - } - pipeline.push_back(*bwdWgt_); - - if (dataPD == nullptr) { - return; - } - if (cvtWgtVal_) { - pipeline.push_back(*cvtWgtVal_); - } - // add bwdData handle - CHECK(wgtValBwdData_) << "Should have weight memory"; - bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); - pipeline.push_back(*bwdData_); -} - -void MKLDNNConvLayer::resetWgtValBwdData( - 
std::shared_ptr& dataPD, - MKLDNNMatrixPtr& wgt) { - if (dataPD == nullptr) { - return; - } - - // create new weight value for backward data, and create reorder if necessary - // since the primitive_desc would be different with wgtVal_ - CHECK(wgtVal_) << "should have weight value"; - if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) { - wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc()); - cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_); - CHECK(cvtWgtVal_); - } else { - wgtValBwdData_ = wgtVal_; - } - VLOG(MKLDNN_FMTS) << "weight value format for backward data: " - << wgtValBwdData_->getFormat(); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp deleted file mode 100644 index 0c7e6f16e24a65b552cebcbd2111926cefc211f4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MKLDNNFcLayer.h" -#include "paddle/utils/Logging.h" - -using namespace mkldnn; // NOLINT -typedef memory::format format; - -namespace paddle { - -REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer); - -bool MKLDNNFcLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!MKLDNNLayer::init(layerMap, parameterMap)) { - return false; - } - - CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet"; - CHECK_EQ(inputLayers_.size(), parameters_.size()); - CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet"; - - // output size, cat not be changed - oc_ = getSize(); - oh_ = 1; - ow_ = 1; - ih_ = 1; - iw_ = 1; - - // input size can not change in FC - iLayerSize_ = inputLayers_[0]->getSize(); - CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_); - - // create weight - weight_ = - std::unique_ptr(new Weight(oc_, iLayerSize_, parameters_[0], 0)); - - // create biases - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); - } - return true; -} - -void MKLDNNFcLayer::convertWeightsFromPaddle() { - if (hasInitedWgt_) { - return; - } - - CHECK(wgtVal_) << "should have been initialized"; - auto targetDim = wgtVal_->getDims(); - auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo; - wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); - hasInitedWgt_ = true; -} - -void MKLDNNFcLayer::convertWeightsToPaddle() { - CHECK(wgtVal_) << "should have been initialized"; - auto targetDim = wgtVal_->getDims(); - auto dstFmt = targetDim.size() == 2 ? 
format::io : format::ihwo; - wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); -} - -void MKLDNNFcLayer::reshape( - int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { - reshapeInput(bs, ih, iw); - - CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); - ic = iLayerSize_ / (ih * iw); - CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible"; - CHECK_EQ(size_t(oc), getSize()); - - reshapeOutput(oh, ow); - resizeOutput(bs, oc); -} - -void MKLDNNFcLayer::resetFwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out); - - resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out); - - resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out); -} - -void MKLDNNFcLayer::resetBwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - std::shared_ptr bwdWgtPD; - std::shared_ptr bwdDataPD; - - resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out); - - resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out); - - resetBwdDataPD(bwdDataPD, inputs[0], out); - - resetBwdPipeline( - pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); -} - -void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { - weight_->getParameterPtr()->incUpdate(callback); - if (biases_ && biases_->getWGrad()) { - biases_->getParameterPtr()->incUpdate(callback); - } -} - -void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - resetInValue(in); - CHECK(in); - in->downSpatial(); - - auto outPD = - MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); - resetOutValue(out, outPD); - - format wgtFmt = format::oihw; - if (in->getFormat() == format::nChw8c) { - wgtFmt = format::oIhw8i; - } else if (in->getFormat() == format::nChw16c) { - wgtFmt = format::oIhw16i; - } - auto wgtPD = - MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_); - resetWithMatrix(wgt, weight_->getW(), wgtPD); - wgt->downSpatial(); - - if (biases_ && biases_->getW()) { - auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); - resetWithMatrix(bias, biases_->getW(), biasPD); - } else { - bias = nullptr; - } -} - -void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, - MKLDNNMatrixPtr in, - MKLDNNMatrixPtr wgt, - MKLDNNMatrixPtr bias, - MKLDNNMatrixPtr out) { - CHECK(in); - CHECK(wgt); - CHECK(out); - prop_kind pk = prop_kind::forward; - fc_fwd::desc fwdDesc = bias != nullptr ? 
fc_fwd::desc(pk, - in->getMemoryDesc(), - wgt->getMemoryDesc(), - bias->getMemoryDesc(), - out->getMemoryDesc()) - : fc_fwd::desc(pk, - in->getMemoryDesc(), - wgt->getMemoryDesc(), - out->getMemoryDesc()); - pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_)); -} - -void MKLDNNFcLayer::resetFwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - if (bias) { - fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out)); - } else { - fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out)); - } - pipeline.push_back(*fwd_); -} - -void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0] && outVal_); - resetOutGrad(out, outVal_->getPrimitiveDesc()); - resetInGrad(in, inVals_[0]->getPrimitiveDesc()); - - CHECK(wgtVal_); - resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); - - if (biasVal_) { - resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); - } else { - bias = nullptr; - } -} - -void MKLDNNFcLayer::resetBwdWgtPD( - std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0]); - fc_bwdWgt::desc bwdWgtDesc = - bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(), - wgt->getMemoryDesc(), - bias->getMemoryDesc(), - out->getMemoryDesc()) - : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(), - wgt->getMemoryDesc(), - out->getMemoryDesc()); - pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_)); -} - -void MKLDNNFcLayer::resetBwdDataPD( - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - pd = nullptr; - if (in == nullptr) { - return; - } - CHECK(wgtVal_); - fc_bwdData::desc bwdDataDesc = fc_bwdData::desc( - in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc()); - pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_)); -} - -void MKLDNNFcLayer::resetBwdPipeline( - std::vector& pipeline, - std::shared_ptr& bwdWgtPD, - std::shared_ptr& bwdDataPD, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0]); - if (bias) { - bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias)); - } else { - bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt)); - } - pipeline.push_back(*bwdWgt_); - - if (bwdDataPD == nullptr) { - return; - } - CHECK(wgtVal_) << "Should have weight memory"; - bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in)); - pipeline.push_back(*bwdData_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp deleted file mode 100644 index 88513ab8bca3899775be7822083b51120a04d6e4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "MKLDNNLRNLayer.h" -#include "paddle/utils/Logging.h" - -using namespace mkldnn; // NOLINT -typedef memory::format format; - -namespace paddle { - -REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer); - -bool MKLDNNLRNLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!MKLDNNLayer::init(layerMap, parameterMap)) { - return false; - } - - /* the size of inputs for norm-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - const NormConfig& conf = config_.inputs(0).norm_conf(); - localSize_ = conf.size(); - alpha_ = conf.scale(); - beta_ = conf.pow(); - - ic_ = conf.channels(); - oc_ = ic_; - iw_ = conf.img_size(); - ow_ = conf.output_x(); - ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - CHECK_EQ(iw_, ow_); - CHECK_EQ(ih_, oh_); - return true; -} - -void MKLDNNLRNLayer::reshape( - int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { - CHECK_EQ(inputLayers_.size(), 1UL); - reshapeInput(bs, ih, iw); - // ic_ and oc can not be changed - CHECK_EQ((size_t)ic, - inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) - << "Input channel can not be changed"; - oh = ih; - ow = iw; - reshapeOutput(oh, ow); - resizeOutput(bs, oc * oh * ow); -} - -void MKLDNNLRNLayer::resetFwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - resetFwdBuffers(inputs[0], out); - - resetFwdPD(fwdPD_, inputs[0], out); - - resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); -} - -void MKLDNNLRNLayer::resetBwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - std::shared_ptr pd; - - resetBwdBuffers(inputs[0], out); - - resetBwdPD(pd, inputs[0], out); - - resetBwdPipeline(pipeline, pd, inputs[0], out); -} - -void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - resetInValue(in); - CHECK(in); - resetOutValue(out, in->getPrimitiveDesc()); -} - -void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr& pd, - MKLDNNMatrixPtr in, - MKLDNNMatrixPtr out) { - prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring - : prop_kind::forward_training; - auto fwdDesc = lrn_fwd::desc(pk, - algorithm::lrn_across_channels, - in->getMemoryDesc(), - localSize_, - alpha_, - beta_, - 1.0f); - pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_)); - // prepare workspace if necessary - workspace_ = - passType_ != PASS_TEST - ? std::make_shared(memory(pd->workspace_primitive_desc())) - : nullptr; -} - -void MKLDNNLRNLayer::resetFwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - fwd_ = workspace_ - ? 
std::make_shared(lrn_fwd(*pd, *in, *workspace_, *out)) - : std::make_shared(lrn_fwd(*pd, *in, *out)); - pipeline.push_back(*fwd_); -} - -void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0] && outVal_); - resetOutGrad(out, outVal_->getPrimitiveDesc()); - resetInGrad(in, inVals_[0]->getPrimitiveDesc()); -} - -void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - pd = nullptr; - if (in == nullptr) { - return; - } - CHECK(out); - auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels, - in->getMemoryDesc(), - out->getMemoryDesc(), - localSize_, - alpha_, - beta_, - 1.0f); - pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); -} - -void MKLDNNLRNLayer::resetBwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - if (pd == nullptr) { - return; - } - CHECK(inVals_[0]); - CHECK(workspace_); - bwdData_ = std::make_shared( - lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in)); - pipeline.push_back(*bwdData_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h deleted file mode 100644 index 2b164d0d3bc0e1446d7e4d82bb8a713195dbd927..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ /dev/null @@ -1,477 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "Layer.h" -#include "MKLDNNBase.h" -#include "mkldnn.hpp" -#include "paddle/math/MKLDNNMatrix.h" -#include "paddle/utils/Stat.h" - -DECLARE_bool(use_mkldnn); - -namespace paddle { - -class MKLDNNLayer; -typedef std::shared_ptr MKLDNNLayerPtr; - -/** - * @brief Base class of MKLDNNlayer. - * - */ -class MKLDNNLayer : public Layer { - protected: - // batch size - int bs_; - // their sizes are always from the first input layer - // input image channel, height and width - int ic_, ih_, iw_; - // output image channel, height and width - int oc_, oh_, ow_; - - // the condition that forward need be reset - size_t condition_; - // backward also need reset after reset forward handle - bool needResetBwd_; - - // is output only mkldnn - bool outputOnlyMKLDNN_; - - // mkldnn engine, stream and primivtives - mkldnn::engine engine_; - std::shared_ptr stream_; - std::shared_ptr fwd_; - std::shared_ptr bwdWgt_; - std::shared_ptr bwdData_; - std::vector pipelineFwd_; - std::vector pipelineBwd_; - - /* Value and grad are seperated as internal and external buffers. - * Each MKLDNNLayer must init or reset internal buffer at least, - * and the external buffer format is always nchw of nc(when h==w==1), - * which is the same format as paddle. - * The output_.value and output_.grad always save the external data, - * when mixed with cpu device. - * When all layers are mkldnn layers, they could save internal data. 
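 * For example (illustrative note, not from the original header): when the
 * internal format is a blocked layout such as nChw8c, extInVals_ keeps the
 * plain nchw data written by the previous CPU layer, inVals_ holds the
 * reordered internal copy, and the reorder handle saved in cvtInVals_ is
 * inserted at the front of the forward pipeline so the conversion runs
 * before fwd_. The actual formats are whatever the chosen primitive
 * reports at runtime.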
- */ - // below MKLDNNMatrix buffers are all internal buffers - std::vector inVals_; - std::vector inGrads_; - MKLDNNMatrixPtr outVal_; - MKLDNNMatrixPtr outGrad_; - // below are external value and grad - std::vector extInVals_; - std::vector extInGrads_; - MKLDNNMatrixPtr extOutVal_; - MKLDNNMatrixPtr extOutGrad_; - // convert handle between external and internal buffers - std::vector> cvtInVals_; - std::vector> cvtInGrads_; - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - - // weight and bias are always internal buffers - MKLDNNMatrixPtr wgtVal_; - MKLDNNMatrixPtr wgtGrad_; - MKLDNNMatrixPtr biasVal_; - MKLDNNMatrixPtr biasGrad_; - - // merge grad primitive - std::shared_ptr mergeGrad_; - std::vector pipelineMergeGrad_; - // tmp input argument to save input grad, only used to merge grad - Argument tmpInArg_; - - public: - explicit MKLDNNLayer(const LayerConfig& config) - : Layer(config), - ih_(0), - iw_(0), - condition_(0), - needResetBwd_(true), - outputOnlyMKLDNN_(false), - engine_(mkldnn::engine::cpu, 0), - stream_(nullptr), - fwd_(nullptr), - bwdWgt_(nullptr), - bwdData_(nullptr) {} - - ~MKLDNNLayer() {} - - virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - virtual void forward(PassType passType); - virtual void backward(const UpdateCallback& callback); - - /** - * reshape the input and output channels and image sizes - * and reset output buffer size - */ - virtual void reshape( - int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0; - - /** - * reset the mkldnn forward primitve and memories - * only would be called when input size changes - * weight and bias buffers should be coverd by child class itself - */ - virtual void resetFwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) = 0; - - /** - * reset the mkldnn backward primitve and memories - * only would be called when needed - * weight and bias buffers should be coverd by child class itself - */ - virtual void resetBwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) = 0; - - /** - * Update weights and biases if necessary. - */ - virtual void updateWeights(const UpdateCallback& callback) {} - - /** - * convert weight from paddle format to mkldnn format - * weight_ will be override - */ - virtual void convertWeightsFromPaddle() {} - - /** - * convert mkldnn weight to paddle format - * weight_ will be override - */ - virtual void convertWeightsToPaddle() {} - - /** - * add this interface as public for unit test - */ - void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); } - - protected: - /** - * Some layers may have different condition to reset the forward. - * The function returns the condition that do not need reset forward. - */ - inline virtual size_t keepCondition() { - // reset when the first input element size changed, not only the batchsize - return inputLayers_[0]->getOutputValue()->getElementCnt(); - } - - /** - * reshape the input image sizes and input batchsize - */ - void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0); - - /** - * reshape output image sizes - */ - void reshapeOutput(size_t height, size_t width); - - /** - * reset MKLDNNMatrix from Matrix and internal primitive desc. - * reset nullptr if matrix or primitive desc is empty - */ - void resetWithMatrix(MKLDNNMatrixPtr& dnn, - const MatrixPtr& mat, - mkldnn::memory::primitive_desc pd); - - /** - * reset input value from input MKLDNNMatrix and internal primitive desc. 
- * reset both internal and external buffer and create reorder if necessary. - * input channel may be different in concat. - */ - void resetInValue( - MKLDNNMatrixPtr& in, - const std::shared_ptr& intPD = nullptr, - size_t idx = 0, - int inputChannel = 0); - - /** - * reset output value from internal primitive desc. - * reset both internal and external buffer and create reorder if necessary. - */ - void resetOutValue(MKLDNNMatrixPtr& out, - mkldnn::memory::primitive_desc intPD); - - /** - * reset input grad from internal primitive desc. - * reset both internal and external buffer and create reorder if necessary. - */ - void resetInGrad(MKLDNNMatrixPtr& in, - mkldnn::memory::primitive_desc intPD, - size_t idx = 0); - - /** - * reset output grad from internal primitive desc. - * merge grad if necessary. - * reset both internal and external buffer and create reorder if necessary. - * note: about merge grad, when this layer has several outputs, - * it could not be mixed with cpu device, - * since it can not get memory desc from cpu device. - */ - void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD); - - /** - * reset the merge grad primitive if necessary. - * note: do not support the grads mixed with cpu device, - * since it can not get memory desc from cpu device. - */ - void resetMergeGrad(MKLDNNMatrixPtr& out); - - protected: - /** - * Set deviceId of this layer. - */ - void setDevice(int id) { deviceId_ = id; } - - /** - * check the format is nchw or nc, - * which is supported by Paddle default memory layout - */ - bool isPaddleFormat(mkldnn::memory::format fmt) { - if (fmt == mkldnn::memory::format::nchw || - fmt == mkldnn::memory::format::nc) { - return true; - } else { - return false; - } - } - - /** - * If input only has MKLDNN device. - * Otherwise, only support the previous layer using CPU device. - */ - bool inputIsOnlyMKLDNN(int index = 0) { - int prevDevice = getPrev(index)->getDeviceId(); - if (prevDevice == MKLDNN_DEVICE) { - return true; - } else { - CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; - return false; - } - } - - /** - * If output only has MKLDNN device. - * Otherwise, other devices should only using CPU device. - */ - bool outputIsOnlyMKLDNN() { - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) - << "Only support other device is CPU yet"; - } - outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; - return outputOnlyMKLDNN_; - } - - /** - * print info about sizes - */ - virtual void printSizeInfo() { - VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ - << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ - << ", oh: " << oh_ << ", ow: " << ow_; - } - - /** - * print the mkldnn memory format of value - */ - virtual void printValueFormat() { - for (size_t i = 0; i < inVals_.size(); ++i) { - if (!inVals_[i]) { - continue; - } - VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() - << ": " << (extInVals_[i] ? extInVals_[i]->getFormat() - : inVals_[i]->getFormat()) - << " >>> " << inVals_[i]->getFormat() << " >>>"; - } - if (outVal_) { - VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> " - << (extOutVal_ ? 
extOutVal_->getFormat() - : outVal_->getFormat()); - } - if (wgtVal_) { - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); - } - if (biasVal_) { - VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat(); - } - } - - /** - * print the mkldnn memory format of grad - */ - virtual void printGradFormat() { - if (outGrad_) { - VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< " - << (extOutGrad_ ? extOutGrad_->getFormat() - : outGrad_->getFormat()); - } - for (size_t i = 0; i < inGrads_.size(); ++i) { - if (!inGrads_[i]) { - continue; - } - VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() - << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat() - : inGrads_[i]->getFormat()) - << " <<< " << inGrads_[i]->getFormat() << " <<<"; - } - if (wgtGrad_) { - VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); - } - if (biasGrad_) { - VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat(); - } - } - - private: - /** - * clear all grad - */ - void clearGrads() { - if (output_.grad) { - output_.grad->zeroMem(); - } - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].grad) { - outputOtherDevice_[i].grad->zeroMem(); - } - } - } - - /** - * Set deviceId of the params used in this layer. - */ - void setParamsDevice(int id, const ParameterMap& parameterMap) { - for (auto& inputConfig : config_.inputs()) { - if (inputConfig.has_input_parameter_name()) { - ParameterPtr parameter; - std::string name = inputConfig.input_parameter_name(); - CHECK(mapGet(name, parameterMap, ¶meter)) - << "Cannot find input parameter " << name << " for layer " - << getName(); - parameter->setDevice(id); - } - } - if (config_.has_bias_parameter_name()) { - ParameterPtr parameter; - std::string name = config_.bias_parameter_name(); - CHECK(mapGet(name, parameterMap, ¶meter)) - << "Cannot find bias parameter " << name << " for layer " - << getName(); - parameter->setDevice(id); - } - } - - /** - * Set output map of prev layers. - */ - void setOutputMap() { - outputMap_.clear(); - for (size_t i = 0; i < inputLayers_.size(); ++i) { - inputLayers_[i]->setOutput(getName(), &tmpInArg_); - } - } - - /** - * if have cpu device, share value and grad data with output_ - */ - void shareCPUDevice() { - if (outputIsOnlyMKLDNN()) { - return; - } - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].value = output_.value; - outputOtherDevice_[i].grad = output_.grad; - } - } - - /** - * Check the cpu device number of outputOtherDevice_. - * should have only one at most. - */ - void checkCPUOutputsNumber(int max = 1) { - int cnt = 0; - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { - ++cnt; - } - } - CHECK_LE(cnt, max) << "too much CPU devies"; - } - - /** - * copy SeqInfo from input layer to this output and other output devices. - * @note: do not use getInput(0) since it used this deviceId_, - * use "inputLayers_[0]->getOutput()" instead. 
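 * (Illustrative note: getInput(0) looks the argument up under this layer's
 * own deviceId_, so when the previous layer keeps its output on a different
 * device the sequence fields may not be visible there; hence the direct
 * access through inputLayers_[0]->getOutput() below.)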
- */ - void copySeqInfoToOutputs() { - if (inputLayers_.empty() || !needSequenceInfo_) { - return; - } - const Argument& input = inputLayers_[0]->getOutput(); - output_.sequenceStartPositions = input.sequenceStartPositions; - output_.subSequenceStartPositions = input.subSequenceStartPositions; - output_.cpuSequenceDims = input.cpuSequenceDims; - for (size_t i = 0; i < outputOtherDevice_.size(); i++) { - outputOtherDevice_[i].sequenceStartPositions = - output_.sequenceStartPositions; - outputOtherDevice_[i].subSequenceStartPositions = - output_.subSequenceStartPositions; - outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; - } - } - - void prepareValueConversions(std::vector& pipeline) { - // MKLDNNLayer output value should be MKLDNNMatrix - // so external output value is necessary. - // Then external input value is not necessary, - // since input may be mkldnn internal buffer. - CHECK(extOutVal_) << "external output value is necessary"; - output_.value = std::dynamic_pointer_cast(extOutVal_); - CHECK(inVals_[0] && outVal_) << "internal memories are necessary"; - for (size_t i = 0; i < cvtInVals_.size(); ++i) { - if (cvtInVals_[i]) { - pipeline.insert(pipeline.begin(), *cvtInVals_[i]); - } - } - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } - } - void prepareGradConversions(std::vector& pipeline) { - // external output grad is not necessary - // since output may be mkldnn internal buffer or merge them directly. - CHECK(outGrad_) << "internal output grad is necessary"; - if (extOutGrad_) { - CHECK_EQ(extOutGrad_->getData(), output_.grad->getData()) - << "the external buffer should share the same data with output_.grad"; - } - if (cvtOutGrad_) { - pipeline.insert(pipeline.begin(), *cvtOutGrad_); - } - for (size_t i = 0; i < cvtInGrads_.size(); ++i) { - if (cvtInGrads_[i]) { - pipeline.push_back(*cvtInGrads_[i]); - } - } - } -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp deleted file mode 100644 index 3be848c7496aac616903cb09844c5eadd320e91c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "MKLDNNPoolLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Logging.h" - -using namespace mkldnn; // NOLINT -typedef memory::format format; - -namespace paddle { - -REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer); - -bool MKLDNNPoolLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!MKLDNNLayer::init(layerMap, parameterMap)) { - return false; - } - - /* the size of inputs for pool-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - const PoolConfig& conf = config_.inputs(0).pool_conf(); - ic_ = conf.channels(); - ih_ = conf.img_size_y(); - iw_ = conf.img_size(); - oc_ = ic_; - oh_ = conf.output_y(); - ow_ = conf.output_x(); - fh_ = conf.size_y(); - fw_ = conf.size_x(); - ph_ = conf.padding_y(); - pw_ = conf.padding(); - sh_ = conf.stride_y(); - sw_ = conf.stride(); - - const std::string& type = conf.pool_type(); - if (type == "max-projection") { - poolAlgo_ = algorithm::pooling_max; - } else if (type == "avg-projection") { - // paddle only use exclude_padding - poolAlgo_ = algorithm::pooling_avg_exclude_padding; - } else { - LOG(FATAL) << "unknow pooling type!"; - } - return true; -} - -void MKLDNNPoolLayer::reshape( - int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { - reshapeInput(bs, ih, iw); - // ic_ and oc can not be changed - CHECK_EQ((size_t)ic, - inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) - << "Input channel can not be changed"; - - // cal output sizes - // paddle used false caffeMode for pooling - oh = outputSize(ih, fh_, ph_, sh_, false); - ow = outputSize(iw, fw_, pw_, sw_, false); - reshapeOutput(oh, ow); - - resizeOutput(bs, oc * oh * ow); -} - -void MKLDNNPoolLayer::resetFwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - resetFwdBuffers(inputs[0], out); - - resetFwdPD(fwdPD_, inputs[0], out); - - resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); -} - -void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, - std::vector& inputs, - MKLDNNMatrixPtr& out) { - std::shared_ptr pd; - - resetBwdBuffers(inputs[0], out); - - resetBwdPD(pd, inputs[0], out); - - resetBwdPipeline(pipeline, pd, inputs[0], out); -} - -void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - resetInValue(in); - - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - CHECK(in); - auto outPD = - MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); - resetOutValue(out, outPD); -} - -void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr& pd, - MKLDNNMatrixPtr in, - MKLDNNMatrixPtr out) { - memory::dims kernels = memory::dims{fh_, fw_}; - memory::dims strides = memory::dims{sh_, sw_}; - memory::dims padL = memory::dims{ph_, pw_}; - memory::dims padR = getPaddingR(); - padding_kind padKind = padding_kind::zero; - prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring - : prop_kind::forward_training; - auto fwdDesc = pool_fwd::desc(pk, - poolAlgo_, - in->getMemoryDesc(), - out->getMemoryDesc(), - strides, - kernels, - padL, - padR, - padKind); - pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_)); - - // prepare workspace if necessary - workspace_ = - (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max) - ? std::make_shared(memory(pd->workspace_primitive_desc())) - : nullptr; -} - -void MKLDNNPoolLayer::resetFwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - fwd_ = workspace_ - ? 
std::make_shared(pool_fwd(*pd, *in, *out, *workspace_)) - : std::make_shared(pool_fwd(*pd, *in, *out)); - pipeline.push_back(*fwd_); -} - -void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - CHECK(inVals_[0] && outVal_); - resetOutGrad(out, outVal_->getPrimitiveDesc()); - resetInGrad(in, inVals_[0]->getPrimitiveDesc()); -} - -void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - pd = nullptr; - if (in == nullptr) { - return; - } - memory::dims kernels = memory::dims{fh_, fw_}; - memory::dims strides = memory::dims{sh_, sw_}; - memory::dims padL = memory::dims{ph_, pw_}; - memory::dims padR = getPaddingR(); - CHECK(out); - auto bwdDesc = pool_bwd::desc(poolAlgo_, - in->getMemoryDesc(), - out->getMemoryDesc(), - strides, - kernels, - padL, - padR, - padding_kind::zero); - pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); -} - -void MKLDNNPoolLayer::resetBwdPipeline( - std::vector& pipeline, - std::shared_ptr& pd, - MKLDNNMatrixPtr& in, - MKLDNNMatrixPtr& out) { - if (pd == nullptr) { - return; - } - - bwdData_ = - workspace_ - ? std::make_shared(pool_bwd(*pd, *out, *workspace_, *in)) - : std::make_shared(pool_bwd(*pd, *out, *in)); - pipeline.push_back(*bwdData_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h deleted file mode 100644 index b01a961d007a0e2e343db7b51e50fd3ee776435e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/math/MathFunctions.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/Weight.h" - -namespace paddle { - -class MKLPackedWeight { - protected: - /// The pointer of weight - real *weight_; - /// The pointer of cblas packed gemm to weight - real *packedWeight_; - size_t height_; - size_t width_; - bool transW_; - - public: - explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) { - packedWeight_ = nullptr; - weight_ = weight->getData(); - height_ = weight->getHeight(); - width_ = weight->getWidth(); - transW_ = transW; - } - - ~MKLPackedWeight() { free_(); } - - void pack() { pack_(weight_); } - - void gemm_compute(const MatrixPtr src, MatrixPtr dst) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - src->getHeight(), - transW_ ? height_ : width_, - transW_ ? width_ : height_, - src->getData(), - src->getWidth(), - packedWeight_, - width_, - 1.0, - dst->getData(), - dst->getWidth()); - } - - protected: - void pack_(real *src) { - if (!packedWeight_) { - packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); - } - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - transW_ ? CblasTrans : CblasNoTrans, - 1, - transW_ ? height_ : width_, - transW_ ? 
width_ : height_, - 1.0, - src, - width_, - packedWeight_); - } - - void free_() { - if (packedWeight_) { - cblas_sgemm_free(packedWeight_); - } - } -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp deleted file mode 100644 index 7ee2e0dd946d6f332f6b8454f977601b0ee8d249..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MaxLayer.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MaxLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(max, MaxLayer); - -void MaxLayer::forward(PassType passType) { - SequencePoolLayer::forward(passType); - - IVector::resizeOrCreate( - maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_)); - maxIndex_->zeroMem(); - - MatrixPtr inputValue = getInputValue(0); - MatrixPtr outputValue = getOutputValue(); - - { - REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str()); - outputValue->maxSequenceForward( - *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_); - } - - if (config_.output_max_index()) { - // copy maxIndex_ to output - outputValue->copyFrom(*maxIndex_); - } else { - /* add the bias-vector AFTER max operation */ - if (biases_.get() != NULL) { - outputValue->addBias(*(biases_->getW()), 1); - } - /* activation */ { forwardActivation(); } - } -} - -void MaxLayer::backward(const UpdateCallback& callback) { - CHECK(!config_.output_max_index()) - << "backward is not available when output_max_index is set"; - SequencePoolLayer::backward(callback); - - MatrixPtr inputGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - if (inputGrad) { - REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str()); - inputGrad->maxSequenceBackward( - *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h deleted file mode 100644 index e46f997c342ce5d6b724629dff6950c4f1680ce8..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MaxLayer.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "SequencePoolLayer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * A layer for "internal max" for sequence input. 
- * Input: one or more sequences. Each sequence contains some instances. - * If SequenceLevel = kNonSeq: - * Output: output size is the number of input sequences (NOT input instances) - * output[i] = max_{for each instance in this sequence}{input[i]} - * If stride_ > 0: - * Output: a shorten sequence. Stride is the step size by which we slide a - * window upon the input sequence, and the max pooling operation is - * then applied to each interval independently. - * If SequenceLevel = kSeq: - * Check input sequence must has sub-sequence - * Output: output size is the number of input sub-sequences - * output[i] = max_{for each instance in this sub-sequence}{input[i]} - * - * The config file api is pooling_layer. - */ - -class MaxLayer : public SequencePoolLayer { - protected: - // maxIndex_[i][j] = k : the value at (i, j) is from input[k]. - IVectorPtr maxIndex_; - - public: - explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - return SequencePoolLayer::init(layerMap, parameterMap); - } - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h deleted file mode 100644 index 0eb8674b4c4f3f58b103c6b59ad13931a6992a1b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MaxOutLayer.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * A layer to do max out on conv layer output. - * Input: output of a conv layer. - * Output: feature map size same as input. Channel is (input channel) / groups. - * So the num of channels should be able to devided by groups. - * - * The config file api is maxout_layer. - */ - -class MaxOutLayer : public Layer { - protected: - size_t groups_; - size_t imgSizeH_, imgSizeW_; - /// outputChannels_ = channels_ / groups_ - size_t channels_, outputChannels_; - /// feature length = imgSizeH_ * imgSizeW_ - size_t featLen_; - IVectorPtr maxoutId_; - - public: - /// return imgSizeH_ * imgSizeW_ * outputChannels_; - size_t getSize(); - - explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {} - virtual ~MaxOutLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp deleted file mode 100644 index e594e22b5eaa6027fdf5bbd09ab93774d9a798be..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MaxPoolWithMaskLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - PoolLayer::init(layerMap, parameterMap); - setOutput("mask", &mask_); - return true; -} - -size_t MaxPoolWithMaskLayer::getSize() { - CHECK_EQ(inputLayers_.size(), 1UL); - size_t layerSize = 0; - - outputY_ = outputSize(imgSizeY_, - sizeY_, - confPaddingY_, - strideY_, - /* caffeMode */ false); - outputX_ = outputSize(imgSize_, - sizeX_, - confPadding_, - stride_, - /* caffeMode */ false); - - layerSize = outputX_ * outputY_ * channels_; - getOutput().setFrameHeight(outputY_); - getOutput().setFrameWidth(outputX_); - - return layerSize; -} - -void MaxPoolWithMaskLayer::forward(PassType passType) { - size_t size = getSize(); - MatrixPtr inputV = inputLayers_[0]->getOutputValue(); - int batchSize = inputV->getHeight(); - resetOutput(batchSize, size); - - MatrixPtr outV = getOutputValue(); - CHECK_EQ(size, outV->getWidth()); - - resetSpecifyOutput(mask_, - batchSize, - size, - /* isValueClean */ false, - /* isGradClean */ true); - - MatrixPtr maskV = mask_.value; - outV->maxPoolForward(*inputV, - imgSizeY_, - imgSize_, - channels_, - sizeX_, - sizeY_, - strideY_, - stride_, - outputY_, - outputX_, - confPaddingY_, - confPadding_, - maskV); -} - -void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) { - (void)callback; - if (NULL == getInputGrad(0)) { - return; - } - - MatrixPtr outGrad = getOutputGrad(); - MatrixPtr inputV = inputLayers_[0]->getOutputValue(); - MatrixPtr outV = getOutputValue(); - MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad(); - - inputGrad->maxPoolBackward(*inputV, - imgSizeY_, - imgSize_, - *outGrad, - *outV, - sizeX_, - sizeY_, - strideY_, - stride_, - outputY_, - outputX_, - 1, - 1, - confPaddingY_, - confPadding_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h deleted file mode 100644 index c948364f6b83b0de1ee07cc185b69346f5cb1a7e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "PoolLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * @brief Basic parent layer of different kinds of pooling - */ -class MaxPoolWithMaskLayer : public PoolLayer { - protected: - Argument mask_; - - public: - explicit MaxPoolWithMaskLayer(const LayerConfig& config) - : PoolLayer(config) {} - - size_t getSize(); - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp deleted file mode 100644 index 7dcb30b98d6e6b08929d5fecba0833c8b1989725..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MixedLayer.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MixedLayer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(mixed, MixedLayer); - -bool MixedLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - if (!Layer::init(layerMap, parameterMap)) return false; - - CHECK_EQ(inputLayers_.size(), parameters_.size()); - projections_.resize(inputLayers_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - if (config_.inputs(i).has_proj_conf()) { - projections_[i].reset(Projection::create( - config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); - } else { - CHECK(!parameters_[i]) << "should no parameters for operators"; - } - } - for (auto& operator_conf : config_.operator_confs()) { - for (auto& input_index : operator_conf.input_indices()) { - CHECK(!config_.inputs(input_index).has_proj_conf()); - } - operators_.emplace_back(Operator::create(operator_conf, useGpu_)); - } - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - sharedBias_ = config_.shared_biases(); - size_t psize = config_.bias_size(); - biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); - } - - return true; -} - -void MixedLayer::prefetch() { - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (projections_[i]) { - projections_[i]->prefetch(&getInput(i)); - } - } -} - -void MixedLayer::resetState() { - for (auto& proj : projections_) { - if (proj) { - proj->resetState(); - } - } -} - -void MixedLayer::setState(LayerStatePtr state) { - CHECK(projectionStateMatrixSize_.size() == projections_.size()) - << "projection size mis-match"; - - int start = 0; - LayerStatePtr statePtr = std::make_shared(); - for (int i = 0; i < (int)projectionStateMatrixSize_.size(); i++) { - if (projectionStateMatrixSize_[i] > 0) { - statePtr->value.clear(); - for (int j = start; j < start + projectionStateMatrixSize_[i]; j++) { - statePtr->value.push_back(state->value[j]); - } - projections_[i]->setState(statePtr); - start += projectionStateMatrixSize_[i]; - } - } - CHECK((int)state->value.size() == 
start) << "state matrix size mis-match"; -} - -// Return state which consists of all projections states -LayerStatePtr MixedLayer::getState() { - bool init = projectionStateMatrixSize_.size() == 0; - LayerStatePtr res = std::make_shared(); - for (int i = 0; i < (int)projections_.size(); i++) { - LayerStatePtr statePtr = - projections_[i] ? projections_[i]->getState() : nullptr; - int stateSize = statePtr == nullptr ? 0 : statePtr->value.size(); - if (init) { - projectionStateMatrixSize_.push_back(stateSize); - } else { - CHECK(projectionStateMatrixSize_[i] == stateSize) - << "state matrix size mis-match"; - } - if (statePtr != nullptr) { - for (auto& matrixPtr : statePtr->value) { - res->value.push_back(matrixPtr); - } - } - } - return res; -} - -void MixedLayer::forward(PassType passType) { - Layer::forward(passType); - - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (projections_[i]) { - projections_[i]->forward(&getInput(i), &output_, passType); - } - } - - std::vector ins; - for (auto& op : operators_) { - ins.clear(); - for (auto& input_index : op->getConfig().input_indices()) { - ins.push_back(&getInput(input_index)); - } - op->forward(ins, &output_, passType); - } - - /* add the bias-vector */ - if (biases_.get() != NULL) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - outV->addBias(*(biases_->getW()), 1, sharedBias_); - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void MixedLayer::backward(const UpdateCallback& callback) { - /* Do activation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); - - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - for (size_t i = 0; i != inputLayers_.size(); ++i) { - if (projections_[i]) { - projections_[i]->backward(callback); - } - } - - for (auto& op : operators_) { - op->backward(); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h deleted file mode 100644 index 8cbb229f157c0904e63a696f860ec6739d5167c4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MultinomialSampler.h +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/utils/Common.h" - -namespace paddle { - -/** - * @brief Given the probability of N objects, the sampler random select - * one of the object. - * @note: prob does not have to be unnormalized. - * - * The space requirement is O(N)=O(N * sizeof(Interval)). 
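 * A minimal usage sketch (hypothetical caller code, not part of this class):
 *
 *   real prob[] = {0.1, 0.2, 0.3, 0.4};
 *   MultinomialSampler sampler(prob, 4);
 *   std::default_random_engine g;
 *   int id = sampler.gen(g);  // returns 0..3, roughly in proportion to prob
 *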
- * The computational complexity of generate one sample is O(1). - */ -class MultinomialSampler { - public: - MultinomialSampler(const real* prob, int size); - - //! protobuf always using double. - static MultinomialSampler* create(const double* prob, int size) { -#ifdef PADDLE_TYPE_DOUBLE - return new MultinomialSampler(prob, size); -#else - std::unique_ptr tmp(new real[size]); - std::copy(prob, prob + size, tmp.get()); - return new MultinomialSampler(tmp.get(), size); -#endif - } - - /** - * @brief Generate a random sample. - * @param g is a random number engine. See . - * @return Random integer. - */ - template - int gen(URNG& g) { - return gen1([&g, this]() { return rand_(g); }); - } - - protected: - /** - * @brief Generation - * @param[in] rand rand is a real random number distribution - * for the range [0, size). - * @return random int number or intervals_[random_int_number].otherId. - */ - template - int gen1(Rand rand) { - double r = rand(); // NOLINT - int i = (int)r; - r -= i; - return r < intervals_[i].thresh ? i : intervals_[i].otherId; - } - - struct Interval { - int otherId; - real thresh; - }; - - /// The probability of each interval will be 1./size - std::vector intervals_; - std::uniform_real_distribution rand_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp deleted file mode 100644 index 43ecc48cd97fb54d8dc4eb1d87ebf60f5aa040d8..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/MultiplexLayer.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - *@brief This layer multiplex multiple layers according to the index, - * which is provided by the first input layer. - * - Input[0]: the index of the layer to output of size batchSize. - * - Input[1:N]; the candidate output data. - * For each index i from 0 to batchSize -1, the output is the i-th row of the - * (index[i] + 1)-th layer. - * - * For each i-th row of output: - * - * \f[ - * y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1) - * \f] - * where, y is output. \f$x_{k}\f$ is the k-th input layer and - * \f$k = x_{0}[i] + 1\f$. - */ - -class MultiplexLayer : public Layer { - protected: - /** - * @brief A struct is used to save the copy information, includes input - * layer index and copy size. - */ - struct CopyInfo { - CopyInfo(int inStartIdx, int inLength, int inCopyIdx) - : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {} - - /// The start row of input. - int startIdx; - /// Number of rows. If the layer index in Input[0] is not consecutive, - /// the length is one. Otherwise, the length is > 1 and copy multi rows - /// once. - int length; - /// The copied layer index, which needs to add 1. - int copyIdx; - }; - - /// A list of CopyInfo used to save copy information. 
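/// For example (hypothetical values), copyIds = {1, 1, 0, 2, 2, 2} coalesces
/// into {startIdx 0, length 2, copyIdx 1}, {2, 1, 0}, and {3, 3, 2}, so each
/// run of rows selected from the same candidate input becomes a single
/// subMatrix copy in forward().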
- std::vector copySchedule_; - - /// Temporary matrix pointer to point to input data. - MatrixPtr tmpSrc_; - /// Temporary matrix pointer to point to output data. - MatrixPtr tmpDest_; - - public: - explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {} - - ~MultiplexLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - /** - * @brief Calculate copy info for input layers. - */ - void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns); -}; - -REGISTER_LAYER(multiplex, MultiplexLayer); - -void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds, - size_t numIns) { - copySchedule_.clear(); - CopyInfo prevCopyInfo(0, 0, -1); - for (size_t i = 0; i < copyIds->getSize(); i++) { - int copyId = copyIds->getElement(i); - CHECK_GE(copyId, 0); - CHECK_LT(copyId, int(numIns)); - // copy same input layer with prevous and will copy consecutive. - if (copyId == prevCopyInfo.copyIdx) { - ++prevCopyInfo.length; - } else { - if (prevCopyInfo.copyIdx != -1) { - copySchedule_.emplace_back(prevCopyInfo); - } - prevCopyInfo.startIdx = i; - prevCopyInfo.length = 1; - prevCopyInfo.copyIdx = copyId; - } - } - if (prevCopyInfo.copyIdx != -1) { - copySchedule_.emplace_back(prevCopyInfo); - } -} - -bool MultiplexLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_GE(inputLayers_.size(), 2U); - - tmpSrc_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - tmpDest_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - return true; -} - -void MultiplexLayer::forward(PassType passType) { - Layer::forward(passType); - - IVectorPtr copyIds = getInput(0).ids; - MatrixPtr inV1 = getInputValue(1); - CHECK_EQ(copyIds->getSize(), inV1->getHeight()); - for (size_t i = 2; i < inputLayers_.size(); i++) { - CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight()); - CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth()); - } - - calculateCopySchedule(copyIds, inputLayers_.size() - 1); - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(inV1->getHeight(), inV1->getWidth()); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str()); - AsyncGpuBlock block; - for (const CopyInfo& info : copySchedule_) { - outV->subMatrix(info.startIdx, info.length, tmpDest_) - ->copyFrom(*getInputValue(info.copyIdx + 1) - ->subMatrix(info.startIdx, info.length, tmpSrc_)); - } - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void MultiplexLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - MatrixPtr outG = getOutputGrad(); - - { - REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str()); - AsyncGpuBlock block; - for (const CopyInfo& info : copySchedule_) { - if (getInputGrad(info.copyIdx + 1)) { - getInputGrad(info.copyIdx + 1) - ->subMatrix(info.startIdx, info.length, tmpDest_) - ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_)); - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp deleted file mode 100644 index 
cc48fe100f12446f9522078119ae2ead039a82cc..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/NCELayer.cpp +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "Layer.h" -#include "MultinomialSampler.h" -#include "paddle/math/MathFunctions.h" - -namespace paddle { - -/** - * Noise-contrastive estimation. - * Implements the method in the following paper: - * A fast and simple algorithm for training neural probabilistic language - * models. - * - * The config file api is nce_layer. - */ -class NCELayer : public Layer { - int numClasses_; - /// number of input layer besides labelLayer and weightLayer - int numInputs_; - LayerPtr labelLayer_; - /// weight layer, can be None - LayerPtr weightLayer_; - WeightList weights_; - std::unique_ptr biases_; - std::unique_ptr sampler_; - - std::uniform_int_distribution rand_; - - struct Sample { - int sampleId; - int labelId; - bool target; - real weight; - }; - std::vector samples_; - /// whether samples_ is prepared - bool prepared_; - Argument sampleOut_; - - IVectorPtr labelIds_; - - public: - explicit NCELayer(const LayerConfig& config) - : Layer(config), - numClasses_(config.num_classes()), - rand_(0, config.num_classes() - 1), - prepared_(false) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize the weightList */ - size_t i; - for (i = 0; i < inputLayers_.size(); i++) { - if (!parameters_[i]) break; - size_t width = inputLayers_[i]->getSize(); - // create a new weight - CHECK_EQ(parameters_[i]->getSize(), width * numClasses_); - Weight* w = new Weight(numClasses_, width, parameters_[i]); - - // append the new weight to the list - weights_.emplace_back(w); - } - - CHECK_EQ(1U, getSize()); - - numInputs_ = i; - CHECK_GE(numInputs_, 1) - << "Must have at least one input besides label and weight"; - CHECK_LT(i, inputLayers_.size()) << "Missing label layer"; - labelLayer_ = inputLayers_[i]; - if (++i < inputLayers_.size()) { - weightLayer_ = inputLayers_[i]; - ++i; - } - CHECK_EQ(i, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_); - biases_.reset(new Weight(1, numClasses_, biasParameter_)); - } - - if (config_.neg_sampling_dist_size()) { - CHECK_EQ(numClasses_, config_.neg_sampling_dist_size()); - sampler_.reset(MultinomialSampler::create( - config_.neg_sampling_dist().data(), numClasses_)); - } - - return true; - } - - void prepareSamples() { - CHECK(!useGpu_) << "GPU is not supported"; - - int batchSize = getInput(*labelLayer_).getBatchSize(); - IVectorPtr label = getInput(*labelLayer_).ids; - - CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast( - getInput(*labelLayer_).value); - - CHECK(label || multiLabel) - << "The label layer must have ids or NonValueSparseMatrix value"; - - auto& randEngine = 
ThreadLocalRandomEngine::get(); - - samples_.clear(); - samples_.reserve(batchSize * (1 + config_.num_neg_samples())); - - real* weight = - weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr; - - for (int i = 0; i < batchSize; ++i) { - real w = weight ? weight[i] : 1; - if (label) { - int* ids = label->getData(); - samples_.push_back({i, ids[i], true, w}); - } else { - const int* cols = multiLabel->getRowCols(i); - int n = multiLabel->getColNum(i); - for (int j = 0; j < n; ++j) { - samples_.push_back({i, cols[j], true, w}); - } - } - for (int j = 0; j < config_.num_neg_samples(); ++j) { - int id = sampler_ ? sampler_->gen(randEngine) : rand_(randEngine); - samples_.push_back({i, id, false, w}); - } - } - prepared_ = true; - } - - void prefetch() override { - prepareSamples(); - IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_); - int* ids = labelIds_->getData(); - for (size_t i = 0; i < samples_.size(); ++i) { - ids[i] = samples_[i].labelId; - } - - for (int i = 0; i < numInputs_; ++i) { - auto sparseParam = - dynamic_cast(weights_[i]->getW().get()); - if (sparseParam) { - sparseParam->addRows(labelIds_); - } - } - } - - void forward(PassType passType) override { - Layer::forward(passType); - - CHECK(!useGpu_) << "GPU is not supported"; - - if (!prepared_) { - if (passType == PASS_GC) { - ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed()); - } - prepareSamples(); - } - prepared_ = false; - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(0)->getHeight(); - int size = getSize(); - resetOutput(batchSize, size); - - Matrix::resizeOrCreate(sampleOut_.value, - 1, - samples_.size(), - /* trans= */ false, - useGpu_); - - forwardBias(); - - for (int l = 0; l < numInputs_; ++l) { - forwardOneInput(l); - } - - auto status = activation_->forward(sampleOut_); - status.check(); - - forwardCost(); - } - - void backward(const UpdateCallback& callback) override { - Matrix::resizeOrCreate(sampleOut_.grad, - 1, - samples_.size(), - /* trans= */ false, - useGpu_); - - backwardCost(); - - auto status = activation_->backward(sampleOut_); - status.check(); - - if (biases_->getWGrad()) { - backwardBias(callback); - } - - for (int l = 0; l < numInputs_; ++l) { - backwardOneInput(l, callback); - } - } - - void forwardBias() { - if (!biases_) { - sampleOut_.value->zeroMem(); - } else { - real* bias = biases_->getW()->getData(); - real* sampleOut = sampleOut_.value->getData(); - for (size_t i = 0; i < samples_.size(); ++i) { - sampleOut[i] = bias[samples_[i].labelId]; - } - } - } - - void backwardBias(const UpdateCallback& callback) { - if (!biases_) return; - real* bias = biases_->getWGrad()->getData(); - real* sampleOut = sampleOut_.grad->getData(); - for (size_t i = 0; i < samples_.size(); ++i) { - bias[samples_[i].labelId] += sampleOut[i]; - } - biases_->incUpdate(callback); - } - - void forwardOneInput(int layerId) { - const MatrixPtr& inputMat = getInputValue(layerId); - const MatrixPtr& weightMat = weights_[layerId]->getW(); - - int dim = inputMat->getWidth(); - real* sampleOut = sampleOut_.value->getData(); - - for (size_t i = 0; i < samples_.size(); ++i) { - sampleOut[i] += dotProduct(dim, - inputMat->getRowBuf(samples_[i].sampleId), - weightMat->getRowBuf(samples_[i].labelId)); - } - } - - void backwardOneInput(int layerId, const UpdateCallback& callback) { - const MatrixPtr& inputMat = getInputValue(layerId); - const MatrixPtr& inputGradMat = getInputGrad(layerId); - const MatrixPtr& weightMat = weights_[layerId]->getW(); - 
const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad(); - - int dim = inputMat->getWidth(); - real* sampleGrad = sampleOut_.grad->getData(); - - if (weightGradMat) { - for (size_t i = 0; i < samples_.size(); ++i) { - axpy(dim, - sampleGrad[i], - inputMat->getRowBuf(samples_[i].sampleId), - weightGradMat->getRowBuf(samples_[i].labelId)); - } - weights_[layerId]->incUpdate(callback); - } - - if (inputGradMat) { - for (size_t i = 0; i < samples_.size(); ++i) { - axpy(dim, - sampleGrad[i], - weightMat->getRowBuf(samples_[i].labelId), - inputGradMat->getRowBuf(samples_[i].sampleId)); - } - } - } - - void forwardCost() { - real* out = output_.value->getData(); - real* sampleOut = sampleOut_.value->getData(); - real b = 1. / numClasses_ * config_.num_neg_samples(); - for (size_t i = 0; i < samples_.size(); ++i) { - real o = sampleOut[i]; - if (sampler_) { - b = config_.num_neg_samples() * - config_.neg_sampling_dist(samples_[i].labelId); - } - real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b)); - out[samples_[i].sampleId] += samples_[i].weight * cost; - } - } - - void backwardCost() { - real* sampleOut = sampleOut_.value->getData(); - real* sampleGrad = sampleOut_.grad->getData(); - - real b = 1. / numClasses_ * config_.num_neg_samples(); - for (size_t i = 0; i < samples_.size(); ++i) { - real o = sampleOut[i]; - if (sampler_) { - b = config_.num_neg_samples() * - config_.neg_sampling_dist(samples_[i].labelId); - } - real w = samples_[i].weight; - sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b); - } - } -}; - -REGISTER_LAYER(nce, NCELayer); - -} // namespace paddle diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp deleted file mode 100644 index 4678f6fa9ab184870fc2651def18f47da9a0cc01..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/NormLayer.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
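For readers skimming this removal, the NCE cost that forwardCost() and backwardCost() above implement reduces to one expression per sampled label. A minimal standalone sketch in plain C++; the nceCost helper and the uniform-noise assumption are illustrative, not part of the deleted layer:

#include <cmath>
#include <cstdio>

// Per-sample NCE cost, mirroring forwardCost() above:
//   b = k * q(label)   (k = num_neg_samples; q = noise distribution,
//                       assumed uniform here, so q = 1 / numClasses)
//   target sample: cost = -log(o / (o + b))
//   noise  sample: cost = -log(b / (o + b))
// where o is the activated model score for the sampled label.
double nceCost(double o, bool target, int k, int numClasses) {
  double b = static_cast<double>(k) / numClasses;
  return target ? -std::log(o / (o + b)) : -std::log(b / (o + b));
}

int main() {
  // One positive sample scored 0.8 and one noise sample scored 0.1,
  // assuming k = 10 negatives per instance and 1000 classes.
  std::printf("target cost: %f\n", nceCost(0.8, true, 10, 1000));
  std::printf("noise  cost: %f\n", nceCost(0.1, false, 10, 1000));
  return 0;
}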
*/ - -#include "NormLayer.h" -#include "NormProjectionLayer.h" -#include "paddle/utils/Logging.h" -namespace paddle { - -REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create); - -Layer* NormLayer::create(const LayerConfig& config) { - CHECK_EQ(config.inputs_size(), 1); - const std::string& norm = config.inputs(0).norm_conf().norm_type(); - if (norm == "rnorm") { - return new ResponseNormLayer(config); - } else if (norm == "cmrnorm-projection") { - return new CMRProjectionNormLayer(config); - } else if (norm == "cross-channel-norm") { - return new CrossChannelNormLayer(config); - } else { - LOG(FATAL) << "Unknown norm type: " << norm; - return nullptr; - } -} - -bool ResponseNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - NormLayer::init(layerMap, parameterMap); - - /* the size of inputs for norm-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - - const NormConfig& conf = config_.inputs(0).norm_conf(); - channels_ = conf.channels(); - size_ = conf.size(); - scale_ = conf.scale(); - pow_ = conf.pow(); - outputX_ = conf.output_x(); - imgSize_ = conf.img_size(); - denoms_ = NULL; - - outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - return true; -} - -} // namespace paddle diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h deleted file mode 100644 index 3807584415f99a7110170748501589dac85eac52..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/NormLayer.h +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "Layer.h" -#include "NormLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief Basic parent layer of normalization - * - * @note Normalize the input in local region - */ -class NormLayer : public Layer { - public: - explicit NormLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override { - Layer::init(layerMap, parameterMap); - return true; - } - - /** - * @brief create norm layer by norm_type - */ - static Layer* create(const LayerConfig& config); -}; - -/** - * @brief response normalization within feature maps - * namely normalize in independent channel - * When code refactoring, we delete the original implementation. - * Need to implement in the futrue. 
- */ -class ResponseNormLayer : public NormLayer { - protected: - size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_; - real scale_, pow_; - MatrixPtr denoms_; - - public: - explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; } - void backward(const UpdateCallback& callback = nullptr) override { - LOG(FATAL) << "Not implemented"; - } -}; - -/** - * This layer applys normalization across the channels of each sample to a - * conv layer's output, and scales the output by a group of trainable factors - * whose dimensions equal to the number of channels. - * - Input: One and only one input layer are accepted. - * - Output: The normalized data of the input data. - * Reference: - * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, - * Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector - */ -class CrossChannelNormLayer : public NormLayer { - public: - explicit CrossChannelNormLayer(const LayerConfig& config) - : NormLayer(config) {} - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - void forward(PassType passType); - void backward(const UpdateCallback& callback); - MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim); - MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim); - - protected: - size_t channels_; - std::unique_ptr scale_; - MatrixPtr scaleDiff_; - MatrixPtr normBuffer_; - MatrixPtr dataBuffer_; - MatrixPtr channelBuffer_; - MatrixPtr spatialBuffer_; - MatrixPtr sampleBuffer_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp deleted file mode 100644 index 3013bbdbc791546897fca51e73a056f2c843e63f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "NormProjectionLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { -size_t CMRProjectionNormLayer::getSize() { - CHECK_EQ(inputLayers_.size(), 1UL); - size_t layerSize = 0; - imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imgSizeH_ == 0) { - imgSizeH_ = imgSizeY_; - } - if (imgSizeW_ == 0) { - imgSizeW_ = imgSize_; - } - outputH_ = imgSizeH_; - outputW_ = imgSizeW_; - layerSize = outputH_ * outputW_ * channels_; - - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - return layerSize; -} - -bool CMRProjectionNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - ResponseNormLayer::init(layerMap, parameterMap); - - /* the size of inputs for norm-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - - createFunction( - forward_, - "CrossMapNormal", - FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_)); - createFunction( - backward_, - "CrossMapNormalGrad", - FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_)); - - return true; -} - -void CMRProjectionNormLayer::forward(PassType passType) { - Layer::forward(passType); - /* malloc memory for the output_ if necessary */ - /* note: one sample correspond to one row */ - MatrixPtr input = inputLayers_[0]->getOutputValue(); - size_t batchSize = input->getHeight(); - int size = getSize(); - resetOutput(batchSize, size); - - Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_); - - shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); - - // prepare forward arguments - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), shape_); - outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO); - outputs.addArg(*denoms_, shape_, ASSIGN_TO); - - forward_[0]->calc(inputs, outputs); -} - -void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { - (void)callback; - - if (NULL == getInputGrad(0)) { - return; - } - - // prepare backward arguments - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), shape_); - inputs.addArg(*getOutputValue(), shape_); - inputs.addArg(*getOutputGrad(), shape_); - inputs.addArg(*denoms_, shape_); - outputs.addArg(*getInputGrad(0), shape_, ADD_TO); - - backward_[0]->calc(inputs, outputs); -} -} // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h deleted file mode 100644 index 64803a1603599f2e393ec772a32d64f4d271fe71..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/NormProjectionLayer.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "NormLayer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief response normalization across feature maps - * namely normalize in number of size_ channels - */ -class CMRProjectionNormLayer : public ResponseNormLayer { - size_t imgSizeH_, imgSizeW_; - size_t outputH_, outputW_; - - public: - explicit CMRProjectionNormLayer(const LayerConfig& config) - : ResponseNormLayer(config) {} - - ~CMRProjectionNormLayer() {} - - size_t getSize(); - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - protected: - TensorShape shape_; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h deleted file mode 100644 index 42d525ef3e4534acea7512d5ecdbe8a0e1d110d9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Operator.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ModelConfig.pb.h" -#include "paddle/parameter/Parameter.h" - -#include "Layer.h" -#include "paddle/parameter/Argument.h" - -namespace paddle { - -// Macro for registering a operator type -// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator); -#define REGISTER_OPERATOR(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - Operator::registrar_.registerClass<__class_name>(#__type_name); \ - }) - -/** - * Operator like Projection, but takes more than one Arguments as input. - * @note: Operator can't have parameters. - */ -class Operator { - public: - static Operator* create(const OperatorConfig& config, bool useGpu); - - Operator(const OperatorConfig& config, bool useGpu) - : config_(config), useGpu_(useGpu) {} - - virtual ~Operator() {} - - const OperatorConfig& getConfig() const { return config_; } - - static ClassRegistrar registrar_; - - /** - * Forward propagation. If backward() will be called, in and out must be kept - * valid until then. - * @param ins inputs of operator - * @param out output of operator - * @param passType PASS_TRAIN of PASS_TEST - */ - void forward(std::vector ins, - Argument* out, - PassType passType) { - ins_ = ins; - out_ = out; - passType_ = passType; - forward(); - } - - virtual void prefetch(const Argument* in) {} - virtual void forward() = 0; - virtual void backward() = 0; - - /** - * See comment in Layer.h for the function with the same name. - */ - virtual void resetState() {} - - /** - * Set layer state. - */ - virtual void setState(LayerStatePtr state) {} - - /** - * Set layer state. 
- */ - virtual LayerStatePtr getState() { return nullptr; } - - protected: - /// Config of operator - OperatorConfig config_; - bool useGpu_; - - /// Store `ins` passed to forward() - std::vector ins_; - /// Store `out` passed to forward() - Argument* out_; - /// Store `passType` passed to forward() - PassType passType_; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp deleted file mode 100644 index 11a910f3316114b309efe9007a156e842b3d6229..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/OuterProdLayer.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for computing the outer product of two vectors - * @note used in NEURAL TURING MACHINE - * Input1: vector (batchSize * dim1) - * Input2: vector (batchSize * dim2) - * Output: a matrix: (batchSize * (dim1*dim2)) - */ - -class OuterProdLayer : public Layer { - protected: - MatrixPtr tmpMtx0; - MatrixPtr tmpRow0; - MatrixPtr tmpRow1; - - public: - explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {} - - ~OuterProdLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(out_prod, OuterProdLayer); - -bool OuterProdLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - size_t dim0 = inputLayers_[0]->getSize(); - size_t dim1 = inputLayers_[1]->getSize(); - - CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch"; - - tmpRow0 = Matrix::create( - nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_); - tmpRow1 = Matrix::create( - nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_); - tmpMtx0 = Matrix::create(nullptr, - /* height= */ dim0, - dim1, - /* trans= */ false, - useGpu_); - return true; -} - -void OuterProdLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV0->getHeight(); - size_t dim0 = inV0->getWidth(); - size_t dim1 = inV1->getWidth(); - - CHECK_EQ(dim0 * dim1, getSize()); - CHECK_EQ(inV1->getHeight(), batchSize); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, dim0 * dim1); - } - - MatrixPtr outV = getOutputValue(); - - { - REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str()); - for (size_t i = 0; i < batchSize; i++) { - tmpMtx0->setData(outV->getData() + i * dim0 * dim1); - tmpRow0->setData(inV0->getData() + i * dim0); - tmpRow1->setData(inV1->getData() + i * dim1); - - tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1); - } - } -} - 
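The forward pass above fills, for each instance in the batch, a dim0 x dim1 block with the outer product of the two input rows. A minimal sketch of that computation with plain std::vector standing in for Paddle's Matrix; outerProduct is an illustrative helper, not part of the layer:

#include <cstdio>
#include <vector>

// out has dim0 * dim1 entries, laid out row-major exactly like the layer's
// output row: out[i * dim1 + j] = x[i] * y[j].
std::vector<double> outerProduct(const std::vector<double>& x,
                                 const std::vector<double>& y) {
  std::vector<double> out(x.size() * y.size());
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < y.size(); ++j)
      out[i * y.size() + j] = x[i] * y[j];
  return out;
}

int main() {
  std::vector<double> x = {1, 2, 3}, y = {4, 5};
  for (double v : outerProduct(x, y)) std::printf("%g ", v);  // 4 5 8 10 12 15
  std::printf("\n");
  return 0;
}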
-void OuterProdLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr outG = getOutputGrad(); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - - size_t batchSize = inV0->getHeight(); - size_t dim0 = inV0->getWidth(); - size_t dim1 = inV1->getWidth(); - - { - REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str()); - - if (inG0) { - for (size_t i = 0; i < batchSize; i++) { - tmpMtx0->setData(outG->getData() + i * dim0 * dim1); - tmpRow0->setData(inG0->getData() + i * dim0); - tmpRow1->setData(inV1->getData() + i * dim1); - - tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1); - } - } - - if (inG1) { - for (size_t i = 0; i < batchSize; i++) { - tmpMtx0->setData(outG->getData() + i * dim0 * dim1); - tmpRow0->setData(inV0->getData() + i * dim0); - tmpRow1->setData(inG1->getData() + i * dim1); - - tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1); - } - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp deleted file mode 100644 index b1910e108b5b2f7b55a2aa1527b96e6b8a16f348..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PadLayer.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PadLayer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(pad, PadLayer); - -bool PadLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - auto& pad_conf = config_.inputs(0).pad_conf(); - auto& img_conf = pad_conf.image_conf(); - CHECK_EQ(config_.inputs_size(), 1); - inDims_ = TensorShape( - {0, - img_conf.channels(), - img_conf.has_img_size_y() ? 
img_conf.img_size_y() : img_conf.img_size(), - img_conf.img_size()}); - - CHECK_EQ(2, pad_conf.pad_c_size()); - CHECK_EQ(2, pad_conf.pad_h_size()); - CHECK_EQ(2, pad_conf.pad_w_size()); - padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)}; - padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)}; - padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)}; - - outDims_ = TensorShape(4); - setOutDims(0); - - createFunction(forward_, - "Pad", - FuncConfig() - .set("channel", padc_) - .set("height", padh_) - .set("width", padw_)); - createFunction(backward_, - "PadGrad", - FuncConfig() - .set("channel", padc_) - .set("height", padh_) - .set("width", padw_)); - - return true; -} - -void PadLayer::setOutDims(const size_t batchSize) { - outDims_.reshape({batchSize, - inDims_[1] + padc_[0] + padc_[1], - inDims_[2] + padh_[0] + padh_[1], - inDims_[3] + padw_[0] + padw_[1]}); -} - -void PadLayer::setTensorDim(const size_t batchSize) { - CHECK_EQ(static_cast(inputLayers_.size()), 1); - inDims_.setDim(0, batchSize); - int h = inputLayers_[0]->getOutput().getFrameHeight(); - if (h != 0) inDims_.setDim(2, h); - int w = inputLayers_[0]->getOutput().getFrameWidth(); - if (w != 0) inDims_.setDim(3, w); - setOutDims(batchSize); -} - -void PadLayer::forward(PassType passType) { - Layer::forward(passType); - MatrixPtr input = inputLayers_[0]->getOutputValue(); - size_t batchSize = input->getHeight(); - setTensorDim(batchSize); - int size = outDims_[1] * outDims_[2] * outDims_[3]; - resetOutput(batchSize, size); - MatrixPtr outV = getOutputValue(); - REGISTER_TIMER_INFO("PadForward", getName().c_str()); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), inDims_); - outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO); - forward_[0]->calc(inputs, outputs); -} - -void PadLayer::backward(const UpdateCallback& callback) { - (void)callback; - REGISTER_TIMER_INFO("PadBackward", getName().c_str()); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outDims_); - outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); - backward_[0]->calc(inputs, outputs); -} -} // namespace paddle diff --git a/paddle/gserver/layers/ParameterReluLayer.cpp b/paddle/gserver/layers/ParameterReluLayer.cpp deleted file mode 100644 index 12d04fc1c3ca169179beafc372a07a2e6d0a1773..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ParameterReluLayer.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
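The shape bookkeeping in PadLayer::setOutDims() above is simple arithmetic: each padded dimension grows by the sum of its before/after padding while the batch dimension is untouched. A small sketch of the same arithmetic for an NCHW tensor; paddedShape is an illustrative helper, not Paddle API:

#include <array>
#include <cstdio>

// Padded NCHW shape: channels, height and width each grow by the two padding
// amounts configured for that dimension.
std::array<size_t, 4> paddedShape(std::array<size_t, 4> in,   // {N, C, H, W}
                                  std::array<size_t, 2> padC,
                                  std::array<size_t, 2> padH,
                                  std::array<size_t, 2> padW) {
  return {in[0],
          in[1] + padC[0] + padC[1],
          in[2] + padH[0] + padH[1],
          in[3] + padW[0] + padW[1]};
}

int main() {
  auto out = paddedShape({8, 3, 32, 32}, {0, 0}, {1, 1}, {2, 2});
  std::printf("%zu %zu %zu %zu\n", out[0], out[1], out[2], out[3]);  // 8 3 34 36
  return 0;
}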
*/ - -#include "ParameterReluLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(prelu, ParameterReluLayer); - -bool ParameterReluLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - CHECK_EQ(inputLayers_.size(), 1UL); - CHECK_EQ(inputLayers_.size(), parameters_.size()); - partialSum_ = config_.partial_sum(); - CHECK_GT(partialSum_, 0UL) << "partial_sum must be larger than zero."; - CHECK(!(inputLayers_[0]->getSize() % partialSum_)) - << "Incorrect value for partialSum: " << partialSum_ - << " must divide input size: " << inputLayers_[0]->getSize(); - CHECK_EQ(getSize() / partialSum_, parameters_[0]->getSize()); - weight_ = std::unique_ptr(new Weight( - 1UL, inputLayers_[0]->getSize() / partialSum_, parameters_[0])); - return true; -} - -void ParameterReluLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInput(0).getBatchSize(); - int size = getSize(); - reserveOutput(batchSize, size); - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - outV->paramReluForward(*(getInput(0).value), *(weight_->getW())); - } -} - -void ParameterReluLayer::backward(const UpdateCallback& callback) { - if (weight_->getWGrad()) { - weight_->getWGrad()->paramReluBackwardW(*getOutputGrad(), - *(getInputValue(0))); - } - - MatrixPtr preGrad = getInputGrad(0); - preGrad->paramReluBackwardDiff( - *getOutputGrad(), *(getInputValue(0)), *(weight_->getW())); - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - weight_->getParameterPtr()->incUpdate(callback); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h deleted file mode 100644 index 4553413fcdbecbc83e1f50e8ffbe874fdf05d828..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ParameterReluLayer.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief ParameterReluLayer active inputs with learnable parameter weight_. - * forward: - * \f[ - * y = x > 0 ? x : w .* x - * \f] - * backward: - * \f[ - * dx = x > 0 ? dy : w .* dy \\ - * dw = x > 0 ? 0 : dy.*x - * \f] - * Here, x is the input, w is the weight, y is the output. - * dx, dw, dy is the gradient. 
- */ - -class ParameterReluLayer : public Layer { - protected: - std::unique_ptr weight_; - - /** - * @brief partialSum_ makes a group of inputs share same weights, - * - partialSum_ = 1: - * element wise activation: each element has a weight_, - * - partialSum_ = number of elements in one channel, - * channels wise parameter activation, elements in a channel - * share same weight_, - * - partialSum_ = number of outputs - * all elements share same weight_, - */ - size_t partialSum_; - - public: - explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {} - - ~ParameterReluLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp deleted file mode 100644 index 3ac9eb0d8198814c9f01fe101a60ab1f1f431062..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Pool3DLayer.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Pool3DLayer.h" -#include "PoolProjectionLayer.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -REGISTER_LAYER(pool3d, Pool3DLayer); - -bool Pool3DLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - /* the size of inputs for pool-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - - const PoolConfig& conf = config_.inputs(0).pool_conf(); - poolType_ = conf.pool_type(); - channels_ = conf.channels(); - - sizeX_ = conf.size_x(); - sizeY_ = conf.size_y(); - sizeZ_ = conf.size_z(); - - strideW_ = conf.stride(); - strideH_ = conf.stride_y(); - strideD_ = conf.stride_z(); - - imgSizeW_ = conf.img_size(); - imgSizeH_ = conf.img_size_y(); - imgSizeD_ = conf.img_size_z(); - - paddingW_ = conf.padding(); - paddingH_ = conf.padding_y(); - paddingD_ = conf.padding_z(); - - outputW_ = conf.output_x(); - outputH_ = conf.output_y(); - outputD_ = conf.output_z(); - - return true; -} - -size_t Pool3DLayer::getSize() { - CHECK_EQ(inputLayers_.size(), 1UL); - - size_t layerSize = 0; - outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false); - outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false); - outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false); - - layerSize = outputD_ * outputH_ * outputW_ * channels_; - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - getOutput().setFrameDepth(outputD_); - return layerSize; -} - -void Pool3DLayer::forward(PassType passType) { - Layer::forward(passType); - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - size_t batchSize = inMat->getHeight(); - size_t outWidth = getSize(); - resetOutput(batchSize, outWidth); - Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_); - const MatrixPtr 
outMat = getOutputValue(); - - if (poolType_ == "avg") { - outMat->avgPool3DForward(*inMat, - channels_, - imgSizeD_, - imgSizeH_, - imgSizeW_, - outputD_, - outputH_, - outputW_, - sizeZ_, - sizeY_, - sizeX_, - strideD_, - strideH_, - strideW_, - paddingD_, - paddingH_, - paddingW_); - } else if (poolType_ == "max") { - outMat->maxPool3DForward(*inMat, - *maxPoolIdx_, - channels_, - imgSizeD_, - imgSizeH_, - imgSizeW_, - outputD_, - outputH_, - outputW_, - sizeZ_, - sizeY_, - sizeX_, - strideD_, - strideH_, - strideW_, - paddingD_, - paddingH_, - paddingW_); - } else { - LOG(FATAL) << "Unknown pool type: " << poolType_; - } - forwardActivation(); -} - -void Pool3DLayer::backward(const UpdateCallback& callback) { - backwardActivation(); - - (void)callback; - if (NULL == getInputGrad(0)) return; - MatrixPtr inMat = inputLayers_[0]->getOutputValue(); - MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad(); - MatrixPtr outMat = getOutputValue(); - MatrixPtr outGradMat = getOutputGrad(); - - if (poolType_ == "avg") { - inGradMat->avgPool3DBackward(*outGradMat, - imgSizeD_, - imgSizeH_, - imgSizeW_, - outputD_, - outputH_, - outputW_, - sizeZ_, - sizeY_, - sizeZ_, - strideD_, - strideH_, - strideW_, - paddingD_, - paddingH_, - paddingW_, - 1.0, - 1.0); - } else if (poolType_ == "max") { - inGradMat->maxPool3DBackward(*outGradMat, - *maxPoolIdx_, - imgSizeD_, - imgSizeH_, - imgSizeW_, - outputD_, - outputH_, - outputW_, - sizeZ_, - sizeY_, - sizeZ_, - strideD_, - strideH_, - strideW_, - paddingD_, - paddingH_, - paddingW_, - 1.0, - 1.0); - } else { - LOG(FATAL) << "Unknown pool type: " << poolType_; - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h deleted file mode 100644 index 32605f8b7028cfb4909c885e83017a8cffa79575..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Pool3DLayer.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
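The pooling layers in this diff size their output with outputSize(imageSize, kernel, padding, stride, /*caffeMode=*/false) from paddle/math/MathUtils, which is not shown here. A hedged sketch of the usual ceil-versus-floor convention; treat the exact rounding as an assumption and defer to MathUtils for the authoritative definition:

#include <cstdio>

// Output size of pooling/convolution along one spatial dimension.
int outputSize(int imageSize, int kernel, int padding, int stride,
               bool caffeMode) {
  if (caffeMode)  // floor((i - k + 2p) / s) + 1
    return (imageSize - kernel + 2 * padding) / stride + 1;
  // caffeMode = false: ceil((i - k + 2p) / s) + 1
  return (imageSize - kernel + 2 * padding + stride - 1) / stride + 1;
}

int main() {
  // 6-wide input, 3-wide window, no padding, stride 2.
  std::printf("caffeMode=false: %d\n", outputSize(6, 3, 0, 2, false));  // 3
  std::printf("caffeMode=true : %d\n", outputSize(6, 3, 0, 2, true));   // 2
  return 0;
}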
*/ - -#pragma once - -#include -#include "Layer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief Basic parent layer of pooling - * Pools the input within regions - */ -class Pool3DLayer : public Layer { - public: - explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {} - ~Pool3DLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - size_t getSize(); - - protected: - int channels_; - int sizeX_, sizeY_, sizeZ_; - int strideW_, strideH_, strideD_; - int paddingW_, paddingH_, paddingD_; - int imgSizeW_, imgSizeH_, imgSizeD_; - int outputW_, outputH_, outputD_; - std::string poolType_; - MatrixPtr maxPoolIdx_; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp deleted file mode 100644 index ee589e6be51b1e66984f5a1d808b73aab962821d..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PoolLayer.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "PoolLayer.h" -#include "MaxPoolWithMaskLayer.h" -#include "PoolProjectionLayer.h" -#include "paddle/utils/Logging.h" -#ifdef PADDLE_WITH_CUDA -#include "CudnnPoolLayer.h" -#endif -namespace paddle { - -REGISTER_LAYER_CREATE_FUNC(pool, &PoolLayer::create); - -bool PoolLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* the size of inputs for pool-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - - const PoolConfig& conf = config_.inputs(0).pool_conf(); - poolType_ = conf.pool_type(); - channels_ = conf.channels(); - sizeX_ = conf.size_x(); - stride_ = conf.stride(); - outputX_ = conf.output_x(); - imgSize_ = conf.img_size(); - confPadding_ = conf.padding(); - - sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x(); - imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); - confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); - outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - - excludeMode_ = conf.has_exclude_mode() ? 
conf.exclude_mode() : true; - return true; -} - -Layer* PoolLayer::create(const LayerConfig& config) { - CHECK_EQ(config.inputs_size(), 1); - const std::string& pool = config.inputs(0).pool_conf().pool_type(); - if (pool == "max-projection" || pool == "avg-projection") { - return new PoolProjectionLayer(config); -#ifdef PADDLE_WITH_CUDA - } else if (CudnnPoolLayer::typeCheck(pool)) { - return new CudnnPoolLayer(config); -#endif - } else if (pool == "max-pool-with-mask") { - return new MaxPoolWithMaskLayer(config); - } else { - LOG(FATAL) << "Unknown pool type: " << pool; - return nullptr; - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h deleted file mode 100644 index 99f8f148e2eb00f7e431e7d8c5acbf9e27574017..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PoolLayer.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "Layer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -/** - * @brief Basic parent layer of pooling - * Pools the input within regions - */ -class PoolLayer : public Layer { - protected: - size_t channels_, sizeX_, stride_, outputX_, imgSize_; - int confPadding_; - - size_t sizeY_; - size_t imgSizeY_; - size_t strideY_; - size_t outputY_; - int confPaddingY_; - - std::string poolType_; - - bool excludeMode_; - - public: - explicit PoolLayer(const LayerConfig& config) : Layer(config) {} - - /** - * @brief create pooling layer by pool_type - */ - static Layer* create(const LayerConfig& config); - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h deleted file mode 100644 index 8004cc1550337160b7f022c97a23ed8eb9d43ca4..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PoolProjection.h +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "Projection.h" -#include "paddle/math/MathUtils.h" - -namespace paddle { - -class PoolProjection : public Projection { - protected: - size_t imgSizeY_, imgSize_; - size_t outputY_, outputX_; - size_t strideY_, stride_; - size_t sizeY_, sizeX_; - int confPaddingY_, confPadding_; - size_t channels_; - std::string poolType_; - bool excludeMode_; - - public: - PoolProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu); - - static PoolProjection* create(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu); - - const std::string& getPoolType() const { return poolType_; } - - size_t getSize(); -}; - -class MaxPoolProjection : public PoolProjection { - public: - MaxPoolProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : PoolProjection(config, parameter, useGpu) {} - - virtual void forward(); - virtual void backward(const UpdateCallback& callback = nullptr); -}; - -class AvgPoolProjection : public PoolProjection { - public: - AvgPoolProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : PoolProjection(config, parameter, useGpu) {} - - virtual void forward(); - virtual void backward(const UpdateCallback& callback = nullptr); -}; -} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp deleted file mode 100644 index 73d320e67ec09513f419ecdd45a57fc5c54df5ed..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PoolProjectionLayer.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "PoolProjectionLayer.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -size_t PoolProjectionLayer::getSize() { - CHECK_EQ(inputLayers_.size(), 1UL); - size_t layerSize = 0; - imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imgSizeH_ == 0) { - imgSizeH_ = imgSizeY_; - } - if (imgSizeW_ == 0) { - imgSizeW_ = imgSize_; - } - - outputH_ = outputSize(imgSizeH_, - sizeY_, - confPaddingY_, - strideY_, - /* caffeMode */ false); - outputW_ = outputSize(imgSizeW_, - sizeX_, - confPadding_, - stride_, - /* caffeMode */ false); - - layerSize = outputH_ * outputW_ * channels_; - - return layerSize; -} - -void PoolProjectionLayer::forward(PassType passType) { - Layer::forward(passType); - const Argument& in = getInput(0); - int batchSize = in.value->getHeight(); - int size = getSize(); - resetOutput(batchSize, size); - poolProjection_->forward(&in, &output_, passType); -} - -void PoolProjectionLayer::backward(const UpdateCallback& callback) { - (void)callback; - if (NULL == getInputGrad(0)) { - return; - } - poolProjection_->backward(callback); -} -} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h deleted file mode 100644 index 9ad144cc2ad426caa522bf1061a750d47e64a755..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PoolProjectionLayer.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "PoolLayer.h" -#include "PoolProjection.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * @brief Basic parent layer of different kinds of pooling - */ -class PoolProjectionLayer : public PoolLayer { - protected: - size_t imgSizeH_, imgSizeW_; - size_t outputH_, outputW_; - std::unique_ptr poolProjection_; - ProjectionConfig projectionConfig_; - - public: - explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) { - PoolConfig* conf = projectionConfig_.mutable_pool_conf(); - *conf = config_.inputs(0).pool_conf(); - poolProjection_.reset( - PoolProjection::create(projectionConfig_, nullptr, useGpu_)); - } - - size_t getSize(); - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp deleted file mode 100644 index 7e8d60db8fe588026c6040099745c3aefd7237b5..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PowerLayer.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * This layer applys a power function to a vector element-wise, - * which is used in NEURAL TURING MACHINE. - * \f[ - * y = x^w - * \f] - * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight, - * and output \f$y\f$ is a vector. - * - * The config file api is power_layer. - */ - -class PowerLayer : public Layer { - protected: - MatrixPtr tmpMtx; - - public: - explicit PowerLayer(const LayerConfig& config) : Layer(config) {} - - ~PowerLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(power, PowerLayer); - -bool PowerLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - return true; -} - -void PowerLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - CHECK_EQ(getSize(), dataDim); - CHECK_EQ(1U, inV0->getWidth()); - CHECK_EQ(batchSize, inV0->getHeight()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - - { - REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str()); - outV->rowPow(0, *inV1, *inV0); - } -} - -void PowerLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV0 = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - MatrixPtr outV = getOutputValue(); - MatrixPtr outG = getOutputGrad(); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - { - REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str()); - Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_); - - if (inG0) { - tmpMtx->log2(*inV1); - tmpMtx->dotMul(*tmpMtx, *outV); - - // inG0 += outG .* (log(inV1) * outV) - inG0->rowDotMul(0, *outG, *tmpMtx); - } - - if (inG1) { - // tmp = (outV / inV1) * inV0 - tmpMtx->dotDiv(*outV, *inV1); - tmpMtx->rowScale(0, *tmpMtx, *inV0); - - inG1->addDotMul(*outG, *tmpMtx, 1, 1); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp deleted file mode 100644 index 39d2c2d737fa90737635efdb209610e156c8662f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/PriorBox.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
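The power layer above computes y = x^w element-wise, and its backward pass uses the two textbook derivatives: d y / d w = y * log(x) and d y / d x = w * y / x. A standalone numeric check; powerWithGrads is an illustrative helper, not the layer's API:

#include <cmath>
#include <cstdio>

struct PowerGrads {
  double y, dw, dx;
};

// Forward value and both gradients for a single element, scaled by the
// incoming gradient dy, matching the backward pass above.
PowerGrads powerWithGrads(double x, double w, double dy) {
  double y = std::pow(x, w);
  return {y, dy * y * std::log(x), dy * w * y / x};
}

int main() {
  auto g = powerWithGrads(/*x=*/2.0, /*w=*/3.0, /*dy=*/1.0);
  std::printf("y=%g dw=%g dx=%g\n", g.y, g.dw, g.dx);  // y=8, dw=8*ln(2), dx=12
  return 0;
}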
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * @brief A layer for generating priorbox locations and variances. - * - Input: Two and only two input layer are accepted. The input layer must be - * be a data output layer and a convolution output layer. - * - Output: The priorbox locations and variances of the input data. - * Reference: - * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, - * Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector - */ - -class PriorBoxLayer : public Layer { - public: // NOLINT - explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {} - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override {} - - protected: // NOLINT - int numPriors_; - std::vector minSize_; - std::vector maxSize_; - std::vector aspectRatio_; - std::vector variance_; - MatrixPtr buffer_; -}; - -REGISTER_LAYER(priorbox, PriorBoxLayer); - -bool PriorBoxLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - auto pbConf = config_.inputs(0).priorbox_conf(); - std::vector tmp; - aspectRatio_.push_back(1.); - std::copy(pbConf.min_size().begin(), - pbConf.min_size().end(), - std::back_inserter(minSize_)); - std::copy(pbConf.max_size().begin(), - pbConf.max_size().end(), - std::back_inserter(maxSize_)); - std::copy(pbConf.variance().begin(), - pbConf.variance().end(), - std::back_inserter(variance_)); - std::copy(pbConf.aspect_ratio().begin(), - pbConf.aspect_ratio().end(), - std::back_inserter(tmp)); - - if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size()); - - // flip aspect ratios - for (unsigned index = 0; index < tmp.size(); index++) { - real ar = tmp[index]; - if (fabs(ar - 1.) < 1e-6) continue; - aspectRatio_.push_back(ar); - aspectRatio_.push_back(1. 
/ ar); - } - - numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size(); - - return true; -} - -void PriorBoxLayer::forward(PassType passType) { - Layer::forward(passType); - auto input = getInput(0); - int layerWidth = input.getFrameWidth(); - int layerHeight = input.getFrameHeight(); - - auto image = getInput(1); - int imageWidth = image.getFrameWidth(); - int imageHeight = image.getFrameHeight(); - - real stepW = static_cast(imageWidth) / layerWidth; - real stepH = static_cast(imageHeight) / layerHeight; - int dim = layerHeight * layerWidth * numPriors_ * 4; - reserveOutput(1, dim * 2); - // use a cpu buffer to compute - Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false); - auto* tmpPtr = buffer_->getData(); - - int idx = 0; - for (int h = 0; h < layerHeight; ++h) { - for (int w = 0; w < layerWidth; ++w) { - real centerX = (w + 0.5) * stepW; - real centerY = (h + 0.5) * stepH; - for (size_t s = 0; s < minSize_.size(); s++) { - real minSize = minSize_[s]; - real boxWidth = minSize; - real boxHeight = minSize; - - // first prior: aspect_ratio == 1.0, compatible to old logic - tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; - tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; - tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; - tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; - // set the variance. - for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; - - if (maxSize_.size() > 0) { - // square prior with size sqrt(minSize * maxSize) - real maxSize = maxSize_[s]; - boxWidth = boxHeight = sqrt(minSize * maxSize); - tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; - tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; - tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; - tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; - // set the variance. - for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; - } - - // priors with different aspect ratios - for (size_t r = 0; r < aspectRatio_.size(); r++) { - real ar = aspectRatio_[r]; - if (fabs(ar - 1.0) < 1e-6) { - continue; - } - boxWidth = minSize * sqrt(ar); - boxHeight = minSize / sqrt(ar); - tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; - tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; - tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; - tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; - // set the variance. - for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; - } - } - } - } - - // clip the prior's coordidate such that it is within [0, 1] - for (int d = 0; d < dim * 2; ++d) - if ((d % 8) < 4) - tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.); - MatrixPtr outV = getOutputValue(); - outV->copyFrom(buffer_->data_, dim * 2); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h deleted file mode 100644 index 88a41355cfce711e1e9522655058d0f1198e4e76..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/Projection.h +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
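Each prior box emitted by the loop above is four corner coordinates, derived from a cell center and a box size and normalized by the image size, followed by the four variance values. A minimal sketch of one such box; the priorBox helper and the concrete numbers are illustrative only:

#include <array>
#include <cstdio>

// One prior box in normalized [0, 1] image coordinates:
// {xmin, ymin, xmax, ymax} around the given center.
std::array<double, 4> priorBox(double centerX, double centerY,
                               double boxW, double boxH,
                               double imageW, double imageH) {
  return {(centerX - boxW / 2.0) / imageW,
          (centerY - boxH / 2.0) / imageH,
          (centerX + boxW / 2.0) / imageW,
          (centerY + boxH / 2.0) / imageH};
}

int main() {
  // Cell (3, 2) of a feature map with step 8 on a 320x240 image, min_size 30.
  double stepW = 8, stepH = 8;
  auto box = priorBox((3 + 0.5) * stepW, (2 + 0.5) * stepH, 30, 30, 320, 240);
  std::printf("%f %f %f %f\n", box[0], box[1], box[2], box[3]);
  return 0;
}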
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "ModelConfig.pb.h" -#include "paddle/parameter/Parameter.h" - -namespace paddle { - -// Macro for registering a projection type -// Example: REGISTER_LAYER(fc, FullMatrixProjection); -#define REGISTER_PROJECTION(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - Projection::registrar_.registerClass<__class_name>(#__type_name); \ - }) - -#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction) \ - static InitFunction __reg_type_##__type_name([]() { \ - Projection::registrar_.registerClass(#__type_name, createFunction); \ - }) - -/** - * A projection takes one Argument as input, calculate the result and add it - * to output Argument. - */ -class Projection { - public: - static Projection* create(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu); - - Projection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGpu) - : config_(config), parameter_(parameter), useGpu_(useGpu) {} - - virtual ~Projection() {} - - const std::string& getName() const { return config_.name(); } - - /// Register a projection - static ClassRegistrar - registrar_; - - /** - * Forward propagation. If backward() will be called, in and out must be kept - * valid until then. - * @param in input of projection - * @param out output of projection - * @param passType PASS_TRAIN of PASS_TEST - */ - void forward(const Argument* in, const Argument* out, PassType passType) { - in_ = in; - out_ = out; - passType_ = passType; - forward(); - } - - virtual void prefetch(const Argument* in) {} - virtual void forward() = 0; - virtual void backward(const UpdateCallback& callback) = 0; - - /** - * See comment in Layer.h for the function with the same name. - */ - virtual void resetState() {} - - /** - * Set layer state. - */ - virtual void setState(LayerStatePtr state) {} - - /** - * Get layer state. A copy of internal state is returned. - */ - virtual LayerStatePtr getState() { return nullptr; } - - /** - * init forward_ and backward_ functions - */ - virtual bool init() { return true; } - - /** - * Get output size of projection. - */ - size_t getOutputSize() const { return config_.output_size(); } - - protected: - /** - * Create layer function. Function is called in forward or backward. 
- * \param function, Layer::forward_ or Layer::backward_ - * \param name, function name - * \param config, initialization configuration for the function - */ - void createFunction(std::vector>& function, - const std::string& name, - const FuncConfig& config) { - if (useGpu_) { - function.emplace_back( - FunctionBase::funcRegistrar_.createByType(name + "-GPU")); - } else { - function.emplace_back( - FunctionBase::funcRegistrar_.createByType(name + "-CPU")); - } - auto& func = function.back(); - func->init(config); - } - - protected: - /// Config of projection - ProjectionConfig config_; - /// Parameter of projection - ParameterPtr parameter_; - bool useGpu_; - - /// Store `in` passed to forward() - const Argument* in_; - /// Store `out` passed to forward() - const Argument* out_; - /// Store `passType` passed to forward() - PassType passType_; - /// Layer forward function - std::vector> forward_; - /// Layer backward function - std::vector> backward_; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h deleted file mode 100644 index 94e633e65777aad540738ea67ea1b4e03dd75954..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/RecurrentLayer.h +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once -#include -#include "Layer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief RecurrentLayer takes 1 input layer. The output size is the same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. - */ - -class RecurrentLayer : public Layer { - public: - explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - - protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. 
- */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - virtual void forwardBatch(int batchSize, - size_t numSequences, - const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - virtual void backwardBatch(int batchSize, - size_t numSequences, - const int* starts); - - protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. - std::unique_ptr batchGrad_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp deleted file mode 100644 index 6694e8f2996fdd2c98da1507e5fb3b90b271c850..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/RecurrentLayerGroup.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
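// --- Illustrative sketch (not part of the diff above): the per-sequence
// recurrence documented for RecurrentLayer,
//     out[start] = act(in[start]),   out[i] = act(in[i] + out[i-1] * W),
// written with plain std::vector matrices and tanh as the activation.
// The names below (simpleRecurrentForward, rowTimesMat) are hypothetical and
// only approximate what forwardOneSequence() computes for one sequence.
#include <cmath>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;  // row-major, D x D

// out[i-1] * W: left-multiply a 1 x D row vector by the D x D weight matrix.
static Vec rowTimesMat(const Vec& v, const Mat& W) {
  Vec r(W[0].size(), 0.f);
  for (size_t k = 0; k < v.size(); ++k)
    for (size_t j = 0; j < r.size(); ++j) r[j] += v[k] * W[k][j];
  return r;
}

// Forward pass over one (non-reversed) sequence; in[t] is the t-th instance.
std::vector<Vec> simpleRecurrentForward(const std::vector<Vec>& in, const Mat& W) {
  std::vector<Vec> out(in.size());
  for (size_t t = 0; t < in.size(); ++t) {
    Vec pre = in[t];
    if (t > 0) {
      Vec rec = rowTimesMat(out[t - 1], W);
      for (size_t j = 0; j < pre.size(); ++j) pre[j] += rec[j];
    }
    for (float& x : pre) x = std::tanh(x);  // activation
    out[t] = pre;
  }
  return out;
}
// For the reversed_ == true case the same update runs from the last instance
// of the sequence towards the first one.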
*/ - -#include -#include "paddle/gserver/layers/Layer.h" - -#include "paddle/gserver/gradientmachines/RecurrentGradientMachine.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * Recurrent layer group is a group of layers, which forward/backward one frame - * after previous frame forward/backward through all layers in layer group. - * It's automatically added by config_parser if some layers are defined - * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd. - */ -class RecurrentLayerGroup : public Layer { - public: - explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {} - - void initSubNetwork(NeuralNetwork* rootNetwork, - const ModelConfig& config, - const std::vector& parameterTypes, - bool useGpu) override; - - void forward(PassType passType) override { - REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str()); - const std::vector inArgs; - std::vector outArgs; - network_->forward(inArgs, &outArgs, passType); - } - void backward(const UpdateCallback& callback) override { - REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str()); - network_->backward(nullptr); - - for (auto& para : parameters_) { - para->incUpdate(callback); - } - } - - /** - * @see Layer.accessSubNetwork - */ - void accessSubNetwork( - const std::function& callback) override { - callback(*network_); - } - - private: - std::unique_ptr network_; -}; - -REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup); - -void RecurrentLayerGroup::initSubNetwork( - NeuralNetwork* rootNetwork, - const ModelConfig& config, - const std::vector& parameterTypes, - bool useGpu) { - setNeedGradient(true); - - network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork)); - ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) { - para->enableSharedType( - PARAMETER_VALUE, - rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE), - rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE)); - para->enableSharedType( - PARAMETER_GRADIENT, - rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT), - rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT)); - }; - network_->init(config, cb, parameterTypes, useGpu); - - for (auto paramId : network_->getParameterIds()) { - ParameterPtr parameter = rootNetwork->getParameters()[paramId]; - parameter->incShared(); - CHECK_EQ(parameter->getDeviceId(), getDeviceId()); - parameters_.push_back(parameter); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp deleted file mode 100644 index d4ae9945934a40719d253d4b53915530423448af..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ResizeLayer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Layer.h" -#include "paddle/math/BaseMatrix.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * @brief A layer for resizing a minibatch matrix h*w to h'*w' - * @note - * origin matrix height * width) - * resize matrix: (height * width / size) * size - */ -class ResizeLayer : public Layer { - public: - explicit ResizeLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; -}; - -REGISTER_LAYER(resize, ResizeLayer); - -bool ResizeLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - - setNeedSequenceInfo(false); - return true; -} - -void ResizeLayer::forward(PassType passType) { - Layer::forward(passType); - const Argument& input = getInput(0); - size_t height = input.value->getHeight(); - size_t width = input.value->getWidth(); - CHECK_EQ((height * width) % getSize(), 0UL); - - reserveOutput(height * width / getSize(), getSize()); - MatrixPtr tmp = - Matrix::create(output_.value->getData(), height, width, false, useGpu_); - tmp->assign(*input.value); -} - -void ResizeLayer::backward(const UpdateCallback& callback) { - const Argument& input = getInput(0); - size_t height = input.value->getHeight(); - size_t width = input.value->getWidth(); - - if (!input.grad) { - return; - } - - MatrixPtr tmp = Matrix::create(input.grad->getData(), - height * width / getSize(), - getSize(), - false, - useGpu_); - tmp->add(*output_.grad); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h deleted file mode 100644 index 7ecbff20167dd95f782f2d61dc34697ab3273934..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/RotateLayer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * A layer for rotating a multi-channel feature map (M x N x C) in the spatial - * domain - * The rotation is 90 degrees in clock-wise for each channel - * \f[ - * y(j,i,:) = x(M-i-1,j,:) - * \f] - * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output. 
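// --- Illustrative sketch (not part of the diff above): the 90-degree
// clockwise rotation RotateLayer documents, y(j, i, c) = x(M - i - 1, j, c),
// applied to one channel stored as a row-major M x N buffer. The function
// and variable names are hypothetical; the real layer applies this index
// mapping to every channel of an (M x N x C) feature map.
#include <vector>

// Rotate one M x N channel clockwise; the result is N x M (row-major).
std::vector<float> rotate90Clockwise(const std::vector<float>& x, int M, int N) {
  std::vector<float> y(static_cast<size_t>(N) * M);
  for (int r = 0; r < N; ++r) {      // output row index    (0 .. N-1)
    for (int c = 0; c < M; ++c) {    // output column index (0 .. M-1)
      // y(r, c) = x(M - 1 - c, r), i.e. y(j, i) = x(M - i - 1, j) above.
      y[static_cast<size_t>(r) * M + c] =
          x[static_cast<size_t>(M - 1 - c) * N + r];
    }
  }
  return y;
}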
- * - * The config file api is rotate_layer - * - */ - -class RotateLayer : public Layer { - public: - explicit RotateLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - void forward(PassType passType); - void backward(const UpdateCallback& callback = nullptr); - - private: - int batchSize_; - int size_; - int height_; - int width_; - int channels_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/RowConvLayer.cpp b/paddle/gserver/layers/RowConvLayer.cpp deleted file mode 100644 index 63b499e486fd24b5f816ee0e897b040ee5007581..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/RowConvLayer.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "RowConvLayer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(row_conv, RowConvLayer); - -bool RowConvLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - contexLength_ = config_.inputs(0).row_conv_conf().context_length(); - - CHECK_EQ(inputLayers_.size(), 1UL); - weight_.reset(new Weight(contexLength_, getSize(), parameters_[0])); - createFunction(forward_, "RowConv", FuncConfig()); - createFunction(backward_, "RowConvGrad", FuncConfig()); - - return true; -} - -void RowConvLayer::forward(PassType passType) { - Layer::forward(passType); - MatrixPtr input = getInputValue(0); - size_t height = input->getHeight(); - size_t width = input->getWidth(); - CHECK_EQ(width, getSize()); - resetOutput(height, width); - - const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_); - MatrixPtr w = weight_->getW(); - wDims_ = TensorShape({w->getHeight(), w->getWidth()}); - - MatrixPtr outV = getOutputValue(); - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), *startPos); - inputs.addArg(*w, wDims_); - outputs.addArg(*getOutputValue(), *startPos, ADD_TO); - - { - REGISTER_TIMER_INFO("RowConvForward", getName().c_str()); - forward_[0]->calc(inputs, outputs); - } - - /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); - forwardActivation(); - } -} - -void RowConvLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } - - const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_); - - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), *startPos); - inputs.addArg(*getInputValue(0), *startPos); - inputs.addArg(*weight_->getW(), wDims_); - - MatrixPtr inGrad = getInputGrad(0); - MatrixPtr wGrad = weight_->getWGrad(); - size_t h = getInputValue(0)->getHeight(); - size_t w = getInputValue(0)->getWidth(); - outputs.addArg( - inGrad ? 
(*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)), - *startPos, - ADD_TO); - outputs.addArg( - wGrad ? (*wGrad) - : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)), - wDims_, - ADD_TO); - - { - REGISTER_TIMER_INFO("RowConvBackward", getName().c_str()); - backward_[0]->calc(inputs, outputs); - } - - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - weight_->getParameterPtr()->incUpdate(callback); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp deleted file mode 100644 index 68a0ff735844679df1393473355f54ee616c09bd..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ScaleSubRegionLayer.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ScaleSubRegionLayer.h" -#include "paddle/utils/Stat.h" -namespace paddle { - -REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer); - -bool ScaleSubRegionLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - CHECK_EQ(static_cast(inputLayers_.size()), 2); - auto& conf = config_.inputs(0).scale_sub_region_conf(); - value_ = conf.value(); - - createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_)); - createFunction( - backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_)); - - return true; -} - -void ScaleSubRegionLayer::forward(PassType passType) { - Layer::forward(passType); - auto in0 = getInput(0); - imgH_ = in0.getFrameHeight(); - imgW_ = in0.getFrameWidth(); - if (imgH_ == 0 || imgW_ == 0) { - auto& conf = config_.inputs(0).scale_sub_region_conf(); - imgH_ = conf.image_conf().img_size_y(); - imgW_ = conf.image_conf().img_size(); - } - MatrixPtr imgV = in0.value; - size_t batchSize = imgV->getHeight(); - size_t spatialSize = imgH_ * imgW_; - channelsNum_ = imgV->getWidth() / spatialSize; - shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_}); - - resetOutput(batchSize, imgV->getWidth()); - auto& out = getOutput(); - out.setFrameHeight(imgH_); - out.setFrameWidth(imgW_); - - MatrixPtr indicesV = getInputValue(1); - indicesShape_ = TensorShape({batchSize, 6}); - - REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str()); - BufferArgs inArgs; - BufferArgs outArgs; - inArgs.addArg(*imgV, shape_); - inArgs.addArg(*indicesV, indicesShape_); - outArgs.addArg(*out.value, shape_, ASSIGN_TO); - forward_[0]->calc(inArgs, outArgs); -} - -void ScaleSubRegionLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str()); - BufferArgs inArgs; - BufferArgs outArgs; - inArgs.addArg(*getOutputGrad(), shape_); - inArgs.addArg(*getInputValue(1), indicesShape_); - outArgs.addArg(*getInputGrad(0), shape_, ADD_TO); - backward_[0]->calc(inArgs, outArgs); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp deleted file 
mode 100644 index 15e07daebee194a789da52d37a192e031348300c..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ScalingLayer.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for each row of a matrix, multiplying with a element of a vector, - * which is used in NEURAL TURING MACHINE. - * \f[ - * y.row[i] = w[i] * x.row[i] - * \f] - * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is - * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output. - * - * The config file api is scaling_layer. - */ - -class ScalingLayer : public Layer { - public: - explicit ScalingLayer(const LayerConfig& config) : Layer(config) {} - - ~ScalingLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(scaling, ScalingLayer); - -bool ScalingLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 2U); - - return true; -} - -void ScalingLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - - size_t batchSize = inV1->getHeight(); - size_t dataDim = inV1->getWidth(); - - CHECK_EQ(dataDim, getSize()); - CHECK_EQ(weightV->getWidth(), 1U); - CHECK_EQ(weightV->getHeight(), batchSize); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str()); - // outV += inV1 * weight - outV->addRowScale(0, *inV1, *weightV); - } -} - -void ScalingLayer::backward(const UpdateCallback& callback) { - MatrixPtr weightV = getInputValue(0); - MatrixPtr inV1 = getInputValue(1); - MatrixPtr inG0 = getInputGrad(0); - MatrixPtr inG1 = getInputGrad(1); - MatrixPtr outG = getOutputGrad(); - - { - REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str()); - - if (inG0) { - // inG0 += outG .* inV1 - inG0->rowDotMul(0, *outG, *inV1); - } - - if (inG1) { - // inG1 += outG * weight; - inG1->addRowScale(0, *outG, *weightV); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp deleted file mode 100644 index 43c98993f3f6f74c034c59176378c3ea97a9c19b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp +++ /dev/null @@ -1,336 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
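// --- Illustrative sketch (not part of the diff above): the per-row scaling
// that ScalingLayer documents, y.row[i] = w[i] * x.row[i], plus the two
// gradients its backward pass accumulates (into pre-sized dw and dx).
// Names such as rowScaleForward are hypothetical; plain row-major vectors
// stand in for Matrix here.
#include <vector>

using Rows = std::vector<std::vector<float>>;  // batchSize x dataDim

Rows rowScaleForward(const Rows& x, const std::vector<float>& w) {
  Rows y = x;
  for (size_t i = 0; i < x.size(); ++i)
    for (float& v : y[i]) v *= w[i];  // scale the whole row by w[i]
  return y;
}

// dL/dw[i]    = sum_j dy[i][j] * x[i][j]   (the rowDotMul above)
// dL/dx[i][j] = dy[i][j] * w[i]            (the addRowScale above)
void rowScaleBackward(const Rows& x, const std::vector<float>& w, const Rows& dy,
                      std::vector<float>* dw, Rows* dx) {
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < x[i].size(); ++j) {
      (*dw)[i] += dy[i][j] * x[i][j];
      (*dx)[i][j] += dy[i][j] * w[i];
    }
}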
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "SelectiveFullyConnectedLayer.h" -#include -#include -#include "paddle/math/SparseMatrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer); - -bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - inputNum_ = inputLayers_.size(); - if (config_.has_selected_colums()) { - inputNum_ -= 1; - } - for (size_t i = 0; i < inputNum_; i++) { - size_t height = inputLayers_[i]->getSize(); - size_t width = getSize(); - // NOTE weight is transpoed - weights_.emplace_back(new Weight(width, height, parameters_[i])); - } - - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - fullOutput_ = false; - - return true; -} - -void SelectiveFullyConnectedLayer::prefetch() {} - -void SelectiveFullyConnectedLayer::reserveOutput(size_t height, - size_t width, - size_t nnz) { - bool flag = (passType_ == PASS_TEST && - config_.selective_fc_pass_generation() && !fullOutput_); - SetDevice device(output_.deviceId); - if (flag) { - // output_.value is sparse matrix - if (dynamic_cast(output_.value.get()) || - dynamic_cast(output_.value.get())) { - output_.value = nullptr; - } - Matrix::resizeOrCreateSparseMatrix(output_.value, - height, - width, - nnz, - FLOAT_VALUE, - SPARSE_CSR, - /*trans=*/false, - /*useGpu=*/useGpu_); - output_.value->copyFrom(*selCols_); - interOutput_ = output_.value; - } else { - if (fullOutput_) { - // output_.value is dense matrix - if (dynamic_cast(output_.value.get()) || - dynamic_cast(output_.value.get())) { - output_.value = nullptr; - } - Matrix::resizeOrCreate(output_.value, - height, - width, - /*trans=*/false, - /*useGpu=*/useGpu_); - interOutput_ = output_.value; - } else { - // output_.value is dense matrix, but width = nnz /height - CHECK_EQ(nnz % height, 0U); - CHECK(nnz / height); - Matrix::resizeOrCreate(output_.value, - height, - nnz / height, - /*trans=*/false, - /*useGpu=*/useGpu_); - interOutput_ = Matrix::createSparseMatrix(output_.value->getData(), - selCols_->getRows(), - selCols_->getCols(), - height, - width, - nnz, - FLOAT_VALUE, - SPARSE_CSR, - /*trans=*/false, - /*useGpu=*/useGpu_); - } - } - interOutput_->zeroMem(); - - if (passType_ != PASS_TEST && needGradient()) { - CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a " - "same number of selected columns."; - CHECK(nnz / height) - << "during training, " - "each sample must have at least one column selected."; - Matrix::resizeOrCreate(output_.grad, - height, - nnz / height, - /*trans=*/false, - /*useGpu=*/useGpu_); - output_.grad->zeroMem(); - } -} - -void SelectiveFullyConnectedLayer::forward(PassType passType) { - REGISTER_TIMER("selective_fc.forward"); - Layer::forward(passType); - - getSelectiveCols(); - size_t height = getInput(0).getBatchSize(); - size_t width = getSize(); - size_t nnz = height * width; - if (!fullOutput_) { - CHECK(selCols_); - CHECK(height == selCols_->getHeight()); - CHECK(width == selCols_->getWidth()); - nnz = 
selCols_->getElementCnt(); - } - - // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually - // this outV should be used as input of MaxIdLayer and softmax activation - reserveOutput(height, width, nnz); - - bool flag = true; - for (size_t i = 0; i < inputNum_; i++) { - MatrixPtr input = getInputValue(i); - MatrixPtr weight = weights_[i]->getW(); - size_t hsize = input->getHeight(); - size_t wsize = weight->getHeight(); - real scaleT = i == 0 ? real(0) : real(1); - - flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() && - !fullOutput_; - if (flag) { - // if the indecies are highly sparse, - // manully compute the multiplication of - // the input vector and the selected rows. - REGISTER_TIMER("selective.plain"); - interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT); - } else { - // if the indecies is not sparse enough, - // use full mul instead - REGISTER_TIMER("selective.mul"); - if (fullOutput_) { - interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT); - } else { - Matrix::resizeOrCreate(mmat_, - hsize, - wsize, - /*trans=*/false, - /*useGpu=*/useGpu_); - mmat_->mul(*input, *weight->getTranspose()); - interOutput_->add3(mmat_); - } - } - } - - if (biases_) { - interOutput_->addBias(*(biases_->getW()), 1); - } - - flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() && - !fullOutput_); - if (flag) { - // during generation, output of this layer is a sparse csr matrix, - // which is probably the input of maxid layer - // if the model is trained with multi-class-cross-entroy-with-selfnorm, - // activiation of this layer should be exponential, not softmax. - - Argument arg; - arg.value = Matrix::create(interOutput_->getData(), - 1, - nnz, - /*trans=*/false, - /*useGpu=*/useGpu_); - //! TODO(yuyang18): Why we cannot invoke forwardActivation here? - activation_->forward(arg).check(); - } else /* train and test in train, not generating */ { - // during training, this layer output value is *Matrix*, which is input of - // eg. multi-class-cross-entropy - - // while training, every sample has a equal number of selected - // columns to be activated. - // note indices of multi-class-cross-entropy need to be remapped - // to this index. - // e.g. 
sample = [1,3,5] and 3 is gold, then label is 1 - - forwardActivation(); - } -} - -void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) { - backwardActivation(); - MatrixPtr oGrad = getOutputGrad(); - if (!fullOutput_) { - interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(), - interOutput_->getRows(), - interOutput_->getCols(), - interOutput_->getHeight(), - interOutput_->getWidth(), - interOutput_->getElementCnt(), - FLOAT_VALUE, - SPARSE_CSR, - /*trans=*/false, - /*useGpu=*/useGpu_); - } else { - interOutGrad_ = Matrix::create(oGrad->getData(), - oGrad->getHeight(), - oGrad->getWidth(), - /*trans=*/false, - /*useGpu=*/useGpu_); - } - - if (biases_ && biases_->getWGrad()) { - REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*interOutGrad_, 1); - biases_->getParameterPtr()->incUpdate(callback); - } - - // backward is different from FullyConnectedLayer - // because the weight is transposed - for (size_t i = 0; i < inputNum_; i++) { - AsyncGpuBlock block; - MatrixPtr preGrad = getInputGrad(i); - if (preGrad) { - REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1); - } - - MatrixPtr wGrad = weights_[i]->getWGrad(); - if (wGrad) { - REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - MatrixPtr input = getInputValue(i); - wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1); - } - - { - REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); - weights_[i]->getParameterPtr()->incUpdate(callback); - } - } -} - -void paddle::SelectiveFullyConnectedLayer::fillSelectiveData( - const std::shared_ptr>>& candidates) { - if (candidates == nullptr) { - fillFullySelectiveData(); - return; - } - - size_t sampleNum = candidates->size(); - size_t outputWidth = getSize(); - size_t nnz = - std::accumulate(candidates->begin(), - candidates->end(), - 0UL, - [](size_t a, const std::pair& arr) { - return a + arr.second; - }); - - Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_, - sampleNum, - outputWidth, - nnz, - NO_VALUE, - SPARSE_CSR, - false, - false); - CHECK(this->cpuSelCols_ != nullptr); - CpuSparseMatrixPtr selCols = - std::dynamic_pointer_cast(cpuSelCols_); - int* rowOffsets = selCols->getRows(); - int* colIndices = selCols->getCols(); - - rowOffsets[0] = 0; - int idx = 0; - for (size_t i = 0; i < sampleNum; ++i) { - if ((*candidates)[i].second > 0) { - rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second; - for (size_t j = 0; j < (*candidates)[i].second; ++j) { - colIndices[idx] = (*candidates)[i].first[j]; - idx++; - } - } else { - rowOffsets[i + 1] = rowOffsets[i]; - } - } - - CHECK_EQ(static_cast(rowOffsets[sampleNum]), nnz); - if (!useGpu_) { - this->selCols_ = this->cpuSelCols_; - } else { - Matrix::resizeOrCreateSparseMatrix(this->selCols_, - sampleNum, - outputWidth, - nnz, - NO_VALUE, - SPARSE_CSR, - false, - true); - this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1); - hl_stream_synchronize(HPPL_STREAM_1); - } - - fullOutput_ = false; -} - -void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() { - if (config_.has_selected_colums()) { - this->selCols_ = inputLayers_[inputNum_]->getOutputValue(); - fullOutput_ = false; - } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) { - this->fillFullySelectiveData(); - } // else selCols_ is initialized by fillSelectiveData -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h 
deleted file mode 100644 index 4b32ce8b162c2a8b1a6c34adc0885a7701f5f91e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief The SelectiveFullyConnectedLayer class - * - * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it - * requires an additional input to indicate several selected columns, and only - * compute the multiplications between the input matrices and the selected - * columns of the parameter matrices of this layer. If the selected columns is - * not specified, SelectiveFullyConnected layer acts exactly like - * FullyConnectedLayer. - * - * The config file api is selective_fc_layer. - */ -class SelectiveFullyConnectedLayer : public Layer { - protected: - WeightList weights_; - std::unique_ptr biases_; - - private: - /** - * Get selected columns each forward. - */ - void getSelectiveCols(); - - MatrixPtr mmat_; - /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns. - MatrixPtr cpuSelCols_; - /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points - /// to cpuSelCols_. - MatrixPtr selCols_; - size_t inputNum_; - - /// interOutput_ shared same memory with output_.value. - MatrixPtr interOutput_; - - /// if fullOutput_ is false, interOutGrad_ sparse matrix - MatrixPtr interOutGrad_; - - /// if true, means output_.value is the same as Fc Layer - bool fullOutput_; - - public: - explicit SelectiveFullyConnectedLayer(const LayerConfig& config) - : Layer(config), selCols_(nullptr) {} - - ~SelectiveFullyConnectedLayer() {} - void prefetch() override; - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - Weight& getWeight(int idx) { return *weights_[idx]; } - - /** - * @brief Resize the output matrix size. - * And reset value to zero - */ - void reserveOutput(size_t height, size_t width, size_t nnz); - - /** - * @brief Fill candidates to select several activations as output. - * @param candidates specifies several selected columns of the parameter - * matrices of this layer. - * Multiplications only between the input matrices and the selected columns - * are computed. - * If the candidates is a nullptr, selective fc layer acts exactly like the - * fully connected layer. 
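// --- Illustrative sketch (not part of the diff above): the core idea of
// SelectiveFullyConnectedLayer -- for each sample only the selected output
// columns of the weight matrix are multiplied, instead of computing the full
// x * W product. Names are hypothetical, and plain nested vectors replace
// the CSR selection matrix (selCols_) the real layer builds.
#include <vector>

// x: batch rows (inDim each); W: inDim x outDim; selected[i]: output column
// indices requested for sample i. Returns the values of just those columns.
std::vector<std::vector<float>> selectiveFcForward(
    const std::vector<std::vector<float>>& x,
    const std::vector<std::vector<float>>& W,
    const std::vector<std::vector<int>>& selected) {
  std::vector<std::vector<float>> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    for (int col : selected[i]) {
      float v = 0.f;
      for (size_t k = 0; k < x[i].size(); ++k) v += x[i][k] * W[k][col];
      out[i].push_back(v);  // dot product with one selected column of W
    }
  }
  return out;
}
// When selected[i] lists every column (or no selection input is given), this
// degenerates to an ordinary fully connected forward pass.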
- * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH - */ - void fillSelectiveData( - const std::shared_ptr>>& candidates); - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - /** - * @brief Make SelectiveFC act as FullyConnectedLayer - */ - void fillFullySelectiveData() { fullOutput_ = true; } -}; -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp deleted file mode 100644 index c84c3ce4f080cc19f4937f04585accb5b2b347f9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for concatenating the first sequence with the second sequence - * Input: two sequences each containing the same number of instances - * seq1 = [a1, a2, ..., an] - * seq2 = [b1, b2, ..., bn] - * Output: a concatenated sequence of the two input sequences - * out = [a1, b1, a2, b2, ..., an, bn] - */ - -class SequenceConcatLayer : public Layer { - protected: - std::unique_ptr biases_; - - public: - explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {} - - ~SequenceConcatLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(seqconcat, SequenceConcatLayer); - -bool SequenceConcatLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // sequene concatenation layer should have exactly 2 inputs - CHECK_EQ(2U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - setNeedSequenceInfo(false); - return true; -} - -void SequenceConcatLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t dim = getSize(); - - const Argument& input1 = getInput(0); - size_t numSequences1 = input1.getNumSequences(); - auto startPositions1 = input1.sequenceStartPositions->getVector(false); - - const Argument& input2 = getInput(1); - size_t numSequences2 = input2.getNumSequences(); - auto startPositions2 = input2.sequenceStartPositions->getVector(false); - - CHECK_EQ(dim, input1.value->getWidth()); - CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize()); - CHECK_EQ(numSequences1, startPositions1->getSize() - 1); - - CHECK_EQ(dim, input2.value->getWidth()); - CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize()); - CHECK_EQ(numSequences2, startPositions2->getSize() - 1); - - CHECK_EQ(numSequences1, 
numSequences2); - - MatrixPtr inputValue1 = getInputValue(0); - MatrixPtr inputValue2 = getInputValue(1); - - // reset output - reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim); - - MatrixPtr outputValue = getOutputValue(); - - const int* starts1 = startPositions1->getData(); - const int* starts2 = startPositions2->getData(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str()); - - size_t offset = 0; - size_t leftNumIns = 0; - size_t rightNumIns = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - leftNumIns = starts1[seqId + 1] - starts1[seqId]; - outputValue->subMatrix(offset, leftNumIns) - ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns))); - offset += leftNumIns; - - rightNumIns = starts2[seqId + 1] - starts2[seqId]; - outputValue->subMatrix(offset, rightNumIns) - ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns))); - offset += rightNumIns; - } - - // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences1 + 1, false); - - int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); - - for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) { - tgtBuf[seqId] = starts1[seqId] + starts2[seqId]; - } - } - - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ - forwardActivation(); -} - -void SequenceConcatLayer::backward(const UpdateCallback& callback) { - /* activation */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } - - MatrixPtr inputGrad1 = getInputGrad(0); - MatrixPtr inputGrad2 = getInputGrad(1); - MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); - auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false); - - size_t numSequences1 = startPositions1->getSize() - 1; - size_t numSequences2 = startPositions2->getSize() - 1; - - CHECK_EQ(numSequences1, numSequences2); - - const int* starts1 = startPositions1->getData(); - const int* starts2 = startPositions2->getData(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str()); - - size_t offset = 0; - size_t leftNumIns = 0; - size_t rightNumIns = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - leftNumIns = starts1[seqId + 1] - starts1[seqId]; - if (inputGrad1) { - inputGrad1->subMatrix(starts1[seqId], leftNumIns) - ->add(*(outputGrad->subMatrix(offset, leftNumIns))); - } - offset += leftNumIns; - - rightNumIns = starts2[seqId + 1] - starts2[seqId]; - if (inputGrad2) { - inputGrad2->subMatrix(starts2[seqId], rightNumIns) - ->add(*(outputGrad->subMatrix(offset, rightNumIns))); - } - offset += rightNumIns; - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp deleted file mode 100644 index 28d0a9296d4accd4152e886ccae12a776fdb8f7f..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
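// --- Illustrative sketch (not part of the diff above): how the
// SequenceConcatLayer above interleaves two inputs with the same number of
// sequences. All instances of sequence s from input 1 are copied, then all
// instances of sequence s from input 2, and the new start offsets are
// starts1[s] + starts2[s], as in the forward pass above. Names (SeqBatch,
// seqConcat) are hypothetical.
#include <vector>

struct SeqBatch {
  std::vector<std::vector<float>> rows;  // instances, sequences back to back
  std::vector<int> starts;               // numSequences + 1 offsets into rows
};

SeqBatch seqConcat(const SeqBatch& a, const SeqBatch& b) {
  SeqBatch out;
  size_t numSeq = a.starts.size() - 1;  // assumed equal for a and b
  out.starts.push_back(0);
  for (size_t s = 0; s < numSeq; ++s) {
    for (int r = a.starts[s]; r < a.starts[s + 1]; ++r) out.rows.push_back(a.rows[r]);
    for (int r = b.starts[s]; r < b.starts[s + 1]; ++r) out.rows.push_back(b.rows[r]);
    out.starts.push_back(a.starts[s + 1] + b.starts[s + 1]);  // new offset
  }
  return out;
}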
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Logging.h" - -#include "SequencePoolLayer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for extracting the last instance of the input sequence. - * Input: a sequence - * If SequenceLevel = kNonseq: - * Output: a sequence containing only the last instance of the input sequence - * If stride_ > 0: - * Output: a shorten sequence. Stride is the step size by which we slide a - * window upon the input sequence, and getting last instance - * operation is then applied to each interval independently. - * If SequenceLevel = kSeq: - * Check input sequence must has sub-sequence - * Output: a sequence containing only the last instance of each sub-sequence - * of the input sequence - * - * The config file api is last_seq and first_seq. - */ - -class SequenceLastInstanceLayer : public SequencePoolLayer { - protected: - MatrixPtr tmpSrc_; - MatrixPtr tmpDest_; - std::vector instanceIds_; - - public: - explicit SequenceLastInstanceLayer(const LayerConfig& config) - : SequencePoolLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); - -bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - SequencePoolLayer::init(layerMap, parameterMap); - reversed_ = config_.select_first(); - - tmpSrc_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - tmpDest_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - - return true; -} - -void SequenceLastInstanceLayer::forward(PassType passType) { - SequencePoolLayer::forward(passType); - - auto starts = startPositions_->getData(false); - MatrixPtr inputValue = getInputValue(0); - MatrixPtr outputValue = getOutputValue(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); - - instanceIds_.clear(); - for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { - int insId = reversed_ ? 
starts[seqId] : starts[seqId + 1] - 1; - instanceIds_.push_back(insId); - - outputValue->subMatrix(seqId, 1, tmpDest_) - ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); - } - } - - if (biases_.get() != NULL) { - outputValue->addBias(*(biases_->getW()), 1); - } - - /* activation, should set to 'linear' in most cases */ - forwardActivation(); -} - -void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { - SequencePoolLayer::backward(callback); - - MatrixPtr inputGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - - if (inputGrad) { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str()); - - for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { - inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_) - ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_))); - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp deleted file mode 100644 index 650ab425d1fcca56d8862200f37dd5bb36a67240..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "SequencePoolLayer.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -bool SequencePoolLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // seqlastins/max/average layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - stride_ = config_.seq_pool_stride(); - setNeedSequenceInfo(false); - return true; -} - -void SequencePoolLayer::forward(PassType passType) { - Layer::forward(passType); - - const Argument& input = getInput(0); - CHECK(input.hasSeq() || input.hasSubseq()) - << "Input should be a sequence or subsequence for layer " << getName(); - - newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences(); - size_t dim = getSize(); - // check - CHECK_EQ(dim, input.value->getWidth()); - startPositions_ = - type_ ? input.subSequenceStartPositions : input.sequenceStartPositions; - auto starts = startPositions_->getVector(false); - CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); - CHECK_EQ(newBatchSize_, starts->getSize() - 1); - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. 
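// --- Illustrative sketch (not part of the diff above): the row selection
// performed by SequenceLastInstanceLayer. For every sequence the kept
// instance is its last row (last_seq), or its first row when select_first
// is set (first_seq), matching the insId expression in the forward pass
// above. The function name is hypothetical.
#include <vector>

// starts has numSequences + 1 entries; the returned vector holds, for each
// sequence s, the input row that becomes output row s.
std::vector<int> lastInstanceRows(const std::vector<int>& starts, bool selectFirst) {
  std::vector<int> rows;
  for (size_t s = 0; s + 1 < starts.size(); ++s)
    rows.push_back(selectFirst ? starts[s] : starts[s + 1] - 1);
  return rows;
}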
- * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. - */ - if (type_) { - CHECK(input.subSequenceStartPositions) - << "when trans_type = seq, input must hasSubseq"; - output_.degradeSequence(input); - } - if (stride_ > 0) { - CHECK_EQ(input.hasSubseq(), 0UL) - << "sequence stride pooling is invalid for hasSubseq now"; - output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_); - newBatchSize_ = startPositions_->getSize() - 1; - } - - resetOutput(newBatchSize_, dim); -} - -void SequencePoolLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h deleted file mode 100644 index 01183060afd58376bb718dda64d8106cce4899f9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequencePoolLayer.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer. - * - * Input: one or more sequences. Each sequence contains some instances. - * If SequenceLevel = kNonSeq: - * Output: output size is the number of input sequences (NOT input instances) - * output[i] = seqlastin/average/max_{for each instance in this - * sequence}{input[i]} - * If stride_ > 0: - * Check input sequence must not have sub-sequence - * Output: a shorten sequence. Stride is the step size by which we slide - * a window upon the input sequence, and the pooling operation - * is then applied to each interval independently. - * If SequenceLevel = kSeq: - * Check input sequence must has sub-sequence - * Output: output size is the number of input sub-sequences - * output[i] = seqlastin/average/max_{for each instance in this - * sub-sequence}{input[i]} - * - * The config file api is pooling_layer. - */ - -class SequencePoolLayer : public Layer { - protected: - int type_; - std::unique_ptr biases_; - enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; - size_t newBatchSize_; - ICpuGpuVectorPtr startPositions_; - int stride_; - // Whether the input sequence is reversed or not. 
- bool reversed_ = false; - - public: - explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp deleted file mode 100644 index 319310af8c4ac3bdefd814ad05b7fde6070f2340..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceReshapeLayer.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for reshaping the sequence. Assume the input sequence has - * T instances, the dimension of each instance is M, and the input - * reshape_dim is N, then the output sequence has T*M/N instances, - * the dimension of each instance is N. - * - * Note that T*M/N must be an integer. - */ - -class SequenceReshapeLayer : public Layer { - protected: - std::unique_ptr biases_; - - MatrixPtr reshapedOutputGrad; - - public: - explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(seqreshape, SequenceReshapeLayer); - -bool SequenceReshapeLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - CHECK_EQ(1U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - setNeedSequenceInfo(false); - return true; -} - -void SequenceReshapeLayer::forward(PassType passType) { - Layer::forward(passType); - - const Argument& input = getInput(0); - - size_t inDim = input.value->getWidth(); - size_t outDim = getSize(); - - size_t numSequences = input.getNumSequences(); - - // by default, we assume each instance as a sequence - IVectorPtr seqStarts; - IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false); - int* startsData = seqStarts->getData(); - for (int i = 0; i < input.getBatchSize() + 1; i++) { - startsData[i] = i; - } - const int* starts = startsData; - - // if there is sequence, then use start positions - if (input.sequenceStartPositions) { - auto startPositions = input.sequenceStartPositions->getVector(false); - starts = startPositions->getData(); - CHECK_EQ(starts[numSequences], input.getBatchSize()); - CHECK_EQ(numSequences, startPositions->getSize() - 1); - } - - for (size_t seqID = 0; seqID < numSequences; seqID++) { - size_t inNumIns = starts[seqID + 1] 
- starts[seqID]; - size_t outNumIns = inNumIns * inDim / outDim; - CHECK_EQ(outNumIns * outDim, inNumIns * inDim); - } - - MatrixPtr inputValue = getInputValue(0); - - // reset output - reserveOutput(inputValue->getHeight() * inDim / outDim, outDim); - MatrixPtr outputValue = getOutputValue(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str()); - - outputValue->copyFrom(*inputValue); - - // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences + 1, false); - - int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); - - for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) { - tgtBuf[seqId] = starts[seqId] * inDim / outDim; - } - } - - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ - forwardActivation(); -} - -void SequenceReshapeLayer::backward(const UpdateCallback& callback) { - /* activation */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } - - MatrixPtr inputGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str()); - - if (inputGrad) { - Matrix::resizeOrCreate(reshapedOutputGrad, - inputGrad->getHeight(), - inputGrad->getWidth(), - false, - useGpu_); - reshapedOutputGrad->copyFrom(*outputGrad); - inputGrad->add(*reshapedOutputGrad); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp deleted file mode 100644 index a6d810b583aab6e44faa583795686f06e17beeb9..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceSliceLayer.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -class SequenceSliceLayer : public Layer { - public: - explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - /* - * TODO(caoying) - * In PaddePaddle, currently all matrices are real number types, - * but the second and the (optional) third input which are some - * selected indices of the give sequence to trim the sequence, are actually - * filled with int types so that storing int types information in real number - * matrices is very dangerous, since real numbers will be convered to int - * types. 
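// --- Illustrative sketch (not part of the diff above): the start-position
// arithmetic in SequenceReshapeLayer::forward above. A sequence of inNumIns
// instances of width inDim is reinterpreted as inNumIns * inDim / outDim
// instances of width outDim (the values are only copied), so every sequence
// offset scales by inDim / outDim. The function name is hypothetical.
#include <cassert>
#include <vector>

std::vector<int> reshapeSeqStarts(const std::vector<int>& starts,
                                  int inDim, int outDim) {
  for (size_t s = 0; s + 1 < starts.size(); ++s) {
    int inNumIns = starts[s + 1] - starts[s];
    assert((inNumIns * inDim) % outDim == 0);  // mirrors the CHECK_EQ above
  }
  std::vector<int> out(starts.size());
  for (size_t s = 0; s < starts.size(); ++s) out[s] = starts[s] * inDim / outDim;
  return out;
}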
If a user fills this matrix himself, invalid data may occor. - */ - - MatrixPtr startIdsOnCpu_; - MatrixPtr endIdsOnCpu_; - - std::vector selectedRows_; - IVectorPtr rowIndice_; - std::vector> inputSeqInfoVec_; - std::vector outSubSeqStartPos_; - std::vector outSeqStartPos_; - - void checkInputs(); - void copySliceIdsToCpu(); - void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends); -}; - -REGISTER_LAYER(seq_slice, SequenceSliceLayer); - -bool SequenceSliceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - CHECK_GE(inputLayers_.size(), 2U); - CHECK_LE(inputLayers_.size(), 3U); - - setNeedSequenceInfo(false); - return true; -} - -void SequenceSliceLayer::checkInputs() { - const Argument& inputSeq = getInput(0); - CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer " - << "must be a sequence."; - const MatrixPtr indices1 = getInputValue(1); - CHECK_EQ( - indices1->getHeight(), - static_cast(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences() - : inputSeq.getNumSequences())) - << "Height of the second input should be equal to number of sequence " - << "in the first input."; - if (inputLayers_.size() == 3) { - const MatrixPtr indices2 = getInputValue(2); - CHECK_EQ(indices2->getHeight(), indices1->getHeight()) - << "start indices and end indices should have the same height."; - CHECK_EQ(indices2->getWidth(), indices1->getWidth()) - << "start indices and end indices should have the same Width."; - } -} - -void SequenceSliceLayer::copySliceIdsToCpu() { - const MatrixPtr indices1 = getInputValue(1); - if (inputLayers_.size() == 2U) { - if (config_.select_first()) { - Matrix::resizeOrCreate(startIdsOnCpu_, - indices1->getHeight(), - indices1->getWidth(), - false /* trans */, - false /* useGpu */); - startIdsOnCpu_->copyFrom(*indices1); - endIdsOnCpu_ = nullptr; - } else { - Matrix::resizeOrCreate(endIdsOnCpu_, - indices1->getHeight(), - indices1->getWidth(), - false /* trans */, - false /* useGpu */); - endIdsOnCpu_->copyFrom(*indices1); - startIdsOnCpu_ = nullptr; - } - } else if (inputLayers_.size() == 3U) { - Matrix::resizeOrCreate(startIdsOnCpu_, - indices1->getHeight(), - indices1->getWidth(), - false /* trans */, - false /* useGpu */); - startIdsOnCpu_->copyFrom(*indices1); - - const MatrixPtr indices2 = getInputValue(2); - Matrix::resizeOrCreate(endIdsOnCpu_, - indices2->getHeight(), - indices2->getWidth(), - false /* trans */, - false /* useGpu */); - endIdsOnCpu_->copyFrom(*indices2); - } -} - -void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts, - const MatrixPtr ends) { - CHECK(starts || ends) << "At least one of the start or end indices " - << "should be given."; - - bool hasSubseq = getInput(0).hasSubseq(); - - outSeqStartPos_.resize(1, 0); - outSubSeqStartPos_.resize(1, 0); - selectedRows_.clear(); - - size_t beamSize = starts ? starts->getWidth() : ends->getWidth(); - size_t rowIdx = 0; - for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) { - for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) { - for (size_t k = 0; k < beamSize; ++k) { - if (starts && starts->getElement(rowIdx, k) == -1.) break; - if (ends && ends->getElement(rowIdx, k) == -1.) 
break; - - int begPos = inputSeqInfoVec_[i][j]; - if (starts) begPos += starts->getElement(rowIdx, k); - - int endPos = inputSeqInfoVec_[i][j + 1] - 1; - if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k); - - int seqLen = endPos - begPos + 1; - CHECK_GT(seqLen, 0); - for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m); - hasSubseq - ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen) - : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen); - } - rowIdx++; - } - if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back()); - } - - if (useGpu_) { - rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); - rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); - } else { - rowIndice_ = - IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); - } - - // create the sequence information for the output. - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, outSeqStartPos_.size(), false); - output_.sequenceStartPositions->copyFrom( - outSeqStartPos_.data(), outSeqStartPos_.size(), false); - - if (hasSubseq) { - ICpuGpuVector::resizeOrCreate( - output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false); - output_.subSequenceStartPositions->copyFrom( - outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false); - } -} - -void SequenceSliceLayer::forward(PassType passType) { - Layer::forward(passType); - checkInputs(); - - const Argument& inputSeq = getInput(0); - inputSeqInfoVec_.clear(); - Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, - inputSeq.subSequenceStartPositions, - inputSeqInfoVec_); - if (!useGpu_) { - if (inputLayers_.size() == 2U) { - startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr; - endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1); - } else if (inputLayers_.size() == 3U) { - startIdsOnCpu_ = getInputValue(1); - endIdsOnCpu_ = getInputValue(2); - } - } else { - copySliceIdsToCpu(); - } - - /* - * calculate the selected row indices in a batch, and build the output - * sequence information. - */ - calSelectedRows(startIdsOnCpu_, endIdsOnCpu_); - - resetOutput(selectedRows_.size(), getSize()); - - getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); -} - -void SequenceSliceLayer::backward(const UpdateCallback& callback) { - getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h deleted file mode 100644 index 5200e702d9bc947746567c19ca7d552750828131..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SequenceToBatch.h +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" - -namespace paddle { - -/* - * This class can used to modify the matrix structure of sequence matrix into - * batch structure. 
- * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t] - * batch matrix: [C1_s ... C1_t | ...... | Cn_s ... Cn_t] - * Cn_s is the state for sequence s at time n. - * - * Exampel: sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}} - * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - * batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}} - * b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 - * - * Use: - * Input: seqMatrix, seqStarts(Sequence Start Positions) - * Output: batchMatrix - * 1. SequenceToBatch seq2batch; - * 2. seq2batch.resizeOrCreateBatch(seqStarts); // calculate seq2BatchIdx - * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix - * - */ -class SequenceToBatch { - public: - explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {} - - /* resize and calculate the batchIndex_ */ - void resizeOrCreateBatch(int batchSize, - size_t numSequences, - const int *seqStarts, - bool reversed, - bool prevBatchState = false); - - /* sequence matrix and batch matrix copy: - * seq2batch: copy(seqValue, batchValue, true); - * batch2seq: copy(seqValue, batchValue, false); - */ - void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch); - /* sequence/batch matrix add to batch/sequence matrix */ - void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch); - MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0); - - size_t getNumBatch() const { return numBatch_; } - - /* resize or create a batch matrix(batchValue_) */ - void resizeOrCreate(Matrix &seqValue); - /* copy seqValue to batchValue_ */ - void copyFromSeq(Matrix &seqValue); - /* copy batchValue_ to seqValue */ - void copyBackSeq(Matrix &seqValue); - MatrixPtr getBatchValue(int batchId, int numRows = 0); - MatrixPtr getBatchValue() { return batchValue_; } - /*tranfer preBatchOutput to batch struct*/ - void prevOutput2Batch(Matrix &src, Matrix &dst); - /*get sequence output from batch struct*/ - void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch); - - /* Copy the index from another seq2batch. */ - void shareIndexWith(const SequenceToBatch &seq2batch) { - CHECK(useGpu_ == seq2batch.useGpu_); - batchStartPositions_ = seq2batch.batchStartPositions_; - seq2BatchIdx_ = seq2batch.seq2BatchIdx_; - cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_; - numBatch_ = seq2batch.numBatch_; - } - - protected: - void sequence2BatchCopy(Matrix &batch, - Matrix &sequence, - IVector &seq2BatchIdx, - bool seq2batch); - void sequence2BatchAdd(Matrix &batch, - Matrix &sequence, - IVector &seq2BatchIdx, - bool seq2batch); - - IVectorPtr batchStartPositions_; - IVectorPtr seq2BatchIdx_; - IVectorPtr cpuSeq2BatchIdx_; - IVectorPtr cpuSeqIdx_; - IVectorPtr cpuSeqEndIdxInBatch_; - IVectorPtr seqIdx_; - IVectorPtr seqEndIdxInBatch_; - size_t numBatch_; - bool useGpu_; - MatrixPtr batchValue_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp deleted file mode 100644 index f7f4735c1b72d4ac6540714573fd7e15ef99ea5b..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SlopeInterceptLayer.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief A layer for applying a slope and an intercept to the input - * element-wise. - * This layer is used in NEURAL TURING MACHINE. - * @note There is no activation and weight in this layer. - * - * \f[ - * y = ax + b - * \f] - * - * Here, a is scale and b is offset, which are provided as attributes of the - * layer. - * - * The config file api is slope_intercept_layer. - */ - -class SlopeInterceptLayer : public Layer { - public: - explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(slope_intercept, SlopeInterceptLayer); - -bool SlopeInterceptLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 1U); - - return true; -} - -void SlopeInterceptLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV = getInputValue(0); - - /* malloc memory for the output_ if necessary */ - size_t batchSize = inV->getHeight(); - size_t size = getSize(); - - CHECK_EQ(size, inV->getWidth()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - reserveOutput(batchSize, size); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str()); - outV->mulScalar(*inV, config_.slope()); - outV->add(config_.intercept()); - } -} - -void SlopeInterceptLayer::backward(const UpdateCallback& callback) { - MatrixPtr inG = getInputGrad(0); - MatrixPtr outG = getOutputGrad(); - - if (inG) { - REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str()); - inG->add(*outG, config_.slope()); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h deleted file mode 100644 index 421bdfe09c46f656f500daff195c755274bf8bb7..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "Layer.h" -#include "PoolProjection.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Logging.h" - -namespace paddle { -/** - * @brief A layer for spatial pyramid pooling on the input image by taking - * the max, average, etc. within regions, so that the result vector of - * different sized images are of the same size. - * - * The config file api is spp_layer. - */ - -class SpatialPyramidPoolLayer : public Layer { - protected: - size_t channels_; - size_t imgSizeW_; - size_t imgSizeH_; - size_t pyramidHeight_; - std::string poolType_; - - std::vector> poolProjections_; - std::vector projOutput_; - std::vector> projCol_; - - public: - explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - ProjectionConfig getConfig(size_t sizeX_, - size_t sizeY_, - size_t channels, - size_t pyamidLevel_, - std::string& poolType_); - size_t getSize(); - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp deleted file mode 100644 index e2bb00bbfacb26dc736a63877119b379f22b5983..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -class SubNestedSequenceLayer : public Layer { - public: - explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; - - private: - /* - * This functions generates the indices of rows in a batch according to the - * indices of selected sub-sequence in each sequence. - * - * Examples: - * selectedIndices: - * [ - * [0, 1, -1], - * [0, 1, 2], - * [0, -1, -1], - * [0, 2, 3], - * ] - * inputSeqInfo: - * [ - * [0,3,4], - * [4,5,7,10,15], - * [15,20], - * [20,22,23,25,28] - * ] - * - * ths output is saved to private member rowIndice_; - * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27] - */ - - void calSelectedRows(const MatrixPtr selectedIndices, - const std::vector>& inputSeqInfo); - - /* - * TODO(caoying) - * In PaddePaddle, currently all matrices are real number types, - * but the second is some selected indices of the give sequence to trim - * the nested sequence, are actually filled with int types so that storing - * int types information in real number matrices is very dangerous, since - * real numbers will be convered to int types. 
If a user fills this matrix - * himself, invalid data may occor. - * - * if the second input of this layer is on GPU memory, copy it to CPU memory. - */ - MatrixPtr selIdsCpu_; - - /* - * reorganize sequenceStartPositions and subSequenceStartPositions - * into a 2d vector to facilitate the sequence selection process. - */ - std::vector> inputSeqInfoVec_; - - /* store the final selected row indices in a batch */ - IVectorPtr rowIndice_; - /* rowIndice_ and selectedRows_ actually share a same memory. */ - std::vector selectedRows_; -}; - -REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer); - -bool SubNestedSequenceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - CHECK_EQ(2U, inputLayers_.size()); - setNeedSequenceInfo(false); - return true; -} - -void SubNestedSequenceLayer::calSelectedRows( - const MatrixPtr selectedIndices, - const std::vector>& inputSeqInfo) { - selectedRows_.clear(); - - std::vector outSeqStartInfo(1, 0); - std::vector outSubSeqStartInfo(1, 0); - - size_t seqNum = selectedIndices->getHeight(); - size_t beamSize = selectedIndices->getWidth(); - for (size_t i = 0; i < seqNum; ++i) { - for (size_t j = 0; j < beamSize; ++j) { - if (selectedIndices->getElement(i, j) == -1.) break; - size_t selSubSeqIdx = selectedIndices->getElement(i, j); - CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx); - - size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] - - inputSeqInfoVec_[i][selSubSeqIdx]; - for (size_t k = 0; k < subSeqLen; ++k) - selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k); - outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen); - } - outSeqStartInfo.push_back(outSubSeqStartInfo.back()); - } - - if (useGpu_) { - rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); - rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); - } else { - rowIndice_ = - IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); - } - - // create the sequence information for the output. - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, outSeqStartInfo.size(), false); - output_.sequenceStartPositions->copyFrom( - outSeqStartInfo.data(), outSeqStartInfo.size(), false); - - ICpuGpuVector::resizeOrCreate( - output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false); - output_.subSequenceStartPositions->copyFrom( - outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false); -} - -void SubNestedSequenceLayer::forward(PassType passType) { - Layer::forward(passType); - - const Argument& inputSeq = getInput(0); - CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer " - << "must be a nested sequence."; - const MatrixPtr selectedIndices = getInputValue(1); - CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight()); - - if (dynamic_cast(selectedIndices.get())) { - /* - * Currently, the second input for this layer is generated by - * kmax_sequence_score_layer whose output is always stored on CPU, - * or a data_layer which canbe on GPU. - * - * If the second input is on GPU, copy it to CPU memory, because this - * input always uses very few memory, and operations related to it are - * all logic control, not computations. 
- */ - Matrix::resizeOrCreate(selIdsCpu_, - selectedIndices->getHeight(), - selectedIndices->getWidth(), - false /* trans */, - false /* useGpu */); - selIdsCpu_->copyFrom(*selectedIndices); - } else { - selIdsCpu_ = selectedIndices; - } - - Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, - inputSeq.subSequenceStartPositions, - inputSeqInfoVec_); - calSelectedRows(selIdsCpu_, inputSeqInfoVec_); - - resetOutput(selectedRows_.size(), getSize()); - getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); -} - -void SubNestedSequenceLayer::backward(const UpdateCallback& callback) { - MatrixPtr inputSeqGrad = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - - if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp deleted file mode 100644 index ba49f5710f9d0bb985cf1e80d5c4a972d8f046a6..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SubSequenceLayer.cpp +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for taking the subsequence according to given offset and size - * Input: original sequence, offset, size - * Output: subsequence - */ - -class SubSequenceLayer : public Layer { - protected: - std::unique_ptr biases_; - MatrixPtr tmpSrc_; - MatrixPtr tmpDest_; - - public: - explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(subseq, SubSequenceLayer); - -bool SubSequenceLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // sequene concatenation layer should have exactly 2 inputs - CHECK_EQ(3U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - tmpSrc_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - tmpDest_ = - Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - - setNeedSequenceInfo(false); - return true; -} - -void SubSequenceLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t dim = getSize(); - - const Argument& input = getInput(0); - size_t numSequences1 = input.getNumSequences(); - auto startPositions1 = input.sequenceStartPositions->getVector(false); - - const Argument& offsetSeq = getInput(1); - size_t numSequences2 = offsetSeq.getNumSequences(); - auto startPositions2 = 
offsetSeq.sequenceStartPositions->getVector(false); - - const Argument& sizeSeq = getInput(2); - size_t numSequences3 = sizeSeq.getNumSequences(); - auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false); - - CHECK_EQ(dim, input.value->getWidth()); - - CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize()); - CHECK_EQ(numSequences1, startPositions1->getSize() - 1); - - CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize()); - CHECK_EQ(numSequences2, startPositions2->getSize() - 1); - - CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize()); - CHECK_EQ(numSequences3, startPositions3->getSize() - 1); - - CHECK_EQ(numSequences1, numSequences2); - CHECK_EQ(numSequences2, numSequences3); - - MatrixPtr inputValue = input.value; - IVectorPtr offsetValue; - IVectorPtr sizeValue; - - if (useGpu_) { - // copy to cpu - IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false); - IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false); - offsetValue->copyFrom(*offsetSeq.ids); - sizeValue->copyFrom(*sizeSeq.ids); - } else { - offsetValue = offsetSeq.ids; - sizeValue = sizeSeq.ids; - } - - CHECK_EQ(offsetValue->getSize(), numSequences1); - CHECK_EQ(sizeValue->getSize(), numSequences1); - - int* offsets = offsetValue->getData(); - int* sizes = sizeValue->getData(); - - // get total height of output - size_t height = 0; - for (size_t seqId = 0; seqId < numSequences1; seqId++) { - height += sizes[seqId]; - } - - // reset output - resetOutput(height, dim); - - MatrixPtr outputValue = getOutputValue(); - - const int* starts1 = startPositions1->getData(); - - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str()); - - size_t offsetIn = 0; - size_t offsetOut = 0; - size_t size = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - offsetIn = starts1[seqId] + offsets[seqId]; - size = sizes[seqId]; - - outputValue->subMatrix(offsetOut, size, tmpDest_) - ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_))); - - offsetOut += size; - } - - // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, numSequences1 + 1, false); - - int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); - int offset = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - tgtBuf[seqId] = offset; - offset += sizes[seqId]; - } - tgtBuf[numSequences1] = offset; - } - - if (biases_.get() != NULL) { - MatrixPtr outV = getOutputValue(); - outV->addBias(*(biases_->getW()), 1); - } - - /* activation */ - forwardActivation(); -} - -void SubSequenceLayer::backward(const UpdateCallback& callback) { - /* activation */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } - - MatrixPtr inputGrad1 = getInputGrad(0); - MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); - size_t numSequences1 = startPositions1->getSize() - 1; - const int* starts1 = startPositions1->getData(); - - const Argument& offsetSeq = getInput(1); - const Argument& sizeSeq = getInput(2); - IVectorPtr offsetValue; - IVectorPtr sizeValue; - - if (useGpu_) { - // copy to cpu - IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false); - IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false); - 
offsetValue->copyFrom(*offsetSeq.ids); - sizeValue->copyFrom(*sizeSeq.ids); - } else { - offsetValue = offsetSeq.ids; - sizeValue = sizeSeq.ids; - } - - int* offsets = offsetValue->getData(); - int* sizes = sizeValue->getData(); - { - AsyncGpuBlock asyncGpuBlock; - REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str()); - - int offsetIn = 0; - int offsetOut = 0; - int size = 0; - for (size_t seqId = 0; seqId < numSequences1; ++seqId) { - offsetIn = starts1[seqId] + offsets[seqId]; - size = sizes[seqId]; - - inputGrad1->subMatrix(offsetIn, size, tmpDest_) - ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_))); - offsetOut += size; - } - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp deleted file mode 100644 index 00764717e8b6be30230e44626974033e929352da..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SumToOneNormLayer.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * A layer for sum-to-one normalization, - * which is used in NEURAL TURING MACHINE. - * \f[ - * out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]} - * \f] - * where \f$in\f$ is a (batchSize x dataDim) input vector, - * and \f$out\f$ is a (batchSize x dataDim) output vector. - * - * The config file api is sum_to_one_norm_layer. 
- */ - -class SumToOneNormLayer : public Layer { - protected: - /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$ - MatrixPtr reciprocalRowSum_; - /// dotSum = output_.grad \f$.*\f$ output_.value - MatrixPtr dotSum_; - - public: - explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; - -REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer); - -bool SumToOneNormLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - Layer::init(layerMap, parameterMap); - - CHECK_EQ(inputLayers_.size(), 1U); - - return true; -} - -void SumToOneNormLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr inV = getInputValue(0); - - /* malloc memory for the output_ if necessary */ - size_t batchSize = inV->getHeight(); - size_t dataDim = getSize(); - - CHECK_EQ(dataDim, inV->getWidth()); - - { - REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); - resetOutput(batchSize, dataDim); - } - - MatrixPtr outV = getOutputValue(); - { - REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str()); - - Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_); - inV->rowSum(*reciprocalRowSum_); - - // todo: matrix checks - CHECK_GT(reciprocalRowSum_->getMin(), 0.0); - - reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0); - - // outV = inV * reciprocalRowSum - outV->rowScale(0, *inV, *reciprocalRowSum_); - } -} - -void SumToOneNormLayer::backward(const UpdateCallback& callback) { - MatrixPtr inV = getInputValue(0); - MatrixPtr inG = getInputGrad(0); - MatrixPtr outV = getOutputValue(); - MatrixPtr outG = getOutputGrad(); - - size_t batchSize = inV->getHeight(); - - if (inG) { - REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str()); - - Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_); - - // dotSum = outG .* outV - dotSum_->zeroMem(); - dotSum_->rowDotMul(0, *outG, *outV); - - // inG += -1 * (dotSum / rowSum) - dotSum_->dotMul(*dotSum_, *reciprocalRowSum_); - inG->rowAdd(0, *inG, *dotSum_, -1.0); - // inG += outG * (1/rowSum) - inG->addRowScale(0, *outG, *reciprocalRowSum_); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/SwitchOrderLayer.cpp b/paddle/gserver/layers/SwitchOrderLayer.cpp deleted file mode 100644 index 704735de38bd373c0714de6bb4e139d1505c5451..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/SwitchOrderLayer.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "SwitchOrderLayer.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(switch_order, SwitchOrderLayer); - -bool SwitchOrderLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - auto& img_conf = config_.inputs(0).image_conf(); - size_t inD = img_conf.img_size_z(); - size_t inH = - img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(); - size_t inW = img_conf.img_size(); - size_t inC = img_conf.channels(); - inH = inH * inD; - inDims_ = TensorShape({0, inC, inH, inW}); - outDims_ = TensorShape(4); - - auto& reshape_conf = config_.reshape_conf(); - for (int i = 0; i < reshape_conf.height_axis_size(); i++) { - heightAxis_.push_back(reshape_conf.height_axis(i)); - } - for (int i = 0; i < reshape_conf.width_axis_size(); i++) { - widthAxis_.push_back(reshape_conf.width_axis(i)); - } - createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig()); - createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig()); - return true; -} - -void SwitchOrderLayer::setOutDims() { - outDims_.setDim(0, inDims_[0]); - outDims_.setDim(1, inDims_[2]); - outDims_.setDim(2, inDims_[3]); - outDims_.setDim(3, inDims_[1]); - reshapeHeight_ = 1; - for (size_t i = 0; i < heightAxis_.size(); i++) { - reshapeHeight_ *= outDims_[heightAxis_[i]]; - } - output_.setFrameHeight(reshapeHeight_); - reshapeWidth_ = 1; - for (size_t i = 0; i < widthAxis_.size(); i++) { - reshapeWidth_ *= outDims_[widthAxis_[i]]; - } - output_.setFrameWidth(reshapeWidth_); -} - -void SwitchOrderLayer::setInDims() { - MatrixPtr input = inputLayers_[0]->getOutputValue(); - size_t batchSize = input->getHeight(); - inDims_.setDim(0, batchSize); - int d = inputLayers_[0]->getOutput().getFrameDepth(); - d = (d == 0 ? 1 : d); - int h = inputLayers_[0]->getOutput().getFrameHeight(); - if (h != 0) inDims_.setDim(2, h * d); - int w = inputLayers_[0]->getOutput().getFrameWidth(); - if (w != 0) inDims_.setDim(3, w); - int totalCount = input->getElementCnt(); - int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]); - if (channels != 0) inDims_.setDim(1, channels); -} - -void SwitchOrderLayer::forward(PassType passType) { - Layer::forward(passType); - setInDims(); - setOutDims(); - resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]); - if (heightAxis_.size() > 0) { - resetOutput(reshapeHeight_, reshapeWidth_); - } - - // switch NCHW to NHWC - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getInputValue(0), inDims_); - outputs.addArg(*getOutputValue(), outDims_); - nchw2nhwc_[0]->calc(inputs, outputs); - forwardActivation(); -} - -void SwitchOrderLayer::backward(const UpdateCallback& callback) { - (void)callback; - backwardActivation(); - - // switch NHWC to NCHW - BufferArgs inputs; - BufferArgs outputs; - inputs.addArg(*getOutputGrad(), outDims_); - outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); - nhwc2nchw_[0]->calc(inputs, outputs); -} -} // namespace paddle diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp deleted file mode 100644 index b2271c63ef76d85574cf7f71b18aef4239938b8e..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/TensorLayer.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "TensorLayer.h" - -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -REGISTER_LAYER(tensor, TensorLayer); - -bool TensorLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize the weightList */ - CHECK_EQ(inputLayers_.size(), 2LU); - CHECK(parameters_[0]); - CHECK(!parameters_[1]); - - // Option the parameters - size_t height = inputLayers_[0]->getSize(); - size_t width = inputLayers_[1]->getSize(); - CHECK_EQ(width * height * getSize(), parameters_[0]->getSize()); - - for (size_t i = 0; i < getSize(); ++i) { - // create a new weight - Weight* w = new Weight(height, width, parameters_[0], i * width * height); - - // append the new weight to the list - weights_.emplace_back(w); - } - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - return true; -} - -void TensorLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - int batchSize = getInputValue(0)->getHeight(); - int size = getSize(); - - { resetOutput(batchSize, size); } - - MatrixPtr outV = getOutputValue(); - /* add the bias-vector */ - if (biases_.get() != NULL) { - outV->addBias(*(biases_->getW()), 1); - } - - /* e1 * W * trans(e2) */ { - MatrixPtr input1 = getInputValue(0); - MatrixPtr input2 = getInputValue(1); - MatrixPtr tmpMat = Matrix::create(input2->getHeight(), - input2->getWidth(), - /* trans= */ false, - input2->useGpu()); - REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str()); - for (size_t i = 0; i < getSize(); ++i) { - MatrixPtr weights = weights_[i]->getW(); - tmpMat->mul(*input1, *weights, 1, 0); - outV->rowDotMul(i, *tmpMat, *input2); - } - } - - /* activation */ { forwardActivation(); } -} - -void TensorLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - /* Increasing the number of gradient */ - biases_->getParameterPtr()->incUpdate(callback); - } - - bool syncFlag = hl_get_sync_flag(); - - /* Calculate the W-gradient for the current layer */ - MatrixPtr input1 = getInputValue(0); - MatrixPtr input2 = getInputValue(1); - MatrixPtr oGrad = getOutputGrad(); - MatrixPtr tmpMat = Matrix::create(input1->getHeight(), - input1->getWidth(), - /* trans= */ false, - input1->useGpu()); - - /* trans(grad * e1) * e2 */ { - REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str()); - for (size_t i = 0; i < getSize(); ++i) { - if (weights_[i]->getWGrad()) { - tmpMat->rowScale(i, *input1, *oGrad); - MatrixPtr input1_T = tmpMat->getTranspose(); - weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1); - } - } - } - - hl_set_sync_flag(false); - - /* Calculate the input layers error */ { - MatrixPtr preGrad1 = getInputGrad(0); - MatrixPtr preGrad2 = getInputGrad(1); - - REGISTER_TIMER_INFO("TensorBpMulTimer", getName().c_str()); - for (size_t i = 0; i < 
getSize(); ++i) { - MatrixPtr weights = weights_[i]->getW(); - - if (NULL != preGrad1) { /* (grad * e2) * trans(W) */ - tmpMat->rowScale(i, *input2, *oGrad); - MatrixPtr weights_T = weights->getTranspose(); - preGrad1->mul(*tmpMat, *weights_T, 1, 1); - } - if (NULL != preGrad2) { /* (grad * e1) * W */ - tmpMat->rowScale(i, *input1, *oGrad); - preGrad2->mul(*tmpMat, *weights, 1, 1); - } - } - } - hl_set_sync_flag(syncFlag); - parameters_[0]->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h deleted file mode 100644 index 5c1ee40ceda9387138a82368ec4edcbae4bd3419..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/TensorLayer.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/** - * @brief TensorLayer takes two input vectors. - * \f[ - * y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1 - * \f] - * - * - \f$x_{1}\f$: the first input, size is M. - * - \f$x_{2}\f$: the second input, size is N. - * - y: output, size is K. - * - \f$y_{i}\f$: i-th element of y. - * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N]. - * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$. - * - * The config file api is tensor_layer. - */ - -class TensorLayer : public Layer { - protected: - WeightList weights_; - std::unique_ptr biases_; - - public: - explicit TensorLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - Weight& getWeight(int idx) { return *weights_[idx]; } - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp deleted file mode 100644 index cf87ca53d1def32708400c507da673c3a6ec0a87..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/TransLayer.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "TransLayer.h" -#include "paddle/utils/Logging.h" -namespace paddle { - -REGISTER_LAYER(trans, TransLayer); - -bool TransLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* the size of inputs for trans-layer is 1 */ - CHECK_EQ(config_.inputs_size(), 1); - - return true; -} - -void TransLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - MatrixPtr input = getInputValue(0); - int height = input->getHeight(); - int width = input->getWidth(); - - resizeOutput(width, height); - - MatrixPtr outV = getOutputValue(); - - /* outV's memory has been allocated, so memAlloc = false */ - input->transpose(outV, false); - if (getInputGrad(0)) { - zeroGrad(); - } -} - -void TransLayer::backward(const UpdateCallback& callback) { - (void)callback; - - MatrixPtr outputGrad = getOutputGrad(); - if (outputGrad == NULL) { - return; - } - MatrixPtr preGrad = getInputGrad(0); - if (preGrad) { - MatrixPtr transGrad = Matrix::create(preGrad->getHeight(), - preGrad->getWidth(), - /* trans= */ false, - preGrad->useGpu()); - outputGrad->transpose(transGrad, false); - preGrad->add(*transGrad); - } -} - -} // namespace paddle diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h deleted file mode 100644 index 1cd8fd91f785d5a43fc7d7663e657702b32fa534..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/TransLayer.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "Layer.h" -#include "paddle/math/Matrix.h" - -namespace paddle { -/** - * A layer for transposing a minibatch matrix. - * \f[ - y = x^\mathrm{T} - * \f] - * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. - * - * The config file api is trans_layer. - */ -class TransLayer : public Layer { - public: - explicit TransLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback = nullptr) override; -}; -} // namespace paddle diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp deleted file mode 100644 index 45f59779896f993aface284e3485e1e3d801f4c5..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Projection.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * @brief TransposedFullMatrixProjection performs full matrix multiplication: - * out.row[i] += in.row[i] * weight.transpose - * - * The config file api is trans_full_matrix_projection. - */ -class TransposedFullMatrixProjection : public Projection { - public: - TransposedFullMatrixProjection(const ProjectionConfig& config, - ParameterPtr parameter, - bool useGPu); - virtual void forward(); - virtual void backward(const UpdateCallback& callback); - - protected: - std::unique_ptr weight_; -}; - -REGISTER_PROJECTION(trans_fc, TransposedFullMatrixProjection); - -TransposedFullMatrixProjection::TransposedFullMatrixProjection( - const ProjectionConfig& config, ParameterPtr parameter, bool useGpu) - : Projection(config, parameter, useGpu) { - weight_.reset( - new Weight(config.output_size(), config.input_size(), parameter)); -} - -void TransposedFullMatrixProjection::forward() { - REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1); -} - -void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) { - bool syncFlag = hl_get_sync_flag(); - - /* Calculate the W-gradient for the current layer */ - if (weight_->getWGrad()) { - REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); - weight_->getWGrad()->mul( - *(out_->grad->getTranspose()), *(in_->value), 1, 1); - } - - // If callback does not change value, backprop error asynchronously so that - // we can do the callback concurrently. - // This is still a little bit dangerous since theoretically for - // SyncMultiGpuMachine it is possible that the value copyback can still - // happen at the same time as the error backprop where the value is being - // used. - hl_set_sync_flag(false); - - /* Calculate the input layers error */ - if (in_->grad) { - REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); - in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1); - } - - hl_set_sync_flag(syncFlag); - parameter_->incUpdate(callback); -} - -} // namespace paddle diff --git a/paddle/gserver/layers/UpsampleLayer.h b/paddle/gserver/layers/UpsampleLayer.h deleted file mode 100644 index c9d079c3141c37517866bfdad10d9b2cdb89f7d5..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/UpsampleLayer.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "Layer.h" -#include "paddle/math/Matrix.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Stat.h" - -namespace paddle { - -/** - * This layer transpose the pooling process. - * It takes two input, the first input is the input data, and - * the second is the mask data from the max-pool-with-mask layer. - * - */ - -class UpsampleLayer : public Layer { - public: - explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {} - ~UpsampleLayer() {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - - size_t getOutputSize(); - - protected: - size_t scale_, scaleY_; - size_t upsampleSize_, upsampleSizeY_; - size_t padOutX_, padOutY_; - size_t imgSize_, imgSizeY_; - size_t channels_; -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/gserver/layers/ValidationLayer.cpp deleted file mode 100644 index b626825a7b45fdb09cd8f9e8cc6727e218ab2940..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ValidationLayer.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "ValidationLayer.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -bool ValidationLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - return Layer::init(layerMap, parameterMap); -} - -void ValidationLayer::forward(PassType passType) { - Layer::forward(passType); - - MatrixPtr output = getInputValue(*getOutputLayer()); - CHECK(output); - IVectorPtr label = getInputLabel(*getLabelLayer()); - CHECK(label); - validationImp(output, label); -} - -void ValidationLayer::backward(const UpdateCallback& callback) { - (void)callback; -} - -bool AucValidation::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - bool ret = ValidationLayer::init(layerMap, parameterMap); - EvaluatorConfig config; - config.set_name(getName()); - config.set_type("last-column-auc"); - config.add_input_layers(inputLayers_[0]->getName()); - config.add_input_layers(inputLayers_[1]->getName()); - if (3 == inputLayers_.size()) { - config.add_input_layers(inputLayers_[2]->getName()); - } - evaluator_.reset(Evaluator::create(config)); - passBegin_ = false; - return ret; -} - -void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) { - if (!passBegin_) { - passBegin_ = true; - evaluator_->start(); - } - - bool supportWeight = (3 == inputLayers_.size()) ? true : false; - MatrixPtr weight = supportWeight ? 
getInputValue(*inputLayers_[2]) : nullptr; - if (dynamic_cast(output.get())) { - size_t height = output->getHeight(); - size_t width = output->getWidth(); - Matrix::resizeOrCreate(cpuOutput_, - height, - width, - /* trans=*/false, - /* useGpu=*/false); - cpuOutput_->copyFrom(*output); - IVector::resizeOrCreate(cpuLabel_, height, false); - cpuLabel_->copyFrom(*label); - - if (supportWeight) { - Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false); - cpuWeight_->copyFrom(*weight); - } - - output = cpuOutput_; - label = cpuLabel_; - weight = cpuWeight_; - } - - for (size_t i = 0; i < output->getHeight(); i++) { - float y1 = output->getData()[i * output->getWidth() + 1]; - int* labels = label->getData(); - predictArray_.push_back(PredictionResult(y1, labels[i])); - } - std::vector arguments; - if (3 == inputLayers_.size()) { - arguments.resize(3); - arguments[2].value = weight; - } else { - arguments.resize(2); - } - arguments[0].value = output; - arguments[1].ids = label; - evaluator_->evalImp(arguments); -} - -void AucValidation::onPassEnd() { - if (!FLAGS_predict_file.empty()) { - std::ofstream fs(FLAGS_predict_file); - CHECK(fs) << "Fail to open " << FLAGS_predict_file; - for (auto& res : predictArray_) { - fs << res.out << " " << res.label << std::endl; - } - } - - evaluator_->finish(); - LOG(INFO) << *evaluator_; - passBegin_ = false; - predictArray_.clear(); -} - -bool PnpairValidation::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - bool ret = ValidationLayer::init(layerMap, parameterMap); - if (!ret) return ret; - CHECK_GE(inputLayers_.size(), 3UL); - CHECK_LE(inputLayers_.size(), 4UL); - EvaluatorConfig config; - config.set_name(getName()); - config.set_type("pnpair"); - config.add_input_layers(inputLayers_[0]->getName()); - config.add_input_layers(inputLayers_[1]->getName()); - config.add_input_layers(inputLayers_[2]->getName()); - if (4 == inputLayers_.size()) { - config.add_input_layers(inputLayers_[3]->getName()); - } - evaluator_.reset(Evaluator::create(config)); - passBegin_ = false; - return true; -} - -void PnpairValidation::validationImp(MatrixPtr output, IVectorPtr label) { - if (!passBegin_) { - passBegin_ = true; - evaluator_->start(); - } - MatrixPtr weight = - (4 == inputLayers_.size()) ? getInputValue(*inputLayers_[3]) : nullptr; - IVectorPtr info = getInputLabel(*getInfoLayer()); - std::vector arguments; - if (4 == inputLayers_.size()) { - arguments.resize(4); - arguments[3].value = weight; - } else { - arguments.resize(3); - } - arguments[0].value = output; - arguments[1].ids = label; - arguments[2].ids = info; - evaluator_->evalImp(arguments); -} - -void PnpairValidation::onPassEnd() { - if (!FLAGS_predict_file.empty()) { - (dynamic_cast(evaluator_.get()))->printPredictResults(); - } - evaluator_->finish(); - LOG(INFO) << *evaluator_; - passBegin_ = false; -} - -} // namespace paddle diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h deleted file mode 100644 index be41128ef4530f32a63c757648c2f393fd118ea6..0000000000000000000000000000000000000000 --- a/paddle/gserver/layers/ValidationLayer.h +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "Layer.h" -#include "paddle/gserver/evaluators/Evaluator.h" - -DECLARE_int32(trainer_id); - -namespace paddle { - -class ValidationLayer : public Layer { - public: - explicit ValidationLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - LayerPtr getOutputLayer() { return inputLayers_[0]; } - - LayerPtr getLabelLayer() { return inputLayers_[1]; } - - LayerPtr getInfoLayer() { - assert(inputLayers_.size() > 2); - return inputLayers_[2]; - } - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback = nullptr) override; - - virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0; - - void onPassEnd() override = 0; -}; - -/* - * AucValidation - */ -class AucValidation : public ValidationLayer { - public: - explicit AucValidation(const LayerConfig& config) - : ValidationLayer(config), - cpuOutput_(nullptr), - cpuLabel_(nullptr), - cpuWeight_(nullptr) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void validationImp(MatrixPtr outputValue, IVectorPtr label) override; - - void onPassEnd() override; - - struct PredictionResult { - PredictionResult(real __out, int __label) : out(__out), label(__label) {} - real out; - int label; - }; - std::vector predictArray_; - - private: - bool passBegin_; - std::unique_ptr evaluator_; - MatrixPtr cpuOutput_; - IVectorPtr cpuLabel_; - MatrixPtr cpuWeight_; -}; - -/* - * positive-negative pair rate Validation - */ -class PnpairValidation : public ValidationLayer { - public: - explicit PnpairValidation(const LayerConfig& config) - : ValidationLayer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void validationImp(MatrixPtr outputValue, IVectorPtr label) override; - - void onPassEnd() override; - - private: - bool passBegin_; - std::unique_ptr evaluator_; -}; - -typedef std::shared_ptr ValidationLayerPtr; -} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt deleted file mode 100644 index 9d7cad7584d1defefe38bdd4d041b98bd9e45bf0..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/CMakeLists.txt +++ /dev/null @@ -1,103 +0,0 @@ -# gserver pacakge unittests -add_simple_unittest(test_LinearChainCRF) -add_simple_unittest(test_RecurrentLayer) - -if(NOT MOBILE_INFERENCE) - add_simple_unittest(test_MultinomialSampler) -endif() - -function(gserver_test TARGET) - add_unittest_without_exec(${TARGET} - ${TARGET}.cpp - LayerGradUtil.cpp) - add_test(NAME ${TARGET} - COMMAND ${TARGET}) -endfunction() - -add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf - COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR} -) -add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf) - -gserver_test(test_LayerGrad) -gserver_test(test_CRFLayerGrad) -gserver_test(test_CrossEntropyOverBeamGrad) -gserver_test(test_SeqSliceLayerGrad) -gserver_test(test_ActivationGrad) -gserver_test(test_ConvTrans) 
-gserver_test(test_PriorBox) -gserver_test(test_DetectionOutput) -gserver_test(test_ConvUnify) -gserver_test(test_BatchNorm) -gserver_test(test_KmaxSeqScore) -gserver_test(test_Expand) -gserver_test(test_MaxPoolingWithMaskOutput) -gserver_test(test_Upsample) - -set(PYTHON_PATH - ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/gserver/tests) -function(gserver_test_with_python TARGET) - add_unittest_without_exec(${TARGET} ${TARGET}.cpp) - add_test(NAME ${TARGET} - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) -endfunction() - -gserver_test_with_python(test_PyDataProvider2) -if(WITH_PYTHON) - gserver_test_with_python(test_PyDataProvider) -endif() -if(NOT MOBILE_INFERENCE) - gserver_test_with_python(test_CompareTwoNets) - # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it. - gserver_test_with_python(test_RecurrentGradientMachine) -endif() - -########## test_MKLDNN layers and activations ########## -if(WITH_MKLDNN) - add_unittest_without_exec(test_MKLDNN - test_MKLDNN.cpp - MKLDNNTester.cpp - LayerGradUtil.cpp) - add_test(NAME test_MKLDNN - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) -endif() - -############### test_WarpCTCLayer ####################### -if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) - add_unittest_without_exec(test_WarpCTCLayer - test_WarpCTCLayer.cpp) - add_test(NAME test_WarpCTCLayer - COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) -endif() - -if(NOT MOBILE_INFERENCE) - ################## test_Evaluator ############# - add_unittest(test_Evaluator - test_Evaluator.cpp) - - ########### test_NetworkCompare ############### - add_unittest_without_exec(test_NetworkCompare - test_NetworkCompare.cpp) - if(WITH_GPU) - set(use_gpu true) - else() - set(use_gpu false) - endif() - add_test(NAME test_NetworkCompare - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) - - ############ test_CompareSparse ################ - add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) - if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) - endif() -endif() diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h deleted file mode 100644 index 1999b2204b1728bd60b1e107dfe7b10718e752a5..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/LayerGradUtil.h +++ /dev/null @@ -1,329 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "paddle/testing/TestUtil.h" -using namespace std; // NOLINT - -namespace paddle { -enum InputType { - INPUT_DATA, // dense vector - INPUT_LABEL, // id - INPUT_DATA_TARGET, // dense vector, but no gradient - INPUT_SEQUENCE_DATA, - INPUT_HASSUB_SEQUENCE_DATA, // sequence has sub-sequence - INPUT_SEQUENCE_MDIM_DATA, - INPUT_SEQUENCE_LABEL, - INPUT_SPARSE_NON_VALUE_DATA, - INPUT_SPARSE_FLOAT_VALUE_DATA, - INPUT_DENSE_DIM_DATA, // using sequence length to init dense data - INPUT_SELF_DEFINE_DATA, // support customizing for input value -}; - -struct ParaSparse { - bool sparse; - string format; - // if equalNnzPerSample is set true, - // every row of the sparse matrix in a format of CSR has a same - // number of nnz values. Currently, this flag is only used for - // selective_fc layer - bool equalNnzPerSample; - ParaSparse(const string& formatIn = "") { // NOLINT - if (formatIn == "") { - sparse = false; - } else { - sparse = true; - } - equalNnzPerSample = false; - } - ParaSparse(const string& formatIn, bool equalNnz) { - format = formatIn; - sparse = true; - equalNnzPerSample = equalNnz; - } -}; - -struct InputDef { - InputType inputType; - string name; - size_t dim; - size_t paraSize; - ParaSparse sparse; - bool isStatic; - std::vector labelInitValue; - std::vector labelSeqStartPositions; - std::vector labelSubSeqStartPositions; - std::vector ids; - MatrixPtr selfDefinedData; - - InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { - inputType = type; - name = nameIn; - dim = dimIn; - paraSize = sizeIn; - sparse = {""}; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - MatrixPtr selfDefinedData, - std::vector selfDefinedSeqStartPos = {}, - std::vector selfDefinedSubSeqStartPos = {}) - : labelSeqStartPositions(selfDefinedSeqStartPos), - labelSubSeqStartPositions(selfDefinedSubSeqStartPos), - selfDefinedData(selfDefinedData) { - inputType = type; - name = nameIn; - dim = 0; - sparse = {""}; - paraSize = 0; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - const std::vector& ids, - const std::vector& selfDefinedSeqStartPos = {}, - const std::vector& selfDefinedSubSeqStartPos = {}) - : labelSeqStartPositions(selfDefinedSeqStartPos), - labelSubSeqStartPositions(selfDefinedSubSeqStartPos), - ids(ids) { - selfDefinedData = nullptr; - inputType = type; - name = nameIn; - dim = 0; - sparse = {""}; - paraSize = 0; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - size_t dimIn, - size_t sizeIn, - const std::vector& labelInitValue, - const std::vector& labelSeqStartPositions) - : labelInitValue(labelInitValue), - labelSeqStartPositions(labelSeqStartPositions) { - inputType = type; - name = nameIn; - dim = dimIn; - paraSize = sizeIn; - sparse = {""}; - isStatic = false; - } - - InputDef(InputType type, - string nameIn, - size_t dimIn, - size_t sizeIn, - ParaSparse sparseIn) { - inputType = type; - name = nameIn; - dim = dimIn; - paraSize = sizeIn; - sparse = sparseIn; - } -}; - -struct TestConfig { - LayerConfig layerConfig; - std::vector inputDefs; - size_t biasSize; - real paramInitialMean; - real paramInitialStd; - bool testAccumulate; - bool testState; - bool staticBias; - bool testBatchState; - TestConfig() - : biasSize(0), - paramInitialMean(0.0), - paramInitialStd(1.0), - testAccumulate(true), - testState(false), - staticBias(false), - testBatchState(false) {} -}; - -real getCostSum(ParameterPtr& parameter, - 
CpuVector& cpuPara, - LayerPtr& testLayer, - MatrixPtr weights = nullptr); - -real getDiffAndPrint(real newCost1, - real newCost2, - real callbackCount, - char fill, - string testLayerName, - string name, - real step, - real delta); - -/** - * @brief verify that sequentially running forward() one timestamp at one time - * has same result as running forward() with one whole sequence - * - * @param testLayer[in/out] testLayer - * @param dataLayers[in/out] dataLayers - * @param datas[in/out] data of dataLayers - */ -void testState(LayerPtr testLayer, - vector& dataLayers, - vector& datas); - -/** - * @brief verify that sequentially running forward() with short sequences one - * time has same result as running forward() with long sequences. - * - * @param testLayer[in/out] testLayer - * @param dataLayers[in/out] dataLayers - * @param datas[in/out] data of dataLayers - */ -void testBatchState(LayerPtr testLayer, - vector& dataLayers, - vector& datas); - -/** - * @brief Generate a perturbation so that it is roughly aligned with the - * gradient direction. This is to make sure that change along this - * direction will make cost increase (or decrease) in a meaningful - * way so that the finite difference can be used to approximate the - * directional dirivative well. - * - * @param oldGrad[in] input gradient - * newGrad[out] output gradient - * dim dimension of oldGrad/newGrad - * - * @return sum_i(oldGrad[i] * newGrad[i]) - */ -double genPerturbation(const real* oldGrad, real* newGrad, size_t dim); - -void initWeight(MatrixPtr& weights); - -void initBatchState(LayerPtr dataLayer, - LayerPtr testLayer, - LayerStatePtr state, - bool useGpu); - -/** - * @brief initialize the dataLayer by its inputType - * - * @param testConf[in] test config - * dataLayers[out] dataLayers - * datas[out] initialized data of dataLayers - * layerMap[out] layerMap - */ -void initDataLayer(TestConfig testConf, - std::vector* dataLayers, - vector* datas, - LayerMap* layerMap, - string testLayerName, - size_t batchSize, - bool trans, - bool useGpu); - -/** - * @brief initialize the parameter of testLayer - * - * @param testConf[in/out] test config - * layerMap[out] layerMap - * parameters[out] parameters of testLayer - * testLayer[out] testLayer - */ -void initTestLayer(TestConfig testConf, - LayerMap* layerMap, - std::vector* parameters, - LayerPtr* testLayer); - -/** - * @brief Test whether the layer's forward calculation is stable by adding - * perturbation to its parameters - * - * @param testConf[in] test config - * weights[in] weights of testLayer - * state[in] state of testLayer - * cost[in] input cost - * callbackCount[in] number of done callback - * maxDiff[in/out] max of all previous diff - * testLayer[in/out] testLayer - * parameters[in/out] parameters of testLayer - */ -void testPerturbParameter(TestConfig testConf, - const MatrixPtr weights, - const LayerStatePtr state, - real cost, - real callbackCount, - real* maxDiff, - LayerPtr testLayer, - std::vector* parameters); - -/** - * @brief Test whether the layer's forward calculation is stable by adding - * perturbation to its input layers - * - * @param testConf[in] test config - * weights[in] weights of testLayer - * state[in] state of testLayer - * cost[in] input cost - * callbackCount[in] number of done callback - * maxDiff[in/out] max of all previous diff - * testLayer[in/out] testLayer - * dataLayers[in/out] dataLayers - */ -void testPerturbInput(TestConfig testConf, - const MatrixPtr weights, - const LayerStatePtr state, - real cost, - real callbackCount, 
- real* maxDiff, - LayerPtr testLayer, - std::vector dataLayers); - -void testLayerGradKernel(TestConfig testConf, - string testLayerName, - size_t batchSize, - bool trans, - bool useGpu, - bool useWeight = false, - float epsilon = 0.02); - -void testLayerGrad(TestConfig testConf, - string testLayerName, - size_t batchSize, - bool trans, - bool useGpu, - bool useWeight = false, - float epsilon = 0.02); - -void testProjectionGrad(ProjectionConfig conf, - InputType inputType, - size_t parameterSize, - size_t batchSize, - bool useGpu, - bool testState = false, - int biasSize = 0, - bool sharedBias = false); - -void testOperatorGrad(TestConfig& config, - OperatorConfig& operatorConf, - size_t batchSize, - bool useGpu, - bool testState = false); - -} // namespace paddle diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp deleted file mode 100644 index d2a9761a4e16832a0722d4375cc11adb42524a8c..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/MKLDNNTester.cpp +++ /dev/null @@ -1,580 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MKLDNNTester.h" -#include "paddle/gserver/layers/MKLDNNBase.h" -#include "paddle/gserver/layers/MKLDNNLayer.h" -#include "paddle/trainer/Trainer.h" - -namespace paddle { - -// init data layer and test layer of both dnn and reference -void MKLDNNTester::reset(const TestConfig& dnn, - const TestConfig& ref, - size_t batchSize) { - const bool trans = false; - const bool useGpu = false; - - // clear - configs_.clear(); - layerNames_.clear(); - dataLayers_.clear(); - datas_.clear(); - layerMaps_.clear(); - parameters_.clear(); - testLayers_.clear(); - - // resize - configs_.resize(NUM); - layerNames_.resize(NUM); - dataLayers_.resize(NUM); - datas_.resize(NUM); - layerMaps_.resize(NUM); - parameters_.resize(NUM); - testLayers_.resize(NUM); - - // reset configs and layer names - configs_[DNN] = dnn; - configs_[REF] = ref; - layerNames_[DNN] = "mkldnn"; // the first is mkldnn layer - layerNames_[REF] = "reference"; // second is reference layer - - // reset others - for (size_t i = 0; i < NUM; ++i) { - configs_[i].layerConfig.set_name(layerNames_[i]); - initDataLayer(configs_[i], - &(dataLayers_[i]), - &(datas_[i]), - &(layerMaps_[i]), - layerNames_[i], - batchSize, - trans, - useGpu); - initTestLayer( - configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i])); - } - refLayer_ = testLayers_[REF]; - dnnLayer_ = testLayers_[DNN]; - EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size()); - EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); - setInputImgSize(); - - // for comparison with Paddle reference results, - // need manually add cpu device output for test - MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); - if (dnnLayer) { - dnnLayer->addOutputArgument(CPU_DEVICE); - } -} - -void MKLDNNTester::setInputImgSize() { - for (size_t n = 0; n < dataLayers_.size(); ++n) { - for (size_t i = 0; i < dataLayers_[n].size(); 
++i) { - // TODO(TJ): fix me when concat and elewise ready - dataLayers_[n][i]->getOutput().setFrameHeight(ih_); - dataLayers_[n][i]->getOutput().setFrameWidth(iw_); - } - } -} - -// init randome parameters of ref, and copy to mkldnn -void MKLDNNTester::randomWgtDatas() { - EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); - const bool isBN = refLayer_->getType() == "batch_norm"; - for (size_t i = 0; i < parameters_[REF].size(); ++i) { - const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); - const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE); - parameters_[REF][i]->randomize(); - if (isBN && i == 2) { - // this param is moving average in batch norm, which must larger than 0 - real offset = fabs(refValue->getMin()) + 1.0; - refValue->add(offset); - } - dnnValue->copyFrom(*refValue); - - VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName(); - printVector(dnnValue); - } -} - -// random botdata of ref layer and copy same to mkldnn -void MKLDNNTester::randomBotDatas() { - CHECK_EQ(dataLayers_.size(), NUM); - for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { - dataLayers_[REF][i]->getOutputValue()->randomizeUniform(); - dataLayers_[DNN][i]->getOutputValue()->copyFrom( - *(dataLayers_[REF][i]->getOutputValue())); - VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i; - printMatrix(dataLayers_[REF][i]->getOutputValue()); - } -} - -void MKLDNNTester::randomTopDiffs() { - refLayer_->getOutputGrad()->randomizeUniform(); - dnnLayer_->getOutput(CPU_DEVICE) - .grad->copyFrom(*(refLayer_->getOutputGrad())); - VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad"; - printMatrix(refLayer_->getOutputGrad()); -} - -void MKLDNNTester::checkForward() { - VLOG(MKLDNN_TESTS) << "Check Forward"; - printTopDatas(); - double delta = - compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue()); - EXPECT_LE(fabs(delta), eps_); -} - -void MKLDNNTester::checkBackwardData() { - VLOG(MKLDNN_TESTS) << "Check Backward Data"; - const bool isBN = refLayer_->getType() == "batch_norm"; - for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { - const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad(); - const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad(); - VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i; - printMatrix(dnnDiff); - VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i; - printMatrix(refDiff); - - double delta = compareMatrix(refDiff, dnnDiff); - EXPECT_LE(fabs(delta), eps_); - if (isBN) { - // the other two inputs in batch norm are for moving mean and var - // do not have grad to compare - break; - } - } -} - -void MKLDNNTester::checkBackwardWgts() { - VLOG(MKLDNN_TESTS) << "Check Backward Weight"; - CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); - vector dnnWgts; // used to temply save mkldnn weights - saveWgt(parameters_[DNN], dnnWgts); - - MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); - if (dnnLayer) { - dnnLayer->convertWeightsToPaddle(); - } - for (size_t i = 0; i < parameters_[DNN].size(); ++i) { - const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); - const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE); - VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value" - << parameters_[DNN][i]->getName(); - printVector(dnn); - VLOG(MKLDNN_ALL) << "Reference Result: weight value " - << parameters_[REF][i]->getName(); - printVector(ref); - - double delta = compareVector(ref, dnn); - EXPECT_LE(fabs(delta), eps_); - } - - 
VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre"; - restoreWgt(dnnWgts, parameters_[DNN]); -} - -void MKLDNNTester::saveWgt(const vector& from, - vector& to) { - const bool useGpu = false; - to.resize(from.size()); - for (size_t i = 0; i < to.size(); ++i) { - const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE); - to[i] = Vector::create(wgt->getSize(), useGpu); - to[i]->copyFrom(*wgt); - } -} - -void MKLDNNTester::restoreWgt(const vector& from, - vector& to) { - CHECK_EQ(from.size(), to.size()); - for (size_t i = 0; i < from.size(); ++i) { - const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE); - wgt->copyFrom(*from[i]); - } -} - -// clear parameters grad -void MKLDNNTester::clearWgtDiffs(size_t id) { - CHECK_LE(id, parameters_.size()); - for (size_t n = 0; n < parameters_.size(); ++n) { - if (id == n || id == parameters_.size()) { - for (size_t i = 0; i < parameters_[n].size(); ++i) { - const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT); - if (grad) { - grad->zeroMem(); - } - } - } - } -} - -void MKLDNNTester::clearBotDiffs(size_t id) { - CHECK_LE(id, dataLayers_.size()); - for (size_t n = 0; n < dataLayers_.size(); ++n) { - if (id == n || id == dataLayers_.size()) { - // clear inputs layers of this specific layer - for (size_t i = 0; i < dataLayers_[n].size(); ++i) { - dataLayers_[n][i]->getOutputGrad()->zeroMem(); - } - } - } -} - -void MKLDNNTester::clearTopDatas(size_t id) { - CHECK_LE(id, testLayers_.size()); - for (size_t i = 0; i < testLayers_.size(); ++i) { - if (id == i || id == testLayers_.size()) { - testLayers_[i]->getOutputValue()->zeroMem(); - } - } -} - -void MKLDNNTester::printTopDatas() { - if (!log_) { - return; - } - - for (int n = 0; n < NUM; ++n) { - VLOG(MKLDNN_ALL) << testLayers_[n]->getType() - << " Forward Result: OutputValue"; - printMatrix(testLayers_[n]->getOutputValue()); - } -} - -void MKLDNNTester::printMatrix(const MatrixPtr& m) { - if (!log_) { - return; - } - - std::ostringstream ostr; - m->print(ostr); - VLOG(MKLDNN_ALL) << std::endl << ostr.str(); -} - -void MKLDNNTester::printVector(const VectorPtr& v) { - if (!log_) { - return; - } - - std::ostringstream ostr; - v->print(ostr, v->getSize()); - VLOG(MKLDNN_ALL) << std::endl << ostr.str(); -} - -double MKLDNNTester::getDelta(const real* refer, - const real* value, - size_t len, - const float failRate, - const float thres) { - double delta = 0, sum = 0; - int failCnt = 0; - const double eps = 1e-5; - double maxRatio = 0; - for (size_t i = 0; i < len; ++i) { - double ref = fabs(refer[i]); - double val = fabs(value[i]); - double diff = fabs(refer[i] - value[i]); - delta += diff; - sum += ref; - if (ref < eps && val < eps) { // both values are very small - continue; - } - double ratio = diff / ref; - if (ratio > thres) { - maxRatio = std::max(maxRatio, ratio); - failCnt++; - } - } - EXPECT_FALSE(std::isinf(sum)); - EXPECT_FALSE(std::isnan(sum)); - EXPECT_FALSE(std::isnan(delta)); - VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len - << ", delta: " << delta / sum << ", failCnt:" << failCnt; - double res = sum > eps ? delta / sum : eps; - return (failCnt / (float)len) > failRate ? 
maxRatio : res; -} - -double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) { - CHECK_EQ(m1->getElementCnt(), m2->getElementCnt()); - return getDelta(m1->getData(), m2->getData(), m1->getElementCnt()); -} - -double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) { - CHECK_EQ(v1->getSize(), v2->getSize()); - return getDelta(v1->getData(), v2->getData(), v1->getSize()); -} - -void MKLDNNTester::runOnce() { - // test forward - randomBotDatas(); - dnnLayer_->forward(passType_); - refLayer_->forward(passType_); - checkForward(); - - if (passType_ == PASS_TEST) { - return; - } - - // test backward - // simple updater - UpdateCallback updateCallback = [](Parameter* para) { - auto& grad = para->getBuf(PARAMETER_GRADIENT); - auto& value = para->getBuf(PARAMETER_VALUE); - real lr = 1e-2; - value->add(*grad, lr); - grad->zeroMem(); - }; - randomTopDiffs(); - dnnLayer_->backward(updateCallback); - refLayer_->backward(updateCallback); - checkBackwardData(); - checkBackwardWgts(); - - // clear buffers - // ref code will addto the diff, dnn code will writeto it - // and clearTopDatas(REF) should be coverd by ref layers - clearBotDiffs(REF); - clearWgtDiffs(REF); - // it is necessary to clear bottom diffs when only activation is dnn type - if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) { - clearBotDiffs(DNN); - } -} - -void MKLDNNTester::run(const TestConfig& dnn, - const TestConfig& ref, - size_t batchSize, - size_t inputImgH, - size_t inputImgW, - PassType passType, - bool printDetails, - size_t iter, - float epsilon) { - CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 || - dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) - << "should be MKLDNN layer or MKLDNN activation"; - if (dnn.layerConfig.type() == ref.layerConfig.type()) { - VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " - << dnn.layerConfig.active_type() << " vs " - << ref.layerConfig.active_type(); - } else { - VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " - << dnn.layerConfig.type() << " vs " - << ref.layerConfig.type(); - } - - ih_ = inputImgH; - iw_ = inputImgW; - passType_ = passType; - log_ = printDetails; - iter_ = iter; - eps_ = epsilon; - - // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight - reset(dnn, ref, batchSize); - randomWgtDatas(); - clearWgtDiffs(); - clearBotDiffs(); - for (size_t i = 0; i < iter_; ++i) { - VLOG(MKLDNN_TESTS) << "Check Iteration " << i; - runOnce(); - } - - if (parameters_[DNN].empty()) { - // has no paramters - return; - } - - // After run some iterations, the mkldnn weight has been stored in dnnLayer - // and we can also get the mkldnn weight parameter header format. - // Weight parameter should always be index 0 (and bias index 1). 
- // TODO(TJ): should also consider mean and var format when batchnorm ready - int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat(); - int refWgtFmt = parameters_[REF][0]->getHeaderFormat(); - if (dnnWgtFmt == refWgtFmt) { - // weight format are equal, so no need check more - return; - } - - // then save the weights and restart again - vector dnnWgts, refWgts; - CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); - saveWgt(parameters_[DNN], dnnWgts); - saveWgt(parameters_[REF], refWgts); - - // restart again with dnn weight format - reset(dnn, ref, batchSize); - // TODO(TJ): should also considerate mean and var format when batchnorm ready - parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt); - - // restore wgt - restoreWgt(dnnWgts, parameters_[DNN]); - restoreWgt(refWgts, parameters_[REF]); - clearWgtDiffs(); - clearBotDiffs(); - - for (size_t i = 0; i < iter_; ++i) { - VLOG(MKLDNN_TESTS) << "Check Iteration " << i; - runOnce(); - } -} - -void MKLDNNTester::initArgument(DataIn& data, - const std::string& configPath, - const size_t iter) { - TrainerConfigHelper config(configPath); - size_t batchSize = config.getOptConfig().batch_size(); - data.inArgs.resize(iter); - data.outGrads.resize(iter); - data.paraValues.clear(); - for (const auto& layer_name : config.getModelConfig().input_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - for (size_t i = 0; i < iter; ++i) { - Argument arg; - arg.value = Matrix::create(batchSize, layerSize, false, false); - arg.grad = Matrix::create(batchSize, layerSize, false, false); - arg.value->randomizeUniform(); - arg.value->add(-0.5); - arg.value->sigmoid(*arg.value); - arg.grad->zeroMem(); - arg.ids = VectorT::create(batchSize, false); - arg.ids->rand(layerSize); - generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); - data.inArgs[i].push_back(arg); - } - } - - for (const auto& layer_name : config.getModelConfig().output_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - for (size_t i = 0; i < iter; ++i) { - MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false); - grad->randomizeUniform(); - data.outGrads[i].push_back(grad); - } - } - - for (const auto& para_config : config.getModelConfig().parameters()) { - VectorPtr value = Vector::create(para_config.size(), false); - value->randnorm(0, 2); - data.paraValues.push_back(value); - } -} - -void MKLDNNTester::getOutResult(const std::string& configPath, - DataIn& in, - DataOut& out, - bool use_mkldnn, - size_t iter) { - FLAGS_use_gpu = false; - FLAGS_use_mkldnn = use_mkldnn; - *ThreadLocalRand::getSeed() = 1; - srand(1); - - Trainer trainer; - auto config = std::make_shared(configPath); - trainer.init(config, false); - auto gradientMachine = trainer.getGradientMachine(); - std::vector parameters = gradientMachine->getParameters(); - for (size_t i = 0; i < in.paraValues.size(); i++) { - parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); - } - UpdateCallback simpleUpdate = [](Parameter* para) 
{ - auto& grad = para->getBuf(PARAMETER_GRADIENT); - auto& value = para->getBuf(PARAMETER_VALUE); - real lr = 1e-2; - value->add(*grad, lr); - grad->zeroMem(); - }; - - vector outArgs; - gradientMachine->start(); - out.outValues.clear(); - out.paraValues.clear(); - for (size_t i = 0; i < iter; ++i) { - VLOG(MKLDNN_TESTS) << "runing iteration " << i; - gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN); - // save forward result - for (size_t k = 0; k < outArgs.size(); k++) { - const MatrixPtr& src = outArgs[k].value; - MatrixPtr dst = - Matrix::create(src->getHeight(), src->getWidth(), false, false); - if (typeid(*src) == typeid(MKLDNNMatrix)) { - MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast(src); - dnnSrc->copyTo(*dst); - } else { - dst->copyFrom(*src); - } - out.outValues.push_back(dst); - } - - // random backward input - for (size_t k = 0; k < outArgs.size(); k++) { - outArgs[k].grad->copyFrom(*in.outGrads[i][k]); - } - gradientMachine->backward(simpleUpdate); - } - gradientMachine->finish(); - - // save param value - for (size_t i = 0; i < in.paraValues.size(); i++) { - VectorPtr val = Vector::create( - parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false); - val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); - out.paraValues.push_back(val); - } -} - -void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) { - CHECK_EQ(ref.outValues.size(), dnn.outValues.size()); - CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size()); - for (size_t i = 0; i < ref.outValues.size(); i++) { - VLOG(MKLDNN_TESTS) << "compare value index: " << i; - EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps); - } - for (size_t i = 0; i < ref.paraValues.size(); i++) { - VLOG(MKLDNN_TESTS) << "compare param index: " << i; - EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps); - } -} - -void MKLDNNTester::runNetTest(const std::string& configPath, - size_t iter, - float eps) { - DataIn in; - initArgument(in, configPath, iter); - DataOut outCpu, outDnn; - VLOG(MKLDNN_TESTS) << "runing cpu network"; - getOutResult(configPath, in, outCpu, false, iter); - VLOG(MKLDNN_TESTS) << "runing mkldnn network"; - getOutResult(configPath, in, outDnn, true, iter); - - compareResult(outCpu, outDnn, eps); -} - -} // namespace paddle diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h deleted file mode 100644 index 41ac46b70ab08d4071f4e6abfca94667268015d7..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/MKLDNNTester.h +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "LayerGradUtil.h" -#include "paddle/gserver/layers/MKLDNNBase.h" -#include "paddle/gserver/layers/MKLDNNLayer.h" - -namespace paddle { - -/** - * @brief test the functionality of MKLDNNlayers and MKLDNNActivations - * refer to paddle original function - */ -class MKLDNNTester { - enum { - DNN = 0, // MKLDNN layer - REF = 1, // Reference layer - NUM = 2, // Number of total - }; - - struct DataIn { - std::vector> inArgs; - std::vector> outGrads; - std::vector paraValues; - }; - - struct DataOut { - std::vector outValues; - std::vector paraValues; - }; - - protected: - std::vector configs_; - vector layerNames_; - vector> dataLayers_; - vector> datas_; - vector layerMaps_; - vector> parameters_; - vector testLayers_; - LayerPtr refLayer_, dnnLayer_; - - /// run some iterations, all the result should pass - size_t iter_; - /// whether to print out the details - bool log_; - /// epsilon - float eps_; - /// input image size, default 1 - size_t ih_, iw_; - /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass) - PassType passType_; - - public: - explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) { - iter_ = iter; - eps_ = epsilon; - log_ = false; - passType_ = PASS_TRAIN; - } - - ~MKLDNNTester() {} - - public: - void run(const TestConfig& dnn, - const TestConfig& ref, - size_t batchSize, - size_t inputImgH = 1, - size_t inputImgW = 1, - PassType passType = PASS_TRAIN, - bool printDetails = false, - size_t iter = 3, - float epsilon = 1e-4); - static void runNetTest(const std::string& configPath, - size_t iter = 2, - float eps = 1e-4); - static void initArgument(DataIn& data, - const std::string& configPath, - size_t iter = 2); - static void getOutResult(const std::string& configPath, - DataIn& in, - DataOut& out, - bool use_mkldnn, - size_t iter = 2); - - private: - void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize); - void setInputImgSize(); - void runOnce(); - - void randomWgtDatas(); - void randomBotDatas(); - void randomTopDiffs(); - - void checkForward(); - void checkBackwardData(); - void checkBackwardWgts(); - - // clear specific layer, clear all when id equals NUM - void clearWgtDiffs(size_t id = NUM); - void clearBotDiffs(size_t id = NUM); - void clearTopDatas(size_t id = NUM); - - void printTopDatas(); - void printMatrix(const MatrixPtr& m); - void printVector(const VectorPtr& v); - - void saveWgt(const vector& from, vector& to); - void restoreWgt(const vector& from, vector& to); - - static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2); - static double compareVector(const VectorPtr& v1, const VectorPtr& v2); - static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4); - - /** - * Get delta percent - * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points - * return the max(diff/ref) - * else return sum(abs(diff)) / sum(abs(ref)) - * The return value should be smaller than eps when passing. 
- */ - static double getDelta(const real* refer, - const real* value, - size_t len, - const float failRate = 1e-3, - const float thres = 0.1); -}; - -} // namespace paddle diff --git a/paddle/gserver/tests/Sequence/train.list b/paddle/gserver/tests/Sequence/train.list deleted file mode 100644 index be27acb3a5411d8fe65797079a9a5977c1f0f90a..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/Sequence/train.list +++ /dev/null @@ -1 +0,0 @@ -gserver/tests/Sequence/tour_train_wdseg diff --git a/paddle/gserver/tests/Sequence/train.list.nest b/paddle/gserver/tests/Sequence/train.list.nest deleted file mode 100644 index 7683ebc68efbb07ce01d8faab14574109df99af9..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/Sequence/train.list.nest +++ /dev/null @@ -1 +0,0 @@ -gserver/tests/Sequence/tour_train_wdseg.nest diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf deleted file mode 100644 index 50f2d89d0271b2eaa460e57636eb09b6d6aeda18..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_layer_group.conf +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 256 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# (lstm_input + lstm) is equal to lstmemory -with mixed_layer(size=hidden_dim * 4) as lstm_input: - lstm_input += full_matrix_projection(input=emb) - -lstm = lstmemory_group( - input=lstm_input, - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation()) - -lstm_last = last_seq(input=lstm) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=lstm_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf deleted file mode 100644 index f49a827f22edce056eaf9903e99b732cab7f3784..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_lstm.conf +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 256 -label_dim = 3 -sparse_update = get_config_arg("sparse_update", bool, False) - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer( - input=data, - size=word_dim, - param_attr=ParamAttr(sparse_update=sparse_update)) - -with mixed_layer(size=hidden_dim * 4) as lstm_input: - lstm_input += full_matrix_projection(input=emb) - -lstm = lstmemory( - input=lstm_input, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation()) - -lstm_last = last_seq(input=lstm) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=lstm_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf deleted file mode 100644 index 71ef53d08a2cea070806afb2c65ef15c4dd28f31..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_layer_group.conf +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list.nest', - test_list=None, - module='sequenceGen', - obj='process2', - args={"dict_file": dict_file}) - -settings(batch_size=2) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 256 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb_group = embedding_layer(input=data, size=word_dim) - - -# (lstm_input + lstm) is equal to lstmemory -def lstm_group(lstm_group_input): - with mixed_layer(size=hidden_dim * 4) as group_input: - group_input += full_matrix_projection(input=lstm_group_input) - - lstm_output = lstmemory_group( - input=group_input, - name="lstm_group", - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation()) - return lstm_output - - -lstm_nest_group = recurrent_group( - input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group") -# hasSubseq ->(seqlastins) seq -lstm_last = last_seq( - input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE) - -# seq ->(expand) hasSubseq -lstm_expand = expand_layer( - input=lstm_last, - expand_as=emb_group, - expand_level=ExpandLevel.FROM_SEQUENCE) - -# hasSubseq ->(average) seq -lstm_average = pooling_layer( - input=lstm_expand, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.TO_SEQUENCE) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=lstm_average) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf deleted file mode 100644 index 2873a599669b4281a53cd71e8bb56f0d18c26b5a..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_rnn.conf +++ /dev/null @@ -1,74 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_subseq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# This hierachical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn.conf - -def outer_step(x): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - def inner_step(y): - inner_mem = memory(name="inner_rnn_state", - size=hidden_dim, - boot_layer=outer_mem) - out = fc_layer(input=[y, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="inner_rnn_state") - return out - - inner_rnn_output = recurrent_group( - step=inner_step, - name="inner", - input=x) - last = last_seq(input=inner_rnn_output, name="outer_rnn_state") - - # "return last" won't work, because recurrent_group only support the input - # sequence type is same as return sequence type. - return inner_rnn_output - -out = recurrent_group( - name="outer", - step=outer_step, - input=SubsequenceInput(emb)) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf deleted file mode 100644 index afdacfffd7aecfe2f4762f04a987126381bcea34..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf +++ /dev/null @@ -1,76 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_subseq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# This hierachical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn.conf - -def outer_step(wid, x): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - def inner_step(y, wid): - z = embedding_layer(input=wid, size=word_dim) - inner_mem = memory(name="inner_rnn_state", - size=hidden_dim, - boot_layer=outer_mem) - out = fc_layer(input=[y, z, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="inner_rnn_state") - return out - - inner_rnn_output = recurrent_group( - step=inner_step, - name="inner", - input=[x, wid]) - last = last_seq(input=inner_rnn_output, name="outer_rnn_state") - - # "return last" should also work. But currently RecurrentGradientMachine - # does not handle it, and will report error: In hierachical RNN, all out - # links should be from sequences now. - return inner_rnn_output - -out = recurrent_group( - name="outer", - step=outer_step, - input=[SubsequenceInput(data), SubsequenceInput(emb)]) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py deleted file mode 100644 index 569d3c094b6f5517dad0f1e04f98de12aaef9633..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
-from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_unequalength_subseq') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 2 - -speaker1 = data_layer(name="word1", size=dict_dim) -speaker2 = data_layer(name="word2", size=dict_dim) - -emb1 = embedding_layer(input=speaker1, size=word_dim) -emb2 = embedding_layer(input=speaker2, size=word_dim) - - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf -def outer_step(x1, x2): - index = [0] - - def inner_step(ipt): - index[0] += 1 - i = index[0] - outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim) - - def inner_step_impl(y): - inner_mem = memory( - name="inner_rnn_state_" + y.name, - size=hidden_dim, - boot_layer=outer_mem) - out = fc_layer( - input=[y, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='inner_rnn_state_' + y.name) - return out - - encoder = recurrent_group( - step=inner_step_impl, name='inner_%d' % i, input=ipt) - last = last_seq(name="outer_rnn_state_%d" % i, input=encoder) - return encoder, last - - encoder1, sentence_last_state1 = inner_step(ipt=x1) - encoder2, sentence_last_state2 = inner_step(ipt=x2) - - encoder1_expand = expand_layer( - input=sentence_last_state1, expand_as=encoder2) - - return [encoder1_expand, encoder2] - - -encoder1_rep, encoder2_rep = recurrent_group( - name="outer", - step=outer_step, - input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], - targetInlink=emb2) - -encoder1_last = last_seq(input=encoder1_rep) -encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) -context = mixed_layer( - input=[ - identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep) - ], - size=hidden_dim) - -rep = last_seq(input=context) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs( - classification_cost( - input=prob, label=data_layer( - name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py deleted file mode 100644 index b88c09084e1bc167a177b59566e9794ac4d616c7..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_recurrent.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 128 -label_dim = 3 - -# This config is designed to be equivalent with sequence_recurrent_group.py - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer( - input=data, size=word_dim, param_attr=ParamAttr(name="emb")) - -recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation()) - -recurrent_last = last_seq(input=recurrent) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=recurrent_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py deleted file mode 100644 index 0daf746700231d302550004b1c10729e36807b8b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_recurrent_group.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from paddle.trainer_config_helpers import * - -######################## data source ################################ -dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict' -dict_file = dict() -for line_count, line in enumerate(open(dict_path, "r")): - dict_file[line.strip()] = line_count - -define_py_data_sources2( - train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file": dict_file}) - -settings(batch_size=5) -######################## network configure ################################ -dict_dim = len(open(dict_path, 'r').readlines()) -word_dim = 128 -hidden_dim = 128 -label_dim = 3 - -# This config is designed to be equivalent with sequence_recurrent.py - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer( - input=data, size=word_dim, param_attr=ParamAttr(name="emb")) - - -def step(y): - mem = memory(name="rnn_state", size=hidden_dim) - with mixed_layer( - name="rnn_state", - size=hidden_dim, - bias_attr=False, - act=SoftmaxActivation()) as out: - out += identity_projection(input=y) - out += full_matrix_projection( - input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__")) - return out - - -recurrent = recurrent_group(name="rnn", step=step, input=emb) - -recurrent_last = last_seq(input=recurrent) - -with mixed_layer( - size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: - output += full_matrix_projection(input=recurrent_last) - -outputs( - classification_cost( - input=output, label=data_layer( - name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf deleted file mode 100644 index 1084edfe708c3348d40b67e270f64d8cda3cee0f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn.conf +++ /dev/null @@ -1,57 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_seq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -def step(y): - mem = memory(name="rnn_state", size=hidden_dim) - out = fc_layer(input=[y, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="rnn_state") - return out - -out = recurrent_group( - name="rnn", - step=step, - input=emb) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py deleted file mode 100644 index 41a581e0ccd59588d1bcce9345056bea9d80b73d..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_mixed') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 2 -hidden_dim = 2 -label_dim = 2 - -data1 = data_layer(name="word1", size=dict_dim) -data2 = data_layer(name="word2", size=dict_dim) -label = data_layer(name="label", size=label_dim) - -encoding = embedding_layer(input=data2, size=word_dim) - -subseq = embedding_layer(input=data1, size=word_dim) -seq = embedding_layer(input=data2, size=word_dim) -nonseq = embedding_layer(input=label, size=word_dim) - - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_mixed_inputs.conf -def outer_step(subseq, seq, nonseq, encoding): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - - def inner_step(subseq, seq, nonseq): - inner_mem = memory( - name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) - - out = fc_layer( - input=[subseq, seq, nonseq, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='inner_rnn_state') - return out - - decoder = recurrent_group( - step=inner_step, name='inner', input=[subseq, seq, nonseq]) - last = last_seq(name="outer_rnn_state", input=decoder) - context = simple_attention( - encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) - return context - - -out = recurrent_group( - name="outer", - step=outer_step, - input=[ - subseq, expand_layer( - seq, expand_as=subseq, - expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer( - nonseq, - expand_as=subseq, - expand_level=ExpandLevel.FROM_NO_SEQUENCE), - StaticInput(encoding) - ]) - -rep = last_seq(input=out) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py deleted file mode 100644 index ae89d8e2bb6f672eaf697ae4d24895b89f76544f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_mixed') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 2 -hidden_dim = 2 -label_dim = 2 - -data1 = data_layer(name="word1", size=dict_dim) -data2 = data_layer(name="word2", size=dict_dim) -label = data_layer(name="label", size=label_dim) - -encoding = embedding_layer(input=data2, size=word_dim) - - -# This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_matched_inputs.conf -def outer_step(subseq, seq, nonseq, encoding): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - - def inner_step(data1, data2, label): - inner_mem = memory( - name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) - - subseq = embedding_layer(input=data1, size=word_dim) - seq = embedding_layer(input=data2, size=word_dim) - nonseq = embedding_layer(input=label, size=word_dim) - - print_layer(input=[data1, seq, label, inner_mem]) - out = fc_layer( - input=[subseq, seq, nonseq, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='inner_rnn_state') - return out - - decoder = recurrent_group( - step=inner_step, name='inner', - input=[subseq, StaticInput(seq), nonseq]) - last = last_seq(name="outer_rnn_state", input=decoder) - context = simple_attention( - encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) - return context - - -out = recurrent_group( - name="outer", - step=outer_step, - input=[data1, data2, StaticInput(label), StaticInput(encoding)]) - -rep = last_seq(input=out) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf deleted file mode 100644 index 9fae974f3079c49ad03d6ba34e30190f325414e8..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_multi_input.conf +++ /dev/null @@ -1,58 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_seq') - - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 3 - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -def step(y, wid): - z = embedding_layer(input=wid, size=word_dim) - mem = memory(name="rnn_state", size=hidden_dim) - out = fc_layer(input=[y, z, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="rnn_state") - return out - -out = recurrent_group( - name="rnn", - step=step, - input=[emb, data]) - -rep = last_seq(input=out) -prob = fc_layer(size=label_dim, - input=rep, - act=SoftmaxActivation(), - bias_attr=True) - -outputs(classification_cost(input=prob, - label=data_layer(name="label", size=label_dim))) diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py deleted file mode 100644 index 6473fb3f3eddc803282911a156c489e4ba39aded..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. 
- -from paddle.trainer_config_helpers import * - -######################## data source ################################ -define_py_data_sources2( - train_list='gserver/tests/Sequence/dummy.list', - test_list=None, - module='rnn_data_provider', - obj='process_unequalength_seq') - -settings(batch_size=2, learning_rate=0.01) -######################## network configure ################################ -dict_dim = 10 -word_dim = 8 -hidden_dim = 8 -label_dim = 2 - -speaker1 = data_layer(name="word1", size=dict_dim) -speaker2 = data_layer(name="word2", size=dict_dim) - -emb1 = embedding_layer(input=speaker1, size=word_dim) -emb2 = embedding_layer(input=speaker2, size=word_dim) - -# This hierachical RNN is designed to be equivalent to the RNN in -# sequence_nest_rnn_multi_unequalength_inputs.conf - - -def step(x1, x2): - def calrnn(y): - mem = memory(name='rnn_state_' + y.name, size=hidden_dim) - out = fc_layer( - input=[y, mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name='rnn_state_' + y.name) - return out - - encoder1 = calrnn(x1) - encoder2 = calrnn(x2) - return [encoder1, encoder2] - - -encoder1_rep, encoder2_rep = recurrent_group( - name="stepout", step=step, input=[emb1, emb2]) - -encoder1_last = last_seq(input=encoder1_rep) -encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) -context = mixed_layer( - input=[ - identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep) - ], - size=hidden_dim) - -rep = last_seq(input=context) -prob = fc_layer( - size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) - -outputs( - classification_cost( - input=prob, label=data_layer( - name="label", size=label_dim))) diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp deleted file mode 100644 index b5e4af26dc123be3748adb4faed5fe1656ca44b3..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_bool(thread_local_rand_use_global_seed); - -void testActivation(const string& act) { - LOG(INFO) << "test activation: " << act; - size_t size = 10; - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("addto"); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type(act); - config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); - config.layerConfig.add_inputs(); - for (auto useGpu : {false, true}) { - testLayerGrad(config, - act + "_activation", - 100, - /* trans= */ false, - useGpu, - /* useWeight */ true); - } -} - -TEST(Activation, activation) { - auto types = ActivationFunction::getAllRegisteredTypes(); - std::set excluded{"sequence_softmax"}; - for (auto type : types) { - if (excluded.count(type)) continue; - testActivation(type); - } -} - -void testSequenceSoftmaxAct(bool hasSubseq) { - LOG(INFO) << "test activation: sequence softmax"; - - const size_t size = 1; - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("addto"); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type("sequence_softmax"); - config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, - "layer_0", - 1, - 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "sequence_softmax", - 100, - /* trans= */ false, - useGpu, - /* useWeight */ true); - } -} - -TEST(SequenceSoftmaxActivation, activation) { - for (auto hasSubseq : {false, true}) { - LOG(INFO) << "hasSubseq = " << hasSubseq; - testSequenceSoftmaxAct(hasSubseq); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp deleted file mode 100644 index a3ec66c75829c5ef0ae834656ee82e40be76c892..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_BatchNorm.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/cuda/include/hl_batch_norm.h" -#include "paddle/math/tests/TensorCheck.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -// Test that the batchNormLayer can be followed by a ConvLayer -TEST(Layer, batchNorm) { - FLAGS_use_gpu = false; - TestConfig configBN; - const int CHANNELS = 6272; - const int IMG_SIZE = 1; - configBN.layerConfig.set_type("batch_norm"); - configBN.layerConfig.set_name("bn"); - configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); - configBN.layerConfig.set_active_type("relu"); - configBN.biasSize = CHANNELS; - configBN.inputDefs.push_back({INPUT_DATA, - "layer_0", - /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, - /* paraSize= */ CHANNELS}); - - configBN.inputDefs.push_back( - {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); - configBN.inputDefs.back().isStatic = true; - configBN.inputDefs.push_back( - {INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); - configBN.inputDefs.back().isStatic = true; - - LayerInputConfig* input = configBN.layerConfig.add_inputs(); - configBN.layerConfig.add_inputs(); - configBN.layerConfig.add_inputs(); - - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(CHANNELS); - img_conf->set_img_size(IMG_SIZE); - - // Setting up conv-layer config - TestConfig config; - config.biasSize = 64; - config.layerConfig.set_type("exconv"); - config.layerConfig.set_num_filters(64); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800}); - input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(5); - conv->set_filter_size_y(5); - conv->set_channels(128); - conv->set_padding(1); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(7); - conv->set_output_x(3); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); - config.layerConfig.set_name("conv"); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer(configBN, - &dataLayers, - &datas, - &layerMap, - "batch_norm", - 100, - false, - false); - // test layer initialize - std::vector parameters; - LayerPtr bnLayer; - initTestLayer(configBN, &layerMap, ¶meters, &bnLayer); - - std::vector parameters2; - LayerPtr convLayer; - initTestLayer(config, &layerMap, ¶meters2, &convLayer); - - bnLayer->forward(PASS_GC); - convLayer->forward(PASS_GC); - - CHECK_EQ(static_cast(convLayer->getOutputValue()->getHeight()), 100); - CHECK_EQ(static_cast(convLayer->getOutputValue()->getWidth()), 576); -} - -#ifdef PADDLE_WITH_CUDA -void batchNormInference(int n, int c, int h, int w) { - MatrixPtr input = std::make_shared(n, c * h * w); - MatrixPtr cudnnOut = std::make_shared(n, c * h * w); - MatrixPtr cudaOut = std::make_shared(n, c * h * w); - MatrixPtr cudnnCheck = std::make_shared(n, c * h * w); - MatrixPtr cudaCheck = std::make_shared(n, c * h * w); - input->randomizeUniform(); - 
cudnnOut->zeroMem(); - cudaOut->zeroMem(); - - MatrixPtr scale = std::make_shared(1, c); - scale->randomizeUniform(); - MatrixPtr bias = std::make_shared(1, c); - bias->randomizeUniform(); - - MatrixPtr movingMean = std::make_shared(1, c); - movingMean->randomizeUniform(); - - MatrixPtr movingVar = std::make_shared(1, c); - movingVar->randomizeUniform(); - movingVar->clip(0.01, 50); - - hl_tensor_descriptor ioDesc; - hl_tensor_descriptor bnDesc; - hl_create_tensor_descriptor(&ioDesc); - hl_create_tensor_descriptor(&bnDesc); - hl_tensor_reshape(ioDesc, n, c, h, w); - hl_tensor_reshape(bnDesc, 1, c, 1, 1); - - double EPS = 1E-5; - hl_batch_norm_forward_inference(ioDesc, - input->getData(), - ioDesc, - cudnnOut->getData(), - bnDesc, - scale->getData(), - bias->getData(), - movingMean->getData(), - movingVar->getData(), - EPS); - - hl_batch_norm_cuda_inference(input->getData(), - cudaOut->getData(), - scale->getData(), - bias->getData(), - movingMean->getData(), - movingVar->getData(), - EPS, - n, - c, - h, - w); - - cudnnCheck->copyFrom(*cudnnOut); - cudaCheck->copyFrom(*cudaOut); - autotest::TensorCheckErr(*cudnnCheck, *cudaCheck); - - hl_destroy_tensor_descriptor(ioDesc); - hl_destroy_tensor_descriptor(bnDesc); -} - -TEST(BatchNorm, Inference) { - batchNormInference(33, 267, 1, 1); - batchNormInference(19, 105, 4, 4); -} -#endif - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp deleted file mode 100644 index 9f3d2936569af8f1923a471f4d262e9a472649c0..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CRFLayerGrad.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/LinearChainCRF.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -static inline bool getNextSequence(std::vector& seq, int numClasses) { - for (auto& v : seq) { - if (++v < numClasses) { - return true; - } - v = 0; - } - return false; -} - -// log(exp(x) + exp(y)) -static inline real logSum(real x, real y) { - real maxValue = std::max(x, y); - if (std::isinf(maxValue)) { - return -std::numeric_limits::infinity(); - } else { - return maxValue + log(exp(x - maxValue) + exp(y - maxValue)); - } -} - -static inline std::vector genRandLabels(int numClasses, int length) { - std::vector labels(length); - for (int i = 0; i < length; ++i) { - labels[i] = rand() % numClasses; // NOLINT - } - return labels; -} - -TEST(CRFLayer, cost) { - const int numClasses = 4; - CpuVector para(numClasses * (numClasses + 2)); - real* a = para.getData(); - real* b = para.getData() + numClasses; - real* w = para.getData() + 2 * numClasses; - LinearChainCRF crf(4, para.getData()); - for (int length : {1, 2, 3, 10}) { - for (int tries = 0; tries < 10; ++tries) { - CpuMatrix x(length, numClasses); - x.randomizeUniform(); - para.randnorm(0, 2); - - std::vector goldenLabels = genRandLabels(numClasses, length); - - real cost = crf.forward(x.getData(), goldenLabels.data(), length); - - real logZ = -std::numeric_limits::infinity(); - real logNominator = -std::numeric_limits::infinity(); - std::vector testResult(length, 0); - do { - real score = a[testResult.front()]; - score += x.getElement(0, testResult.front()); - for (int k = 1; k < length; ++k) { - score += x.getElement(k, testResult[k]) + - w[numClasses * testResult[k - 1] + testResult[k]]; - } - score += b[testResult.back()]; - logZ = logSum(logZ, score); - - if (goldenLabels == testResult) { - logNominator = score; - } - } while (getNextSequence(testResult, numClasses)); - - real trueCost = -logNominator + logZ; - - real diff = fabs(trueCost - cost); - diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost); - VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff - << std::endl; - if (typeid(real) == typeid(double)) { // NOLINT - EXPECT_LE(diff, 1e-10); - } else { - EXPECT_LE(diff, 5e-3); - } - } - } -} - -inline real epsilon() { return typeid(real) == typeid(double) ? 
1e-10 : 0.06; } - -TestConfig initTestConfig(size_t numClasses, bool withWeight) { - TestConfig config; - config.layerConfig.set_type("crf"); - config.layerConfig.set_size(numClasses); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, - "layer_0", - numClasses, - numClasses * (numClasses + 2)}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back( - {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0}); - config.layerConfig.add_inputs(); - - if (withWeight) { - config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0}); - config.layerConfig.add_inputs(); - } - - return config; -} - -TEST(Layer, CRFLayer) { - size_t numClasses = 10; - for (int tries = 0; tries < 5; ++tries) { - TestConfig config = initTestConfig(numClasses, /* withWeight= */ false); - for (int length : {1, 3, 100}) { - // Not support GPU now - testLayerGrad(config, - "crf", - length, - /* trans= */ false, - /* useGpu= */ false, - /* useWeight= */ false, - epsilon()); - } - } -} - -TEST(Layer, CRFLayerUseWeight) { - size_t numClasses = 10; - for (int tries = 0; tries < 5; ++tries) { - TestConfig config = initTestConfig(numClasses, /* withWeight= */ true); - for (int length : {1, 3, 100}) { - // Not support GPU now - testLayerGrad(config, - "crf", - length, - /* trans= */ false, - /* useGpu= */ false, - /* useWeight= */ false, - epsilon()); - } - } -} - -int main(int argc, char** argv) { - initMain(argc, argv); - hl_start(); - hl_init(FLAGS_gpu_id); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp deleted file mode 100644 index 2fbc404125a9364ac44a990f8ec92962cf7d1298..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CompareSparse.cpp +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/trainer/Trainer.h" - -#include -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static const string& configFile1 = "gserver/tests/sequence_lstm.conf"; - -DECLARE_bool(use_gpu); -DECLARE_string(config); -DECLARE_int32(gpu_id); -DECLARE_int32(seed); -DECLARE_int32(num_passes); -DECLARE_int32(saving_period); - -DECLARE_int32(num_gradient_servers); -DECLARE_int32(port); -DECLARE_bool(local); -DECLARE_bool(use_old_updater); -DECLARE_bool(parallel_nn); -DECLARE_string(config_args); -DEFINE_double(max_diff_ratio, - 0.0f, - "max diff ratio allowed for parameters value"); - -int gNumDevices = 0; - -std::vector trainerOnePassTest(const string& configFile, - bool sparseUpdate, - int trainerCount = 1, - bool useGpu = false) { - FLAGS_use_gpu = useGpu; - FLAGS_config = configFile; - FLAGS_trainer_count = trainerCount; - FLAGS_config_args = sparseUpdate ? 
"sparse_update=1" : "sparse_update=0"; - - LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount - << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate; - srand(FLAGS_seed); - *ThreadLocalRand::getSeed() = FLAGS_seed; - ThreadLocalRandomEngine::get().seed(FLAGS_seed); - if (useGpu) { - CHECK_LE(trainerCount, gNumDevices); - } - - std::vector> pservers; - if (!FLAGS_local) { - int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; - pservers.resize(numPorts); - - for (int i = 0; i < numPorts; ++i) { - pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); - pservers[i]->init(); - pservers[i]->start(); - } - } - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig()); - trainer.train(); - return trainer.getGradientMachine()->getParameters(); -} - -std::vector& getDenseParameters() { - static std::vector denseParameters; - if (denseParameters.empty()) { - // use dense training as base - FLAGS_local = true; - denseParameters = trainerOnePassTest(configFile1, false); - } - - return denseParameters; -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - double maxDiffRatio) { - double maxDiff = 0; - double maxValue = 0; - for (size_t i = 0; i < len; ++i) { - double diff = fabs(A[i] - B[i]); - maxValue = std::max(maxValue, std::max(fabs(A[i]), fabs(B[i]))); - maxDiff = std::max(maxDiff, diff); - } - EXPECT_LE(maxDiff / maxValue, maxDiffRatio); - LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue - << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n"; -} - -void compareValue(const vector& parametersA, - const vector& parametersB, - double maxDiffRatio = 0.0) { - LOG(INFO) << "\n\n--------------------------------" - << " Check Gradient Machine Parameters:" - << " -------------------------------------\n"; - for (size_t i = 0; i < parametersA.size(); ++i) { - ParameterPtr parameterA, parameterB; - parameterA = parametersA[i]; - parameterB = parametersB[i]; - - CpuVector paraA(parameterA->getSize()); - CpuVector paraB(parameterB->getSize()); - paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); - paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); - - LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() - << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), - "para_A", - paraB.getData(), - "para_B", - paraA.getSize(), - maxDiffRatio); - } -} - -TEST(compareSparse, cpu) { - FLAGS_local = 1; // disable remote sparse update in parameter config - std::vector parameters = trainerOnePassTest(configFile1, true); - compareValue(getDenseParameters(), parameters); -} - -TEST(compareSparse, remote_cpu) { - FLAGS_local = 0; // will enable remote sparse update - FLAGS_ports_num_for_sparse = 5; - std::vector parameters = trainerOnePassTest(configFile1, true); - compareValue(getDenseParameters(), parameters); -} - -TEST(compareSparse, cpu10_local_vs_remote) { - FLAGS_local = 1; // disable remote sparse update in parameter config - std::vector localParameters = - trainerOnePassTest(configFile1, true, 2); - - FLAGS_local = 0; // will enable remote sparse update - FLAGS_ports_num_for_sparse = 5; - std::vector remoteParameters = - trainerOnePassTest(configFile1, true, 2); - - compareValue(localParameters, remoteParameters); -} - -TEST(compareSparse, multiGradientMachine) { - int numGpu; -#ifdef PADDLE_TYPE_DOUBLE - double eps = 1e-8; -#else - double eps = 1e-4; -#endif - numGpu = hl_get_device_count(); - for (bool local : 
{false, true}) { - FLAGS_local = local; - FLAGS_ports_num_for_sparse = 5; - for (bool useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) continue; -#endif - FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local << " useGpu=" << useGpu; - int trainerCount = useGpu ? numGpu : 2; - std::vector parameters = - trainerOnePassTest(configFile1, true, trainerCount, useGpu); - compareValue(getDenseParameters(), parameters, eps); - } - } - FLAGS_parallel_nn = false; -} - -TEST(compareSparse, NeuralNetwork) { -#ifdef PADDLE_TYPE_DOUBLE - double eps = 1e-8; -#else - double eps = 1e-4; -#endif - for (bool local : {false, true}) { - FLAGS_local = local; - FLAGS_ports_num_for_sparse = 5; - for (bool useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) continue; -#endif - FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local << " useGpu=" << useGpu; - int trainerCount = 1; - std::vector parameters = - trainerOnePassTest(configFile1, true, trainerCount, useGpu); - compareValue(getDenseParameters(), parameters, useGpu ? eps : 0); - } - } - FLAGS_parallel_nn = false; -} - -int main(int argc, char** argv) { - // FIXME(tonyyang-svail): - // Turn off this test due CI failure: - // https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430 - return 0; - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - initPython(argc, argv); - - gNumDevices = hl_get_device_count(); - FLAGS_num_passes = 1; // train one pass - FLAGS_saving_period = 100000; // do not save parameter - - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp deleted file mode 100644 index 1c9b4002a34ca5a9b668be69bd0ad392eb763803..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CompareTwoNets.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include - -#include "paddle/trainer/Trainer.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); - -DECLARE_bool(local); -DECLARE_bool(use_gpu); - -DECLARE_string(config); -DECLARE_string(nics); - -DEFINE_bool(need_high_accuracy, - false, - "whether need to run in double accuracy"); -DEFINE_double( - max_diff_ratio, - 0.0f, - "max diff ratio allowed for outputs and parameters (value/gradient)"); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_int32(seed); - -static const string& config_file_a = "gserver/tests/sequence_recurrent.py"; -static const string& config_file_b = - "gserver/tests/sequence_recurrent_group.py"; - -struct ComData { - vector outArgs; - vector parameters; -}; - -void calcGradient(ComData& data, const string configFile) { - FLAGS_config = configFile; - - FLAGS_local = true; - FLAGS_use_gpu = false; - - FLAGS_nics = ""; - - *ThreadLocalRand::getSeed() = FLAGS_seed; - srand(FLAGS_seed); - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig(), false); - - data.parameters = trainer.getGradientMachine()->getParameters(); - - DataBatch dataBatch; - int32_t batchSize = trainer.getConfig().opt_config().batch_size(); - - trainer.getDataProvider()->reset(); - trainer.getDataProvider()->setSkipShuffle(); - trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); - - CHECK(dataBatch.getSize()) << "No data from data provider"; - vector& inArgs = dataBatch.getStreams(); - - trainer.getGradientMachine()->start(); - trainer.getGradientMachine()->forwardBackward( - inArgs, &data.outArgs, PASS_TRAIN); - - trainer.getGradientMachine()->finish(); -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - size_t width = 1) { - int nNum = 0; - real maxVal = 0; - for (size_t i = 0; i < len; ++i) { - maxVal = std::max(maxVal, std::max(A[i], B[i])); - } - real maxDiff = 0; - for (size_t i = 0; i < len; ++i) { - real diff = fabs(A[i] - B[i]); - maxDiff = std::max(maxDiff, diff); - if (diff > maxVal * FLAGS_max_diff_ratio) { - nNum++; - VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << " " - << desB << " : " << B[i] << " diff=" << diff; - } - } - EXPECT_EQ(0, nNum); - LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n"; -} - -void compareGradient(ComData& comDataA, ComData& comDataB) { - vector outArgsA = comDataA.outArgs; - vector outArgsB = comDataB.outArgs; - - for (size_t i = 0; i < outArgsA.size(); ++i) { - CpuMatrix matA(outArgsA[i].value->getHeight(), - outArgsA[i].value->getWidth()); - CpuMatrix matB(outArgsB[i].value->getHeight(), - outArgsB[i].value->getWidth()); - - matA.copyFrom(*outArgsA[i].value); - matB.copyFrom(*outArgsB[i].value); - - LOG(INFO) << "\n--------------------------------" - << " Check Network Output_" << i << ":" - << " -------------------------------------\n"; - checkBuffer(matA.getData(), - "network A output", - matB.getData(), - "network B output", - matA.getElementCnt(), - matA.getWidth()); - } - - vector& parametersA = comDataA.parameters; - vector& parametersB = comDataB.parameters; - - LOG(INFO) << "\n\n--------------------------------" - << " Check Gradient Machine Parameters:" - << " -------------------------------------\n"; - for (size_t i = 0; i < parametersA.size(); ++i) { - ParameterPtr parameterA, parameterB; - parameterA = parametersA[i]; - parameterB = parametersB[i]; - - CpuVector paraA(parameterA->getSize()); - CpuVector paraB(parameterB->getSize()); - 
paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); - paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); - - LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() - << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), - "Network A", - paraB.getData(), - "Network B", - paraA.getSize()); - - CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); - CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT)); - - LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() - << " ; size : " << gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), - "Network A", - gradB.getData(), - "Network B", - gradA.getSize()); - } -} - -TEST(Trainer, create) { - ComData dataA; - calcGradient(dataA, config_file_a); - LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; - - ComData dataB; - calcGradient(dataB, config_file_b); - LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; - - compareGradient(dataA, dataB); -} - -int main(int argc, char** argv) { - FLAGS_thread_local_rand_use_global_seed = true; - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - -#ifndef PADDLE_TYPE_DOUBLE - if (FLAGS_need_high_accuracy) { - LOG(INFO) << "skip test due to it's need high accuracy"; - return 0; - } - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 1e-5; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in low accuracy mode"; - } -#else - if (FLAGS_max_diff_ratio == 0.0f) { - FLAGS_max_diff_ratio = 1e-10; - LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio - << " in high accuracy mode"; - } -#endif - - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp deleted file mode 100644 index 2e394a74b7d53fc53727d817c06479d545ade65d..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ConvTrans.cpp +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -// Test that the convTrans forward is the same as conv backward -TEST(Layer, convTransLayerFwd) { - // Setting up conv-trans layer - TestConfig configt; - configt.biasSize = 3; - configt.layerConfig.set_type("exconvt"); - configt.layerConfig.set_num_filters(3); - configt.layerConfig.set_partial_sum(1); - configt.layerConfig.set_shared_biases(true); - - configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); - LayerInputConfig* input = configt.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(16); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(3 / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - configt.layerConfig.set_size(conv->img_size() * conv->img_size() * - configt.layerConfig.num_filters()); - configt.layerConfig.set_name("convTrans"); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false); - // test layer initialize - std::vector parameters; - LayerPtr convtLayer; - initTestLayer(configt, &layerMap, ¶meters, &convtLayer); - convtLayer->getBiasParameter()->zeroMem(); - convtLayer->forward(PASS_GC); - - // Setting up conv-layer config - TestConfig config; - config.biasSize = 16; - config.layerConfig.set_type("exconv"); - config.layerConfig.set_num_filters(16); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); - input = config.layerConfig.add_inputs(); - conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(3); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); - config.layerConfig.set_name("conv"); - - // data layer initialize - std::vector dataLayers2; - LayerMap layerMap2; - vector datas2; - initDataLayer( - config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false); - // test layer initialize - std::vector parameters2; - LayerPtr convLayer; - initTestLayer(config, &layerMap2, ¶meters2, &convLayer); - - // Sync convLayer and convtLayer parameter - convLayer->getBiasParameter()->zeroMem(); - convLayer->getParameters()[0] - ->getBuf(PARAMETER_VALUE) - ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); - - // Set convLayer outputGrad as 
convTransLayer input value - convLayer->forward(PASS_GC); - convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); - - vector callbackFlags(parameters2.size(), 0); - auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; - convLayer->backward(callback); - - // Check that the convLayer backward is the same as convTransLayer forward - checkMatrixEqual(convtLayer->getOutputValue(), - dataLayers2[0]->getOutputGrad()); -} - -// Do one forward pass of convTrans layer and check to see if its output -// matches the given result -void doOneConvtTest(size_t imgSize, - size_t output_x, - size_t stride, - size_t padding, - size_t filter_size, - MatrixPtr& result) { - TestConfig configt; - configt.biasSize = 1; - configt.layerConfig.set_type("exconvt"); - configt.layerConfig.set_num_filters(1); - configt.layerConfig.set_partial_sum(1); - configt.layerConfig.set_shared_biases(true); - - configt.inputDefs.push_back( - {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size}); - LayerInputConfig* input = configt.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(filter_size); - conv->set_filter_size_y(filter_size); - conv->set_channels(1); - conv->set_padding(padding); - conv->set_padding_y(padding); - conv->set_stride(stride); - conv->set_stride_y(stride); - conv->set_groups(1); - conv->set_filter_channels(1); - conv->set_img_size(imgSize); - conv->set_output_x(output_x); - - configt.layerConfig.set_size(conv->img_size() * conv->img_size() * - configt.layerConfig.num_filters()); - configt.layerConfig.set_name("convTrans"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false); - dataLayers[0]->getOutputValue()->zeroMem(); - dataLayers[0]->getOutputValue()->add(1.0); - - // test layer initialize - std::vector parameters; - LayerPtr convtLayer; - initTestLayer(configt, &layerMap, ¶meters, &convtLayer); - convtLayer->getBiasParameter()->zeroMem(); - convtLayer->getParameters()[0]->zeroMem(); - convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); - convtLayer->forward(PASS_GC); - - checkMatrixEqual(convtLayer->getOutputValue(), result); -} - -TEST(Layer, convTransLayerFwd2) { - MatrixPtr result; - result = Matrix::create(1, 5 * 5, false, false); - result->zeroMem(); - result->add(1.0); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 1, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 5, - result); - - real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, - 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; - result->setData(resultData); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 4, - result); - - real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, - 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; - result->setData(resultData2); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 2, - /* padding */ 1, - /* filter_size */ 5, - result); - - real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, - 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; - result->setData(resultData3); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 2, - /* padding */ 0, - /* filter_size */ 3, - result); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git 
a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp deleted file mode 100644 index ba820d9a2acabf95ff816705e4df124bb95da077..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -// Do one forward pass of ConvLayer using either exconv or cudnn_conv -MatrixPtr doOneConvTest(size_t imgSize, - size_t output_x, - size_t stride, - size_t padding, - size_t filter_size, - size_t channel, - size_t numfilters, - size_t groups, - MatrixPtr& inputData, - real* param, - bool useGpu, - bool isDeconv = false) { - TestConfig config; - config.biasSize = numfilters; - string layerType; - if (useGpu) { - layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv"; - } else { - layerType = (isDeconv) ? 
"exconvt" : "exconv"; - } - config.layerConfig.set_type(layerType); - config.layerConfig.set_num_filters(numfilters); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - size_t weightSize = channel * filter_size * filter_size * - config.layerConfig.num_filters() / groups; - if (isDeconv) { - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize}); - config.layerConfig.set_size(imgSize * imgSize * - config.layerConfig.num_filters()); - } else { - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); - config.layerConfig.set_size(output_x * output_x * - config.layerConfig.num_filters()); - } - - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(filter_size); - conv->set_filter_size_y(filter_size); - conv->set_channels(channel); - conv->set_padding(padding); - conv->set_padding_y(padding); - conv->set_stride(stride); - conv->set_stride_y(stride); - conv->set_groups(groups); - conv->set_img_size(imgSize); - conv->set_output_x(output_x); - - if (isDeconv) { - conv->set_filter_channels(numfilters / groups); - } else { - conv->set_filter_channels(channel / groups); - } - - config.layerConfig.set_name("conv"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu); - dataLayers[0]->getOutputValue()->zeroMem(); - dataLayers[0]->getOutputValue()->copyFrom(*inputData); - - // test layer initialize - std::vector parameters; - LayerPtr convLayer; - initTestLayer(config, &layerMap, ¶meters, &convLayer); - convLayer->getBiasParameter()->zeroMem(); - convLayer->getParameters()[0]->zeroMem(); - convLayer->getParameters()[0] - ->getBuf(PARAMETER_VALUE) - ->copyFrom(param, weightSize); - convLayer->forward(PASS_GC); - - return convLayer->getOutputValue(); -} - -TEST(Layer, convParaUnified) { -#ifdef PADDLE_WITH_CUDA - MatrixPtr input, resultCpu, resultGpu; - - /// TEST1 for conv /// - input = Matrix::create(1, 4 * 4, false, false); - real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; - real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - - input->setData(inputData); - - resultCpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ false); - - resultGpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST1 for deconv /// - input = Matrix::create(1, 2 * 2, false, false); - real inputDataT[] = {1, 2, 3, 4}; - input->setData(inputDataT); - - resultCpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ false, - /*isDeconv*/ true); - - resultGpu = doOneConvTest(/* imgSize */ 4, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 3, - /*channel*/ 1, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param, - /*useGpu*/ true, - /*isDeconv*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST2 for conv /// - input = Matrix::create(1, 3 * 3 * 2, false, false); 
- real inputData2[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1}; - - input->setData(inputData2); - - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ false); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST3 for conv /// - real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; - - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ false); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST2 for deconv /// - input = Matrix::create(1, 2 * 2 * 2, false, false); - real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8}; - input->setData(inputData2T); - - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ false, - /*isDeconv*/ true); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 1, - input, - param2, - /*useGpu*/ true, - /*isDeconv*/ true); - checkMatrixEqual(resultCpu, resultGpu); - - /// TEST3 for deconv /// - resultCpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ false, - /*isDeconv*/ true); - - resultGpu = doOneConvTest(/* imgSize */ 3, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 2, - /*channel*/ 2, - /*numfilters*/ 2, - /*groups*/ 2, - input, - param3, - /*useGpu*/ true, - /*isDeconv*/ true); - checkMatrixEqual(resultCpu, resultGpu); -#endif -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp deleted file mode 100644 index 0041ed30939d1a6111a2db753da6172bb65e374b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp +++ /dev/null @@ -1,352 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -const size_t MAX_SEQ_NUM = 23; -const size_t MAX_SEQ_LEN = 50; -const size_t MAX_BEAM_SIZE = 27; - -const size_t SEED = (size_t)(time(NULL)); - -struct SingleBeamExpansion { - vector seqStartPos; - vector subSeqStartPos; - vector candidateScores; - - // TODO(caoying): store this into Argument.ids - vector selectedIndices; - - vector groundTruth; - vector inBeam; - vector rowIdxInBeam; - vector colIdxInBeam; - - void resetGroundTruth(size_t n) { - groundTruth.clear(); - groundTruth.resize(n, -1); - - inBeam.clear(); - inBeam.resize(n, 0); - - rowIdxInBeam.clear(); - rowIdxInBeam.resize(n, -1); - - colIdxInBeam.clear(); - colIdxInBeam.resize(n, -1); - } -}; - -inline float randFloat() { - return static_cast(rand()) / static_cast(RAND_MAX); -} - -void genRand(real* numbers, size_t n) { - default_random_engine generator; - uniform_real_distribution distribution(0.0, 1.0); - for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator); -} - -vector randSampling(real range, int n) { - CHECK_GE(range, n); - vector num(range); - iota(begin(num), end(num), 0.); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); - sort(begin(num), end(num)); - return num; -} - -void genCandidateScores(bool hasSubseq, - size_t beamSize, - SingleBeamExpansion& prevBeam, - SingleBeamExpansion& curBeam) { - vector& seqStartPos = curBeam.seqStartPos; - seqStartPos.resize(1, 0); - vector& subSeqStartPos = curBeam.subSeqStartPos; - subSeqStartPos.resize(1, 0); - - srand(SEED); - if (prevBeam.selectedIndices.size()) { - if (prevBeam.subSeqStartPos.size() > 1) { - int seqIdx = 1; - // samples in previous beam are nested sequences. - for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) { - for (size_t j = 0; j < beamSize; ++j) { - if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break; - subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + - subSeqStartPos.back()); - } - if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) { - seqStartPos.push_back(subSeqStartPos.back()); - seqIdx++; - } - } - } else { - for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) { - if (i && i % beamSize == 0) { - seqStartPos.push_back(subSeqStartPos.back()); - if (i == prevBeam.selectedIndices.size()) break; - } - if (prevBeam.selectedIndices[i] == -1.) continue; - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - } - } - } else { - // the first beam expansion - int seqNum = 1 + (rand() % MAX_SEQ_NUM); - for (int i = 0; i < seqNum; ++i) { - if (hasSubseq) { - for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j) - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - seqStartPos.push_back(subSeqStartPos.back()); - } else { - seqStartPos.push_back(seqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - } - } - } - - size_t totalSeqNum = hasSubseq ? 
subSeqStartPos.back() : seqStartPos.back(); - curBeam.candidateScores.resize(totalSeqNum, 0.); - genRand(curBeam.candidateScores.data(), totalSeqNum); -} - -void genSelectedIndices(size_t beamSize, - vector& seqStartPos, - vector& selectedIndices) { - size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1); - selectedIndices.resize(selectedIdsCount, -1.); - - for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { - int seqLen = seqStartPos[i + 1] - seqStartPos[i]; - int n = min(seqLen, static_cast(beamSize)); - vector ids = randSampling(seqLen, n); - memcpy(selectedIndices.data() + i * beamSize, - ids.data(), - sizeof(real) * ids.size()); - } -} - -void genGroundTruth(vector& beamExpansions, - size_t beamSize) { - SingleBeamExpansion& beam = beamExpansions[1]; - size_t seqNum = beam.seqStartPos.size() - 1; - for (size_t i = 2; i < beamExpansions.size(); ++i) - CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1); - - srand(SEED); - - // initialize the first beam. - beam.resetGroundTruth(seqNum); - for (size_t i = 0; i < seqNum; ++i) { - if (randFloat() > 0.5) { - /* - * force the randomly generated label falls in the beam by chance 0.5. - * otherwise, when sequence length is relatively long and beam size is - * relatively small, the gold sequences falls off the beam at in the - * first search. - */ - real* begPos = beam.selectedIndices.data() + i * beamSize; - beam.colIdxInBeam[i] = - rand() % count_if(begPos, begPos + beamSize, [](const real& val) { - return val != -1.; - }); - beam.groundTruth[i] = - beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]]; - beam.inBeam[i] = 1; - } else { - int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]); - beam.groundTruth[i] = label; - - real* begPos = beam.selectedIndices.data() + i * beamSize; - real* endPos = begPos + beamSize; - real* lblPos = find(begPos, endPos, real(label)); - if (lblPos != endPos) { - beam.inBeam[i] = 1; - beam.colIdxInBeam[i] = lblPos - begPos; - } - } - beam.rowIdxInBeam[i] = i; - } - - // iterate over each beam expansions - for (size_t i = 2; i < beamExpansions.size(); ++i) { - SingleBeamExpansion& curBeam = beamExpansions[i]; - SingleBeamExpansion& prevBeam = beamExpansions[i - 1]; - curBeam.resetGroundTruth(seqNum); - - // iterate over each sequence - for (size_t j = 0; j < seqNum; ++j) { - if (!prevBeam.inBeam[j]) continue; - - // gold sequence falls in the beam in previous search. - real* begPos = prevBeam.selectedIndices.data(); - int offset = - prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j]; - curBeam.rowIdxInBeam[j] = count_if( - begPos, begPos + offset, [](const real& val) { return val != -1.; }); - - if (randFloat() > 0.5) { - // force the randomly generated label falls in the beam by chance 0.5. 
-
-        real* start =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
-                  return val != -1.;
-                });
-        curBeam.colIdxInBeam[j] = n;
-        curBeam.groundTruth[j] = *(start + n);
-        curBeam.inBeam[j] = 1;
-      } else {
-        CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1,
-                 curBeam.subSeqStartPos.size() - 1);
-        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
-        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
-        CHECK_GT(size_t(end), size_t(start));
-        int label = rand() % (end - start);
-
-        curBeam.groundTruth[j] = label;
-        real* findBeg =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        real* lblPos =
-            find(findBeg, findBeg + beamSize, static_cast<real>(label));
-        if (lblPos != (findBeg + beamSize)) {
-          curBeam.inBeam[j] = 1;
-          curBeam.colIdxInBeam[j] = lblPos - findBeg;
-        }
-      }
-    }
-  }
-}
-
-void genOneBeam(size_t beamSize,
-                bool hasSubseq,
-                SingleBeamExpansion& prevBeam,
-                SingleBeamExpansion& curBeam) {
-  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
-  genSelectedIndices(beamSize,
-                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
-                     curBeam.selectedIndices);
-}
-
-void genRandomBeamExpansion(size_t expansionCount,
-                            size_t beamSize,
-                            vector<SingleBeamExpansion>& beamExpansions) {
-  beamExpansions.clear();
-  beamExpansions.resize(expansionCount + 1);
-
-  // beamExpansions[0] is reserved.
-  for (size_t i = 1; i <= expansionCount; ++i)
-    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
-  genGroundTruth(beamExpansions, beamSize);
-}
-
-void testCrossEntropyOverBeam(bool useGpu,
-                              size_t beamSize,
-                              vector<SingleBeamExpansion>& beams) {
-  TestConfig config;
-  config.layerConfig.set_type("cross_entropy_over_beam");
-
-  size_t seqNum = 0;
-  for (size_t i = 1; i < beams.size(); ++i) {
-    const SingleBeamExpansion& beam = beams[i];
-    // create scores for all the candidates
-    MatrixPtr candidateScorePtr =
-        Matrix::create(beam.candidateScores.size(), 1, false, false);
-    candidateScorePtr->copyFrom(beam.candidateScores.data(),
-                                beam.candidateScores.size());
-
-    ostringstream paramName;
-    paramName << "candidate_scores_" << i;
-
-    if (beam.subSeqStartPos.size() > 1) {
-      seqNum = beam.subSeqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos,
-                                  beam.subSeqStartPos});
-    } else {
-      seqNum = beam.seqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos});
-    }
-    config.layerConfig.add_inputs();
-
-    // create indices for the selected candidates
-    MatrixPtr selectedCandidates =
-        Matrix::create(seqNum, beamSize, false, false);
-    selectedCandidates->copyFrom(beam.selectedIndices.data(),
-                                 beam.selectedIndices.size());
-    paramName.clear();
-    paramName << "selected_candidates_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
-    config.layerConfig.add_inputs();
-
-    // create the ground truth
-    paramName.clear();
-    paramName << "label_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
-    config.layerConfig.add_inputs();
-  }
-
-  testLayerGrad(
-      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
-}
-
-TEST(Layer, CrossEntropyOverBeam) {
-  LOG(INFO) << "SEED = " << SEED;
-  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
-  LOG(INFO) << "beamSize = " << beamSize;
-
-  // TODO(caoying): test with random beam expansions.
-  const size_t expansionCount = 3;
-  vector<SingleBeamExpansion> beams;
-  genRandomBeamExpansion(expansionCount, beamSize, beams);
-
-  for (bool useGpu : {false, true})
-    testCrossEntropyOverBeam(useGpu, beamSize, beams);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(SEED);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
deleted file mode 100644
index 4a8843f3affe7b1d4f3172be733aefc085c9e7a5..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include
-#include
-#include "ModelConfig.pb.h"
-#include "paddle/testing/TestUtil.h"
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-};
-
-struct TestConfig {
-  EvaluatorConfig evaluatorConfig;
-  std::vector<InputDef> inputDefs;
-  bool testAccumulate;
-  TestConfig() : testAccumulate(true) {}
-};
-
-void testEvaluator(TestConfig testConf,
-                   string testEvaluatorName,
-                   size_t batchSize,
-                   bool useGpu) {
-#ifndef PADDLE_WITH_CUDA
-  if (useGpu) return;
-#endif
-  FLAGS_use_gpu = useGpu;
-  testConf.evaluatorConfig.set_name(testEvaluatorName);
-  LOG(INFO) << " evaluator_type=" << testConf.evaluatorConfig.type()
-            << " useGpu=" << useGpu;
-
-  std::vector<Argument> arguments;
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    Argument data;
-    size_t dim = testConf.inputDefs[i].dim;
-    switch (testConf.inputDefs[i].inputType) {
-      case INPUT_DATA:
-      case INPUT_SEQUENCE_DATA:
-      case INPUT_DATA_TARGET:
-        data.value = Matrix::create(batchSize, dim, false, useGpu);
-        data.value->randomizeUniform();
-
-        // make sure output > 0 && output < 1
-        data.value->add(-0.5);
-        data.value->sigmoid(*data.value);
-        break;
-      case INPUT_LABEL:
-      case INPUT_SEQUENCE_LABEL:
-        data.ids = VectorT<int>::create(batchSize, useGpu);
-        data.ids->rand(dim);  // now rand number can be 0 to inputDefs[i].dim.
- break; - case INPUT_SPARSE_NON_VALUE_DATA: - data.value = makeRandomSparseMatrix(batchSize, - dim, - /* withValue= */ false, - useGpu); - break; - default: - LOG(FATAL) << " unknown inputType "; - return; - } - - ICpuGpuVectorPtr sequenceStartPositions; - if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA || - testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) { - if (!sequenceStartPositions) { - generateSequenceStartPositions(batchSize, sequenceStartPositions); - } - data.sequenceStartPositions = sequenceStartPositions; - } - - arguments.push_back(data); - } - - Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig); - double totalScore = 0.0; - testEvaluator->start(); - totalScore += testEvaluator->evalImp(arguments); - testEvaluator->updateSamplesNum(arguments); - testEvaluator->finish(); - LOG(INFO) << *testEvaluator; - - std::vector names; - testEvaluator->getNames(&names); - paddle::Error err; - for (auto& name : names) { - auto value = testEvaluator->getValue(name, &err); - ASSERT_TRUE(err.isOK()); - LOG(INFO) << name << " " << value; - auto tp = testEvaluator->getType(name, &err); - ASSERT_TRUE(err.isOK()); - ASSERT_EQ(testConf.evaluatorConfig.type(), tp); - } - - double totalScore2 = 0.0; - if (testConf.testAccumulate) { - testEvaluator->start(); - totalScore2 += testEvaluator->evalImp(arguments); - testEvaluator->finish(); - EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5); - } -} - -void testEvaluatorAll(TestConfig testConf, - string testEvaluatorName, - size_t batchSize) { - testEvaluator(testConf, testEvaluatorName, batchSize, true); - testEvaluator(testConf, testEvaluatorName, batchSize, false); -} - -TEST(Evaluator, detection_map) { - TestConfig config; - config.evaluatorConfig.set_type("detection_map"); - config.evaluatorConfig.set_overlap_threshold(0.5); - config.evaluatorConfig.set_background_id(0); - config.evaluatorConfig.set_ap_type("Integral"); - config.evaluatorConfig.set_evaluate_difficult(0); - - config.inputDefs.push_back({INPUT_DATA, "output", 7}); - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6}); - config.evaluatorConfig.set_evaluate_difficult(false); - testEvaluatorAll(config, "detection_map", 100); - - config.evaluatorConfig.set_evaluate_difficult(true); - testEvaluatorAll(config, "detection_map", 100); -} - -TEST(Evaluator, classification_error) { - TestConfig config; - config.evaluatorConfig.set_type("classification_error"); - config.evaluatorConfig.set_top_k(5); - - config.inputDefs.push_back({INPUT_DATA, "output", 50}); - config.inputDefs.push_back({INPUT_LABEL, "label", 50}); - testEvaluatorAll(config, "classification_error", 100); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - testEvaluatorAll(config, "classification_error_weight", 100); - - // multi binary labels - config.inputDefs.clear(); - config.inputDefs.push_back({INPUT_DATA, "output", 100}); - config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 100}); - // Not support GPU - testEvaluator(config, "classification_error_multi_binary_label", 50, false); - - config.evaluatorConfig.set_classification_threshold(0.4); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - // Not support GPU - testEvaluator( - config, "classification_error_weight_multi_binary_label", 50, false); -} - -TEST(Evaluator, sum) { - TestConfig config; - config.evaluatorConfig.set_type("sum"); - - // sum of output - config.inputDefs.push_back({INPUT_DATA, "output", 10}); - testEvaluatorAll(config, "sum_output", 200); - 
config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - testEvaluatorAll(config, "sum_output_weight", 200); - - // sum of label - config.inputDefs.clear(); - config.inputDefs.push_back({INPUT_LABEL, "label", 10}); - testEvaluatorAll(config, "sum_label", 200); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - testEvaluatorAll(config, "sum_label_weight", 200); -} - -TEST(Evaluator, last_column_sum) { - TestConfig config; - config.evaluatorConfig.set_type("last-column-sum"); - - config.inputDefs.push_back({INPUT_DATA, "output", 50}); - testEvaluatorAll(config, "last-column-sum", 200); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - testEvaluatorAll(config, "last-column-sum_weight", 200); -} - -TEST(Evaluator, last_column_auc) { - TestConfig config; - config.evaluatorConfig.set_type("last-column-auc"); - - config.inputDefs.push_back({INPUT_DATA, "output", 2}); - config.inputDefs.push_back({INPUT_LABEL, "label", 2}); - testEvaluatorAll(config, "last-column-auc", 500); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - testEvaluatorAll(config, "last-column-auc_weight", 200); -} - -TEST(Evaluator, precision_recall) { - TestConfig config; - config.evaluatorConfig.set_type("precision_recall"); - - config.inputDefs.push_back({INPUT_DATA, "output", 10}); - config.inputDefs.push_back({INPUT_LABEL, "label", 10}); - testEvaluatorAll(config, "precision_recall", 200); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - testEvaluatorAll(config, "precision_recall_weight", 200); - - LOG(INFO) << "positive_label = 5"; - config.evaluatorConfig.set_positive_label(5); - testEvaluatorAll(config, "precision_recall_weight", 200); - - // multi binary labels - config.inputDefs.clear(); - config.evaluatorConfig.set_positive_label(-1); - config.inputDefs.push_back({INPUT_DATA, "output", 10}); - config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 10}); - // Not support GPU - testEvaluator(config, "precision_recall_multi_binary_label", 100, false); - - LOG(INFO) << "classification_threshold = 0.4"; - config.evaluatorConfig.set_classification_threshold(0.4); - config.inputDefs.push_back({INPUT_DATA, "weight", 1}); - // Not support GPU - testEvaluator( - config, "precision_recall_weight_multi_binary_label", 100, false); -} - -TEST(Evaluator, ctc_error_evaluator) { - TestConfig config; - config.evaluatorConfig.set_type("ctc_edit_distance"); - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32}); - config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1}); - testEvaluatorAll(config, "ctc_error_evaluator", 100); -} - -int main(int argc, char** argv) { - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp deleted file mode 100644 index 168ffbdac8cd6fb0ee4fa62e3766905c30d1844b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_KmaxSeqScore.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/utils/GlobalConstants.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -vector randSampling(int range, int n) { - CHECK_GE(range, n); - vector num(range); - iota(begin(num), end(num), 0); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); - return num; -} - -void genRandomSeqInfo(vector& seqStartPosition, - vector& subSeqStartPosition) { - const int maxSeqNum = 100; - // generate random start position information - int seqNum = 1 + (rand() % maxSeqNum); - seqStartPosition.resize(seqNum + 1, 0); - subSeqStartPosition.resize(1, 0); - - for (int i = 0; i < seqNum; ++i) { - int subSeqLen = 1 + (rand() % maxSeqNum); - for (int j = 0; j < subSeqLen; ++j) - subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen); - seqStartPosition[i + 1] = subSeqStartPosition.back(); - } -} - -void genRandomGroundTruth(real* values, - vector>& groundTruth, - vector& startPos, - size_t beamSize) { - groundTruth.resize(startPos.size() - 1, vector(beamSize, -1)); - for (size_t i = 0; i < startPos.size() - 1; ++i) { - int seqLen = startPos[i + 1] - startPos[i]; - vector pos = - randSampling(seqLen, min(static_cast(beamSize), seqLen)); - for (size_t j = 0; j < pos.size(); ++j) { - groundTruth[i][j] = pos[j]; - values[startPos[i] + pos[j]] = 1.; - } - } -} - -void checkLayerOut(vector> groundTruth, - real* layerOut, - size_t beamSize) { - for (size_t i = 0; i < groundTruth.size(); ++i) { - int begPos = i * beamSize; - vector tmp(layerOut + begPos, layerOut + begPos + beamSize); - sort(begin(tmp), end(tmp)); - sort(begin(groundTruth[i]), end(groundTruth[i])); - for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]); - } -} - -TEST(Layer, kmaxSeqScoreLayer) { - const size_t maxBeamSize = 100; - size_t beamSize = 1 + (rand() % maxBeamSize); - - vector seqStartPosition; - vector subSeqStartPosition; - genRandomSeqInfo(seqStartPosition, subSeqStartPosition); - MatrixPtr inValue = - Matrix::create(subSeqStartPosition.back(), 1, false, false); - - std::vector mode = {false}; -#ifdef PADDLE_WITH_CUDA - mode.push_back(true); -#endif - - for (auto hasSubseq : {false, true}) { - vector> groundTruth; - inValue->randomizeUniform(); - genRandomGroundTruth(inValue->getData(), - groundTruth, - hasSubseq ? 
subSeqStartPosition : seqStartPosition, - beamSize); - - for (auto useGpu : mode) { - TestConfig config; - config.layerConfig.set_type("kmax_seq_score"); - config.layerConfig.set_beam_size(beamSize); - - if (hasSubseq) { - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - "scores", - inValue, - seqStartPosition, - subSeqStartPosition}); - } else { - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition}); - } - config.layerConfig.add_inputs(); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer( - config, - &dataLayers, - &datas, - &layerMap, - "kmax_seq_score", - 100 /* actually this parameter is unused in self-defined input*/, - false, - useGpu); - // test layer initialize - std::vector parameters; - LayerPtr kmaxSeqScoreLayer; - FLAGS_use_gpu = useGpu; - initTestLayer(config, &layerMap, ¶meters, &kmaxSeqScoreLayer); - kmaxSeqScoreLayer->forward(PASS_TRAIN); - - const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue(); - CHECK_EQ(outValue->getHeight(), - hasSubseq ? subSeqStartPosition.size() - 1 - : seqStartPosition.size() - 1); - CHECK_EQ(outValue->getWidth(), beamSize); - checkLayerOut(groundTruth, outValue->getData(), beamSize); - } - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand((size_t)(time(NULL))); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp deleted file mode 100644 index 1254d580505512dc8fd7e34a053a7538832d271f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ /dev/null @@ -1,2532 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_CUDA -#include -#endif -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/math/MathUtils.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(prev_batch_state); - -TEST(Operator, dot_mul) { - TestConfig config; - config.layerConfig.set_size(10); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); - operatorConf.set_type("dot_mul"); - operatorConf.set_dotmul_scale(-1); - - testOperatorGrad(config, operatorConf, 100, false, false); -} - -TEST(Projection, context) { - for (auto contextStart : {-5, -3, -1, 0, 3}) { - for (auto contextLength : {1, 2, 5, 7}) { - for (auto batchSize : {1, 2, 5, 20}) { - for (auto trainablePadding : {false, true}) { - LOG(INFO) << " contextStart=" << contextStart - << " contextLength=" << contextLength - << " batchSize=" << batchSize - << " trainablePadding=" << trainablePadding; - ProjectionConfig conf; - conf.set_type("context"); - conf.set_input_size(10); - conf.set_context_start(contextStart); - conf.set_context_length(contextLength); - conf.set_trainable_padding(trainablePadding); - conf.set_output_size(conf.context_length() * conf.input_size()); - int pad = - std::max(0, -conf.context_start()) + - std::max(0, conf.context_start() + conf.context_length() - 1); - for (auto useGpu : {false, true}) { - testProjectionGrad( - conf, - INPUT_SEQUENCE_DATA, - trainablePadding ? 
conf.input_size() * pad : 0, - batchSize, - useGpu, - contextStart + contextLength <= 1); // = testState - } - } - } - } - } -} - -TEST(Projection, trans_fc) { - ProjectionConfig conf; - conf.set_type("trans_fc"); - conf.set_input_size(50); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 1000, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, fc) { - ProjectionConfig conf; - conf.set_type("fc"); - conf.set_input_size(10); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 200, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, dot_mul) { - ProjectionConfig conf; - conf.set_type("dot_mul"); - conf.set_input_size(20); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 20, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, table) { - ProjectionConfig conf; - conf.set_type("table"); - conf.set_input_size(10); - conf.set_output_size(20); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_LABEL, - /* parameterSize */ 200, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, identity) { - ProjectionConfig conf; - conf.set_type("identity"); - conf.set_input_size(10); - conf.set_output_size(10); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 0, - /* batchSize */ 100, - useGpu); - } -} - -TEST(Projection, slice) { - ProjectionConfig conf; - conf.set_type("slice"); - conf.set_input_size(100); - SliceConfig& slice1 = *conf.add_slices(); - slice1.set_start(10); - slice1.set_end(20); - SliceConfig& slice2 = *conf.add_slices(); - slice2.set_start(50); - slice2.set_end(70); - conf.set_output_size(30); - for (auto useGpu : {false, true}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 0, - /* batchSize */ 10, - useGpu); - } -} - -TEST(Projection, scaling) { - ProjectionConfig conf; - conf.set_type("scaling"); - conf.set_input_size(10); - conf.set_output_size(10); - for (auto useGpu : {false}) { - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ 1, - /* batchSize */ 100, - useGpu); - } -} - -void testProjectionConv(size_t groups, bool isDeconv) { - const int NUM_FILTERS = 18; - const int FILTER_SIZE = 2; - const int FILTER_SIZE_Y = 2; - const int CHANNELS = 3; - const int IMAGE_SIZE = 16; - -#if CUDNN_VERSION >= 6000 - const int DILATION = 2; -#else - const int DILATION = 1; -#endif - - ProjectionConfig conf; - if (isDeconv) { - conf.set_type("convt"); - } else { - conf.set_type("conv"); - } - conf.set_num_filters(NUM_FILTERS); - - ConvConfig* conv = conf.mutable_conv_conf(); - conv->set_filter_size(FILTER_SIZE); - conv->set_filter_size_y(FILTER_SIZE_Y); - conv->set_channels(CHANNELS); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_dilation(DILATION); - conv->set_dilation_y(DILATION); - conv->set_groups(groups); - if (isDeconv) { - conv->set_filter_channels(NUM_FILTERS / conv->groups()); - } else { - conv->set_filter_channels(conv->channels() / conv->groups()); - } - conv->set_img_size(IMAGE_SIZE); - int output_x = outputSize(conv->img_size(), - (conv->filter_size() - 1) * DILATION + 1, - conv->padding(), - conv->stride(), - /* caffeMode */ true); - int output_y = outputSize(conv->img_size(), - (conv->filter_size_y() - 1) * DILATION + 1, - conv->padding_y(), - 
conv->stride_y(), - /* caffeMode */ true); - conv->set_output_x(output_x); - conv->set_output_y(output_y); - LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x - << "; output_y: " << output_y; - if (isDeconv) { - int deconv_image_x = imageSize(output_x, - (conv->filter_size() - 1) * DILATION + 1, - conv->padding(), - conv->stride(), - /* caffeMode */ true); - int deconv_image_y = imageSize(output_y, - (conv->filter_size_y() - 1) * DILATION + 1, - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true); - - LOG(INFO) << " deconv_image_x: " << deconv_image_x - << "; deconv_image_y: " << deconv_image_y; - conf.set_input_size(output_x * output_y * CHANNELS); - conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); - } else { - conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); - conf.set_output_size(output_x * output_y * NUM_FILTERS); - } - - testProjectionGrad(conf, - INPUT_DATA, - /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * - FILTER_SIZE_Y / groups, - /* batchSize */ 100, - true, - false, - NUM_FILTERS, - true); -} - -#ifdef PADDLE_WITH_CUDA -TEST(Projection, conv) { - /// test ConvProjection - testProjectionConv(1, false); - testProjectionConv(3, false); - /// test ConvTransProjection - testProjectionConv(1, true); - testProjectionConv(3, true); -} -#endif - -TEST(Layer, BilinearInterpLayer) { - TestConfig config; - config.layerConfig.set_type("bilinear_interp"); - config.biasSize = 0; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); - - LayerInputConfig* input = config.layerConfig.add_inputs(); - BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); - ImageConfig* image = bilinear->mutable_image_conf(); - image->set_img_size(32); - image->set_img_size_y(32); - image->set_channels(4); - - for (auto useGpu : {false, true}) { - for (auto outSize : {32, 64}) { - bilinear->set_out_size_x(outSize); - bilinear->set_out_size_y(outSize); - testLayerGrad(config, "bilinear_interp", 10, false, useGpu); - } - } -} - -TEST(Layer, concat) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("concat"); - config.layerConfig.set_size(15); - config.layerConfig.set_active_type("sigmoid"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "concat", 100, false, useGpu); - } -} - -TEST(Layer, AddtoLayer) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("addto"); - config.layerConfig.set_size(10); - config.layerConfig.set_active_type("sigmoid"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "addto", 100, false, useGpu); - } -} - -TEST(Layer, CTCLayer) { - TestConfig config; - config.layerConfig.set_type("ctc"); - config.layerConfig.set_norm_by_times(false); - config.layerConfig.set_size(10); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "ctc", - 100, - /* trans */ false, /* useGpu */ - useGpu); - } -} - 
-TEST(Layer, cosSimLayer) { - TestConfig config; - config.layerConfig.set_type("cos"); - config.layerConfig.set_size(1); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "cos", 100, false, useGpu); - } -} - -TEST(Layer, CosSimVecMatLayer) { - TestConfig config; - config.layerConfig.set_type("cos_vm"); - config.layerConfig.set_size(5); // output size - config.layerConfig.set_cos_scale(2.0); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "cos_vm", 100, false, useGpu); - } -} - -void testDepthwiseConvLayer(const string& type, bool useGpu) { - TestConfig config; - config.biasSize = 32; - config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(32); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(3); - conv->set_channels(16); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(16); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(16); - conv->set_img_size_y(8); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - conv->set_output_y(outputSize(conv->img_size_y(), - conv->filter_size_y(), - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - config.layerConfig.num_filters()); - - testLayerGrad(config, "depthwise_conv", 100, false, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02); -} - -TEST(Layer, depthwiseConvLayer) { - // 'depthwise_conv' is a sepecial case of 'exconv' whose - // groups size equals to the input channels size. 
- testDepthwiseConvLayer("exconv", /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - testDepthwiseConvLayer("exconv", /* useGpu= */ true); -#endif -} - -void testConvLayer(const string& type, bool trans, bool useGpu) { - TestConfig config; - config.biasSize = 16; - config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(16); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - int dilation = 2; - if (type == "cudnn_conv") { -#if CUDNN_VERSION >= 6000 - dilation = 2; -#else - dilation = 1; -#endif - } - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(2); - conv->set_channels(3); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_dilation(dilation); - conv->set_dilation_y(dilation); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(16); - conv->set_img_size_y(16); - conv->set_output_x(outputSize(conv->img_size(), - (conv->filter_size() - 1) * dilation + 1, - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - conv->set_output_y(outputSize(conv->img_size_y(), - (conv->filter_size_y() - 1) * dilation + 1, - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - config.layerConfig.num_filters()); - - testLayerGrad(config, "conv", 100, trans, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02); -} - -TEST(Layer, convLayer) { - testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true); - testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true); -#endif -} - -void testConvTransLayer(const string& type, bool trans, bool useGpu) { - TestConfig config; - config.biasSize = 3; - config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(3); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(16); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(3 / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - - config.layerConfig.set_size(conv->img_size() * conv->img_size() * - config.layerConfig.num_filters()); - - testLayerGrad(config, "convTrans", 100, trans, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02); -} - -TEST(Layer, convTransLayer) { - for (auto useGpu : {false, true}) { - testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); - } -#ifdef PADDLE_WITH_CUDA - testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); -#endif -} - -TEST(Layer, blockExpandLayer) { - TestConfig config; - config.biasSize = 0; - 
config.layerConfig.set_type("blockexpand"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - BlockExpandConfig* blockExpand = input->mutable_block_expand_conf(); - blockExpand->set_img_size_x(64); - blockExpand->set_img_size_y(32); - blockExpand->set_channels(3); - blockExpand->set_padding_x(0); - blockExpand->set_padding_y(0); - blockExpand->set_block_x(4); - blockExpand->set_block_y(32); - blockExpand->set_stride_x(2); - blockExpand->set_stride_y(2); - blockExpand->set_output_x(outputSize(blockExpand->img_size_x(), - blockExpand->block_x(), - blockExpand->padding_x(), - blockExpand->stride_x(), - /* caffeMode */ false)); - blockExpand->set_output_y(outputSize(blockExpand->img_size_y(), - blockExpand->block_y(), - blockExpand->padding_y(), - blockExpand->stride_y(), - /* caffeMode */ false)); - config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() * - blockExpand->channels()); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "blockexpand", 100, false, useGpu); - } -} - -TEST(Layer, maxoutLayer) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("maxout"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - MaxOutConfig* maxout = input->mutable_maxout_conf(); - ImageConfig* image = maxout->mutable_image_conf(); - - image->set_img_size(32); - image->set_img_size_y(32); - image->set_channels(4); - maxout->set_groups(2); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "maxout", 10, false, useGpu); - } -} - -void testFcLayer(string format, size_t nnz) { - TestConfig config; - config.biasSize = 1024; - config.layerConfig.set_type("fc"); - config.layerConfig.set_size(1024); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_drop_rate(0.1); - - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)}); - config.layerConfig.add_inputs(); - - LOG(INFO) << config.inputDefs[0].sparse.sparse << " " - << config.inputDefs[0].sparse.format; - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "fc", - 100, - /* trans */ false, - useGpu, - /* weight */ true); - } -} - -TEST(Layer, fcLayer) { - testFcLayer("", 1024 * 1024 * 2); - testFcLayer("csc", 1024 * 10); - testFcLayer("csr", 1024 * 10); -} - -TEST(Layer, SelectiveFullyConnectedLayer) { - TestConfig config; - size_t nin = 16; - size_t nout = 256; - config.layerConfig.set_type("selective_fc"); - config.layerConfig.set_size(nout); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_has_selected_colums(true); - config.layerConfig.set_selective_fc_pass_generation(false); - config.biasSize = nout; - - config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back( - {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)}); - config.layerConfig.add_inputs(); - - testLayerGrad(config, - "selective_fc", - 100, - /* trans= */ false, - /* useGup= */ false, - false); -#ifdef PADDLE_WITH_CUDA - testLayerGrad(config, - "selective_fc", - 100, - /* trans= */ false, - /* useGup= */ true, - false); -#endif -} - -TEST(Layer, DataNormLayer) { - TestConfig config; - config.layerConfig.set_type("data_norm"); - config.layerConfig.set_size(20); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100}); - config.inputDefs.back().isStatic = true; 
- config.layerConfig.add_inputs(); - - for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) { - config.layerConfig.set_data_norm_strategy(strategy); - // The parameters are static, so not support GPU now - testLayerGrad(config, - "data_norm", - 200, - /* trans */ false, - /* useGpu */ false); - } -} - -TEST(Layer, hsigmoidLayer) { - TestConfig config; - config.layerConfig.set_type("hsigmoid"); - config.layerConfig.set_num_classes(5); - config.layerConfig.set_size(1); - config.biasSize = config.layerConfig.num_classes() - 1; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200}); - config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "hsigmoid", - 100, - /* trans */ false, - /* useGpu */ useGpu); - } -} - -TEST(Layer, multi_cross) { - TestConfig config; - config.layerConfig.set_type("multi-class-cross-entropy"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad( - config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu); - } -} - -TEST(Layer, multi_binary_label_sparse_mat) { - TestConfig config; - config.layerConfig.set_type("multi_binary_label_cross_entropy"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "multi_binary_label_cross_entropy", - 100, - /* trans */ false, - useGpu); - } -} - -TEST(layer, multi_binary_label_id) { - TestConfig config; - config.layerConfig.set_type("multi_binary_label_cross_entropy"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "multi_binary_label_cross_entropy", - 100, - /* trans */ false, - useGpu); - } -} - -TEST(Layer, multi_cross_with_selfnorm) { - TestConfig config; - config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm"); - config.layerConfig.set_softmax_selfnorm_alpha(0.1); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - // Not support GPU now - testLayerGrad(config, - "multi_class_cross_entropy_with_selfnorm", - 100, - /* trans */ false, - /* useGpu */ false); -} - -TEST(Layer, multi_cross_soft) { - TestConfig config; - config.layerConfig.set_type("soft_binary_class_cross_entropy"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "soft_binary_class_cross_entropy", - 100, - /* trans */ false, - useGpu); - } -} - -TEST(Layer, square_error) { - TestConfig config; - config.layerConfig.set_type("square_error"); - config.biasSize = 0; - - 
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu); - } -} - -TEST(Layer, sparse_square_error) { - TestConfig config; - config.layerConfig.set_type("square_error"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - // "GpuSparseMatrix" as label is not supported - testLayerGrad(config, - "square_error", - 100, - /* trans */ false, - /* useGpu */ false); -} - -TEST(Layer, sparse_float_square_error) { - TestConfig config; - config.layerConfig.set_type("square_error"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); - config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - // "GpuSparseMatrix" as label is not supported - testLayerGrad(config, - "square_error", - 100, - /* trans */ false, - /* useGpu */ false); -} - -TEST(Layer, square_error_weighted) { - TestConfig config; - config.layerConfig.set_type("square_error"); - config.biasSize = 0; - config.testAccumulate = false; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu); - } -} - -TEST(Layer, huber_regression_loss) { - TestConfig config; - config.layerConfig.set_type("huber_regression"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - for (auto delta : {1, 3, 5}) { - config.layerConfig.set_delta(delta); - testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu); - } - } -} - -TEST(Layer, huber_two_class) { - TestConfig config; - config.layerConfig.set_type("huber_classification"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu); - } -} - -void testExpandLayer(string trans_type, bool hasSubseq) { - TestConfig config; - config.layerConfig.set_type("expand"); - - config.inputDefs.push_back( - {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA, - "layer_0", - 10, - 0}); - config.inputDefs.push_back( - {hasSubseq ? 
INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, - "layer_1", - 10, - 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.set_trans_type(trans_type); - LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq; - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "expand", 30, false, useGpu); - } -} - -TEST(Layer, ExpandLayer) { - testExpandLayer("non-seq", false); // non-seq expand to seq - testExpandLayer("non-seq", true); // non-seq expand to hasSubseq - testExpandLayer("seq", true); // seq expand to hasSubseq -} - -void testDegradeLayer(bool hasSubseq, - string layer_type, - string trans_type, - int stride) { - TestConfig config; - config.layerConfig.set_type(layer_type); - config.layerConfig.set_size(10); - config.layerConfig.set_seq_pool_stride(stride); - config.biasSize = 0; - - config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, - "layer_0", - 10, - 0}); - config.layerConfig.add_inputs(); - config.layerConfig.set_trans_type(trans_type); - - auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) { - for (auto useGpu : {false, true}) { - testLayerGrad(config, layer_type, 100, false, useGpu); - } - }; - - if (layer_type == "average") { - for (auto strategy : {"average", "sum", "squarerootn"}) { - LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type - << " average_strategy=" << strategy - << " seq_pool_stride=" << stride; - config.layerConfig.set_average_strategy(strategy); - testDegradeLayerGrad(config, layer_type); - } - } else { - LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type - << " seq_pool_stride=" << stride; - testDegradeLayerGrad(config, layer_type); - } -} - -TEST(Layer, MaxLayer) { - testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq - testDegradeLayer(false, - "max", - "non-seq", - 5); // seq max to a shorten seq, stride window = 5 - testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq - testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq -} - -TEST(Layer, SequenceLastInstanceLayer) { - testDegradeLayer(false, - "seqlastins", - "non-seq", - -1); // seq seqlastins to non-seq - testDegradeLayer(false, - "seqlastins", - "non-seq", - 5); // seq seqlastins to a shorten seq, stride window = 5 - testDegradeLayer(true, - "seqlastins", - "non-seq", - -1); // hasSubseq seqlastins to non-seq - testDegradeLayer(true, - "seqlastins", - "seq", - -1); // hasSubseq seqlastins to seq -} - -TEST(Layer, AverageLayer) { - testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq - testDegradeLayer(false, - "average", - "non-seq", - 5); // seq average to a shorten seq, stride window = 5 - testDegradeLayer(true, - "average", - "non-seq", - -1); // hasSubseq average to non-seq - testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq -} - -TEST(Layer, SequenceConcatLayer) { - TestConfig config; - config.layerConfig.set_type("seqconcat"); - config.layerConfig.set_size(10); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "seqconcat", 100, false, useGpu); - } -} - -TEST(Layer, SequenceReshapeLayer) { - TestConfig config; - config.layerConfig.set_type("seqreshape"); - 
config.layerConfig.set_size(10); - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "seqreshape", 100, false, useGpu); - } -} - -TEST(Layer, ConvShiftLayer) { - TestConfig config; - config.layerConfig.set_type("conv_shift"); - config.layerConfig.set_size(10); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - // Not support GPU now - testLayerGrad(config, "conv_shift", 100, false, false); -} - -TEST(Layer, PowerLayer) { - TestConfig config; - config.layerConfig.set_type("power"); - config.layerConfig.set_size(10); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "power", 100, false, useGpu); - } -} - -TEST(Layer, ConvexCombinationLayer) { - TestConfig config; - config.layerConfig.set_type("convex_comb"); - config.layerConfig.set_size(20); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "convex_comb", 100, false, useGpu); - } -} - -TEST(Layer, InterpolationLayer) { - TestConfig config; - config.layerConfig.set_type("interpolation"); - config.layerConfig.set_size(10); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "interpolation", 100, false, useGpu); - } -} - -TEST(Layer, DotProdLayer) { - TestConfig config; - config.layerConfig.set_type("dot_prod"); - config.layerConfig.set_size(1); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "dot_prod", 10, false, useGpu); - } -} - -TEST(Layer, OuterProdLayer) { - TestConfig config; - config.layerConfig.set_type("out_prod"); - config.layerConfig.set_size(100); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "out_prod", 100, false, useGpu); - } -} - -TEST(Layer, SlopeInterceptLayer) { - TestConfig config; - config.layerConfig.set_type("slope_intercept"); - config.layerConfig.set_size(10); - config.layerConfig.set_slope(1.0); - config.layerConfig.set_intercept(0.1); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "slope_intercept", 100, false, useGpu); - } -} - -TEST(Layer, ScalingLayer) { - TestConfig config; - config.layerConfig.set_type("scaling"); - config.layerConfig.set_size(10); - 
config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "scaling", 100, false, useGpu); - } -} - -void testNormLayer(const string& normType, bool trans, bool useGpu) { - TestConfig config; - config.layerConfig.set_type("norm"); - config.layerConfig.set_active_type("relu"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - NormConfig* norm = input->mutable_norm_conf(); - norm->set_norm_type(normType); - norm->set_channels(16); - norm->set_size(5); - norm->set_scale(0.001); - norm->set_pow(0.75); - norm->set_blocked(0); - norm->set_img_size(14); - norm->set_img_size_y(7); - norm->set_output_x(norm->img_size()); - norm->set_output_y(norm->img_size_y()); - if (norm->norm_type() == "cmrnorm" || - norm->norm_type() == "cmrnorm-projection") { - norm->set_scale(norm->scale() / norm->size()); - } else { - norm->set_scale(norm->scale() / (norm->size() * norm->size())); - } - - config.layerConfig.set_size(norm->output_x() * norm->output_y() * - norm->channels()); - config.biasSize = 0; - - testLayerGrad(config, "norm", 100, trans, useGpu); -} - -TEST(Layer, NormLayer) { - testNormLayer("cmrnorm-projection", - /* trans= */ false, /* useGpu= */ - true); - testNormLayer("cmrnorm-projection", - /* trans= */ false, /* useGpu= */ - false); -} - -void setPoolConfig(TestConfig* config, - PoolConfig* pool, - const string& poolType) { - (*config).biasSize = 0; - (*config).layerConfig.set_type("pool"); - (*config).layerConfig.set_num_filters(16); - - int kw = 3, kh = 3; - int pw = 0, ph = 0; - int sw = 2, sh = 2; - pool->set_pool_type(poolType); - pool->set_channels(16); - pool->set_size_x(kw); - pool->set_size_y(kh); - pool->set_start(0); - pool->set_padding(pw); - pool->set_padding_y(ph); - pool->set_stride(sw); - pool->set_stride_y(sh); - - int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); - int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); - pool->set_output_x(ow); - pool->set_output_y(oh); -} - -void testPoolLayer(const string& poolType, - bool trans, - bool useGpu, - bool excludeMode = true) { - TestConfig config; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - - pool->set_img_size(14); - pool->set_img_size_y(14); - pool->set_exclude_mode(excludeMode); - setPoolConfig(&config, pool, poolType); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - testLayerGrad(config, "pool", 100, trans, useGpu); -} - -#ifdef PADDLE_WITH_CUDA -void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { - TestConfig config; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - - pool->set_size_y(4); - pool->set_stride_y(3); - pool->set_img_size(10); - pool->set_img_size_y(20); - setPoolConfig(&config, pool, poolType); - pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) / - ((float)pool->stride_y()) + - 1.5); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - testLayerGrad(config, "pool", 100, trans, useGpu); -} -#endif 
- -TEST(Layer, PoolLayer) { - testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); - testPoolLayer("avg-projection", - /* trans= */ false, - /* useGpu= */ false, - /* excludeMode= */ false); - testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); - testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false); - -#ifdef PADDLE_WITH_CUDA - testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); - testPoolLayer("avg-projection", - /* trans= */ false, - /* useGpu= */ true, - /* excludeMode= */ false); - testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true); - testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); - testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); - testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); - testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); - testPoolLayer2("cudnn-avg-incl-pad-pool", - /* trans= */ false, - /* useGpu= */ true); - testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); -#endif -} - -void setPool3DConfig(TestConfig* config, - PoolConfig* pool, - const string& poolType) { - // filter size - const int NUM_FILTERS = 16; - const int FILTER_SIZE = 3; - const int FILTER_SIZE_Y = 3; - const int FILTER_SIZE_Z = 3; - const int CHANNELS = 16; - - (*config).biasSize = 0; - (*config).layerConfig.set_type("pool3d"); - (*config).layerConfig.set_num_filters(NUM_FILTERS); - - int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z; - int pw = 0, ph = 0, pd = 0; - int sw = 2, sh = 2, sd = 2; - - pool->set_pool_type(poolType); - pool->set_pool_type("avg"); - pool->set_channels(CHANNELS); - pool->set_size_x(kw); - pool->set_size_y(kh); - pool->set_size_z(kd); - pool->set_padding(0); - pool->set_padding_y(0); - pool->set_padding_z(0); - pool->set_stride(sw); - pool->set_stride_y(sh); - pool->set_stride_z(sd); - pool->set_start(0); - int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); - int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); - int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false); - pool->set_output_x(ow); - pool->set_output_y(oh); - pool->set_output_z(od); -} - -void testPool3DLayer(const string& poolType, bool trans, bool useGpu) { - TestConfig config; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - - const int IMAGE_SIZE = 9; - const int IMAGE_SIZE_Y = 9; - const int IMAGE_SIZE_Z = 9; - - pool->set_img_size(IMAGE_SIZE); - pool->set_img_size_y(IMAGE_SIZE_Y); - pool->set_img_size_z(IMAGE_SIZE_Z); - - setPool3DConfig(&config, pool, poolType); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - testLayerGrad(config, "pool3d", 100, trans, useGpu); -} - -TEST(Layer, Pool3DLayer) { - testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false); - testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true); - testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true); -#endif -} - -void testSppLayer(const string& poolType, - const int pyramidHeight, - bool trans, - bool useGpu) { - TestConfig config; - config.layerConfig.set_type("spp"); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); - LayerInputConfig* input = 
config.layerConfig.add_inputs(); - SppConfig* sppConfig = input->mutable_spp_conf(); - sppConfig->set_pool_type(poolType); - sppConfig->set_pyramid_height(pyramidHeight); - ImageConfig* imageConfig = sppConfig->mutable_image_conf(); - imageConfig->set_channels(16); - imageConfig->set_img_size(10); - imageConfig->set_img_size_y(20); - int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); - config.layerConfig.set_size(outputSize * imageConfig->channels()); - testLayerGrad(config, "spp", 100, trans, useGpu); -} - -TEST(Layer, SpatialPyramidPoolLayer) { - for (auto useGpu : {false, true}) { - for (auto pyramidHeight : {1, 2, 3}) { - testSppLayer("avg-projection", pyramidHeight, false, useGpu); - testSppLayer("max-projection", pyramidHeight, false, useGpu); - } - } -} - -TEST(Layer, rankCostLayer) { - TestConfig config; - config.layerConfig.set_type("rank-cost"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "rank-cost", 100, false, useGpu); - } -} - -TEST(Layer, sumCostLayer) { - TestConfig config; - config.layerConfig.set_type("sum_cost"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "sum_cost", 100, false, useGpu); - } -} - -TEST(Layer, weightedRankCostLayer) { - TestConfig config; - config.layerConfig.set_type("rank-cost"); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu); - } -} - -TEST(Layer, TensorLayer) { - TestConfig config; - config.layerConfig.set_type("tensor"); - config.layerConfig.set_size(10); - config.layerConfig.set_active_type("sigmoid"); - config.biasSize = config.layerConfig.size(); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "tensor", 100, false, useGpu); - } -} - -TEST(Layer, RecurrentLayer) { - TestConfig config; - config.layerConfig.set_type("recurrent"); - config.layerConfig.set_size(4); - config.layerConfig.set_active_type("tanh"); - config.biasSize = 4; - - config.inputDefs.push_back( - {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - for (auto reversed : {false, true}) { - config.layerConfig.set_reversed(reversed); - config.testState = !reversed; - testLayerGrad( - config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0); - } - } -} - -TEST(Layer, LstmLayer) { - TestConfig config; - config.layerConfig.set_type("lstmemory"); - config.layerConfig.set_size(4); - 
config.layerConfig.set_active_type("tanh"); - config.layerConfig.set_active_state_type("sigmoid"); - config.layerConfig.set_active_gate_type("sigmoid"); - config.biasSize = 28; - - config.inputDefs.push_back( - {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - for (auto reversed : {false, true}) { - config.layerConfig.set_reversed(reversed); - config.testState = !reversed; - testLayerGrad( - config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02); - } - } - for (auto useGpu : {true}) { - config.testBatchState = true; - config.layerConfig.set_reversed(false); - testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu); - } -} - -TEST(Layer, MDLstmLayer) { - TestConfig config; - config.layerConfig.set_type("mdlstmemory"); - config.layerConfig.set_size(4); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_active_state_type("sigmoid"); - config.layerConfig.set_active_gate_type("sigmoid"); - config.biasSize = 4 * 9; - - config.inputDefs.push_back( - {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5}); - config.layerConfig.add_inputs(); - config.layerConfig.add_directions(true); - config.layerConfig.add_directions(true); - - for (auto useGpu : {false, true}) { - for (int i = 0; i < 2; i++) { - for (int j = 0; j < 2; j++) { - config.layerConfig.set_directions(0, bool(i)); - config.layerConfig.set_directions(1, bool(j)); - testLayerGrad(config, "mdlstmemory", 100, false, useGpu); - } - } - } -} - -TEST(Layer, ParameterReluLayer) { - auto testParameterReluLayer = [&](size_t inputSize, size_t channels) { - TestConfig config; - config.layerConfig.set_type("prelu"); - config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels}); - config.layerConfig.add_inputs(); - config.layerConfig.set_size(inputSize); - config.layerConfig.set_partial_sum(inputSize / - channels); // size of feature map - for (auto useGpu : {false, true}) { - testLayerGrad(config, "prelu", 100, false, useGpu); - } - }; - - testParameterReluLayer(192, 1); - testParameterReluLayer(192, 3); - testParameterReluLayer(192, 192); -} - -TEST(Layer, ResizeLayer) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("resize"); - config.layerConfig.set_size(64); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "resize", 100, false, useGpu); - } -} - -TEST(Layer, RotateLayer) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("rotate"); - const int CHANNEL = 2; - const int HEIGHT = 8; - const int WIDTH = 4; - const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL; - config.layerConfig.set_size(INPUT_SIZE); - config.layerConfig.set_height(HEIGHT); - config.layerConfig.set_width(WIDTH); - config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "rotate", 100, false, useGpu); - } -} - -TEST(Layer, NCELayer) { - TestConfig config; - size_t numClasses = 4; - config.layerConfig.set_type("nce"); - config.layerConfig.set_size(1); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_num_classes(numClasses); - config.biasSize = numClasses; - - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses}); - config.inputDefs.push_back( - {INPUT_LABEL, "label", /* dim= */ 
numClasses, /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto withWeight : {false, true}) { - if (withWeight) { - config.inputDefs.push_back( - {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - } - - for (auto isIdLabel : {false, true}) { - config.inputDefs[1] = { - isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, - "label", - /* dim= */ numClasses, - /* paraSize= */ 0}; - - for (auto withDist : {false, true}) { - config.layerConfig.clear_neg_sampling_dist(); - if (withDist) { - double sum = 0; - for (size_t i = 0; i < numClasses; ++i) { - real p = rand(); // NOLINT use rand_r - config.layerConfig.add_neg_sampling_dist(p); - sum += p; - } - for (size_t i = 0; i < numClasses; ++i) { - real p = config.layerConfig.neg_sampling_dist(i) / sum; - config.layerConfig.set_neg_sampling_dist(i, p); - } - } - LOG(INFO) << "NCELayer " - << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight - << " withDist=" << withDist; - // Not support GPU now - testLayerGrad(config, - "nce", - 100, - /* trans= */ false, - /* useGpu */ false); - } - } - } -} - -TEST(Layer, GatedRecurrentLayer) { - TestConfig config; - config.layerConfig.set_type("gated_recurrent"); - config.layerConfig.set_size(4); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_active_gate_type("sigmoid"); - config.biasSize = 12; - - config.inputDefs.push_back( - {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - for (auto reversed : {false, true}) { - config.layerConfig.set_reversed(reversed); - config.testState = !reversed; - testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu); - } - } -} - -TEST(Layer, GruStepLayer) { - TestConfig config; - config.layerConfig.set_type("gru_step"); - config.layerConfig.set_size(4); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_active_gate_type("sigmoid"); - config.biasSize = 12; - - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48}); - config.inputDefs.push_back( - {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu); - } -} - -TEST(Layer, LstmStepLayer) { - TestConfig config; - config.layerConfig.set_type("lstm_step"); - config.layerConfig.set_size(4); - config.layerConfig.set_active_type("sigmoid"); - config.layerConfig.set_active_state_type("sigmoid"); - config.layerConfig.set_active_gate_type("sigmoid"); - config.biasSize = 12; - config.testAccumulate = false; - - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0}); - config.inputDefs.push_back( - {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu); - } -} - -void testBatchNormLayer(const string& type, bool trans, bool useGpu) { - TestConfig config; - const int CHANNELS = 10; - const int IMG_SIZE = 16; - const int IMG_SIZE_Y = 8; - size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y; - config.layerConfig.set_type(type); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type("sigmoid"); - config.biasSize = CHANNELS; - 
config.inputDefs.push_back({INPUT_DATA, - "layer_0", - /* dim= */ size, - /* paraSize= */ CHANNELS}); - - config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); - config.inputDefs.back().isStatic = true; - config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); - config.inputDefs.back().isStatic = true; - - LayerInputConfig* input = config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(CHANNELS); - img_conf->set_img_size(IMG_SIZE); - img_conf->set_img_size_y(IMG_SIZE_Y); - - testLayerGrad(config, - "batch_norm", - 64, - /* trans= */ trans, - useGpu, - /* useWeight */ true); -} - -TEST(Layer, BatchNormalizationLayer) { - testBatchNormLayer("batch_norm", false, false); -#ifdef PADDLE_WITH_CUDA - testBatchNormLayer("batch_norm", false, true); - if (hl_get_cudnn_lib_version() >= int(4000)) { - testBatchNormLayer("cudnn_batch_norm", false, true); - } -#endif -} - -void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) { - TestConfig config; - const int CHANNELS = 10; - const int IMG_SIZE = 16; - const int IMG_SIZE_Y = 8; - const int IMG_SIZE_Z = 8; - size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z; - config.layerConfig.set_type(type); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type("sigmoid"); - config.biasSize = CHANNELS; - config.inputDefs.push_back({INPUT_DATA, - "layer_0", - /* dim= */ size, - /* paraSize= */ CHANNELS}); - - config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); - config.inputDefs.back().isStatic = true; - config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); - config.inputDefs.back().isStatic = true; - - LayerInputConfig* input = config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(CHANNELS); - img_conf->set_img_size(IMG_SIZE); - img_conf->set_img_size_y(IMG_SIZE_Y); - img_conf->set_img_size_z(IMG_SIZE_Z); - - testLayerGrad(config, - "batch_norm", - 64, - /* trans= */ trans, - useGpu, - /* useWeight */ true); -} - -TEST(Layer, testBatchNorm3DLayer) { - testBatchNorm3DLayer("batch_norm", false, false); -#ifdef PADDLE_WITH_CUDA - testBatchNorm3DLayer("batch_norm", false, true); - if (hl_get_cudnn_lib_version() >= int(4000)) { - testBatchNorm3DLayer("cudnn_batch_norm", false, true); - } -#endif -} - -void testConvOperator(bool isDeconv) { - TestConfig config; - const int NUM_FILTERS = 16; - const int FILTER_SIZE = 2; - const int FILTER_SIZE_Y = 3; - const int CHANNELS = 3; - const int IMAGE_SIZE = 16; - const int IMAGE_SIZE_Y = 9; - OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); - if (isDeconv) { - operatorConf.set_type("convt"); - } else { - operatorConf.set_type("conv"); - } - ConvConfig* conv = operatorConf.mutable_conv_conf(); - operatorConf.set_num_filters(NUM_FILTERS); - conv->set_filter_size(FILTER_SIZE); - conv->set_filter_size_y(FILTER_SIZE_Y); - conv->set_channels(CHANNELS); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_img_size(IMAGE_SIZE); - conv->set_img_size_y(IMAGE_SIZE_Y); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - 
conv->set_output_y(outputSize(conv->img_size_y(), - conv->filter_size_y(), - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true)); - - if (isDeconv) { - conv->set_filter_channels(NUM_FILTERS / conv->groups()); - config.inputDefs.push_back({INPUT_DATA, - "layer_0", - conv->output_x() * conv->output_y() * CHANNELS, - 0}); - config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS); - } else { - conv->set_filter_channels(conv->channels() / conv->groups()); - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - NUM_FILTERS); - } - - config.inputDefs.push_back( - {INPUT_DATA, - "layer_1", - FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, - 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false); -} - -TEST(Operator, conv) { - testConvOperator(/*isDeconv*/ true); - testConvOperator(/*isDeconv*/ false); -} - -TEST(Layer, FeatureMapExpandLayer) { - TestConfig config; - config.layerConfig.set_type("featmap_expand"); - const int CHANNELS = 10; - const int INPUT_SIZE = 100; - config.layerConfig.set_size(INPUT_SIZE * CHANNELS); - config.layerConfig.set_num_filters(CHANNELS); - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, - "layer_0", - /* dim= */ INPUT_SIZE, - /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - for (auto useGpu : {false, true}) { - for (auto asRowVec : {false, true}) { - config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec"); - testLayerGrad(config, - "featmap_expand", - /*batch_size*/ 100, - /* trans= */ false, - useGpu, - /* useWeight */ true); - } - } -} - -TEST(Layer, MultiplexLayer) { - TestConfig config; - const int LAYER_SIZE = 100; - config.layerConfig.set_type("multiplex"); - config.layerConfig.set_size(LAYER_SIZE); - - config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0}); - config.inputDefs.push_back( - {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0}); - config.inputDefs.push_back( - {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu); - } -} - -TEST(Layer, PadLayer) { - TestConfig config; - config.biasSize = 0; - config.layerConfig.set_type("pad"); - - int c = 4; - int h = 31; - int w = 36; - size_t size = c * h * w; - config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - PadConfig* pad = input->mutable_pad_conf(); - ImageConfig* image = pad->mutable_image_conf(); - - image->set_channels(c); - image->set_img_size(h); - image->set_img_size_y(w); - pad->add_pad_c(1); - pad->add_pad_c(2); - pad->add_pad_h(2); - pad->add_pad_h(3); - pad->add_pad_w(3); - pad->add_pad_w(5); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "pad", 10, false, useGpu); - } -} - -TEST(Layer, CrossChannelNormLayer) { - TestConfig config; - config.paramInitialMean = 1.; - config.paramInitialStd = 0.; - config.layerConfig.set_type("norm"); - config.layerConfig.set_size(100); - LayerInputConfig* input = config.layerConfig.add_inputs(); - NormConfig* norm = input->mutable_norm_conf(); - norm->set_norm_type("cross-channel-norm"); - norm->set_channels(10); - norm->set_size(100); - norm->set_scale(0); - 
norm->set_pow(0); - norm->set_blocked(0); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false); - } -} - -TEST(Layer, smooth_l1) { - TestConfig config; - config.layerConfig.set_type("smooth_l1"); - - config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0}); - config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "smooth_l1", 100, false, useGpu, false); - } -} - -TEST(Layer, multibox_loss) { - TestConfig config; - config.layerConfig.set_type("multibox_loss"); - config.biasSize = 0; - LayerInputConfig* input = config.layerConfig.add_inputs(); - MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf(); - multiboxLoss->set_num_classes(21); - multiboxLoss->set_input_num(1); - multiboxLoss->set_overlap_threshold(0.5); - multiboxLoss->set_neg_pos_ratio(3); - multiboxLoss->set_neg_overlap(0.5); - multiboxLoss->set_background_id(0); - multiboxLoss->set_height(3); - multiboxLoss->set_width(3); - - size_t gtNum = 1; - MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false); - labelValue->randomizeUniform(); - labelValue->add(-0.5); - labelValue->sigmoid(*labelValue); - real* labelData = labelValue->getData(); - size_t labelWidth = labelValue->getWidth(); - for (size_t i = 0; i < gtNum; ++i) { - *(labelData + i * labelWidth) = std::rand() % 20 + 1; - *(labelData + i * labelWidth + 1) = 0.400259; - *(labelData + i * labelWidth + 2) = 0.377857; - *(labelData + i * labelWidth + 3) = 0.525712; - *(labelData + i * labelWidth + 4) = 0.519368; - } - vector seqStartPositions(gtNum + 1, 0); - for (size_t i = 1; i <= gtNum; ++i) { - seqStartPositions[i] = i; - } - - // Ensure at lease one matched bbox - MatrixPtr priorValue = Matrix::create(1, 72, false, false); - priorValue->randomizeUniform(); - priorValue->add(-0.5); - priorValue->sigmoid(*priorValue); - real* priorData = priorValue->getData(); - *(priorData) = 0.424811; - *(priorData + 1) = 0.397059; - *(priorData + 2) = 0.538905; - *(priorData + 3) = 0.447091; - *(priorData + 4) = 0.425720; - *(priorData + 5) = 0.515228; - *(priorData + 6) = 0.519452; - *(priorData + 7) = 0.591065; - - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}}); - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); - config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); - config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "multibox_loss", 1, false, useGpu, false); - } -} - -TEST(Layer, TransLayer) { - TestConfig config; - const int height = 128; - const int width = 256; - config.layerConfig.set_type("trans"); - config.layerConfig.set_size(width); - - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "trans", height, /* trans= */ false, useGpu); - } -} - -TEST(Layer, RowConvLayer) { - const int context = 3; - const int size = 512; - - TestConfig config; - config.layerConfig.set_type("row_conv"); - config.layerConfig.set_size(size); - config.layerConfig.set_active_type("sigmoid"); - - 
config.inputDefs.push_back( - {INPUT_SEQUENCE_DATA, "layer_0", size, context * size}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - RowConvConfig* conv = input->mutable_row_conv_conf(); - conv->set_context_length(context); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "row_conv", 100, false, useGpu, false); - } -} - -TEST(Layer, CropLayer) { - TestConfig config; - // config input_0 - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ImageConfig* img = input->mutable_image_conf(); - img->set_channels(4); - img->set_img_size(16); - config.layerConfig.set_axis(2); - config.layerConfig.add_offset(0); - config.layerConfig.add_offset(0); - - // config input_1 - config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0}); - input = config.layerConfig.add_inputs(); - img = input->mutable_image_conf(); - img->set_channels(2); - img->set_img_size(8); - - // config crop layer - config.layerConfig.set_type("crop"); - config.layerConfig.set_name("cropLayer"); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "crop", 100, false, useGpu, false); - } -} - -TEST(Layer, roi_pool) { - TestConfig config; - config.layerConfig.set_type("roi_pool"); - config.biasSize = 0; - LayerInputConfig* input = config.layerConfig.add_inputs(); - ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf(); - roiPoolConf->set_pooled_width(7); - roiPoolConf->set_pooled_height(7); - roiPoolConf->set_spatial_scale(1. / 16); - roiPoolConf->set_width(14); - roiPoolConf->set_height(14); - - const size_t roiNum = 10; - const size_t roiDim = 10; - const size_t batchSize = 5; - MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false); - roiValue->zeroMem(); - real* roiData = roiValue->getData(); - for (size_t i = 0; i < roiNum; ++i) { - roiData[i * roiDim + 0] = std::rand() % batchSize; - roiData[i * roiDim + 1] = std::rand() % 224; // xMin - roiData[i * roiDim + 2] = std::rand() % 224; // yMin - size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]); - size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]); - roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin); // xMax - roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin); // yMax - } - - config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}}); - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false); - } -} - -TEST(Layer, SwitchOrderLayer) { - TestConfig config; - // config input_0 - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ImageConfig* img = input->mutable_image_conf(); - img->set_channels(4); - img->set_img_size(16); - img->set_img_size_y(16); - - ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf(); - reshape->add_height_axis(0); - reshape->add_height_axis(1); - reshape->add_height_axis(2); - reshape->add_width_axis(3); - - // config softmax layer - config.layerConfig.set_type("switch_order"); - config.layerConfig.set_name("switchOrderLayer"); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "switch_order", 100, false, useGpu, true); - } -} - -vector<real> randSampling(real range, int n) { - CHECK_GE(range, n); - vector<real> num(range); - iota(begin(num), end(num), 0.); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); -
sort(begin(num), end(num)); - return num; -} - -TEST(Layer, SubNestedSequenceLayer) { - // layer size is not crutial for this layer, - // so use a small layer size in unittest - const int layerSize = 4; - - const int maxSeqNum = 50; - const int maxSeqLen = 50; - const int maxBeamSize = 32; - - srand((size_t)(time(NULL))); - int beamSize = 1 + (rand() % maxBeamSize); - - TestConfig config; - config.layerConfig.set_type("sub_nested_seq"); - config.layerConfig.set_name("sub_nested_seq_layer"); - config.layerConfig.set_size(layerSize); - - int seqNum = 1 + (rand() % maxSeqNum); - - // sequence information for the first input, it is a nested sequence - vector<int> seqStartPos(seqNum + 1, 0); - vector<int> subSeqStartPos(1, 0); - - // selected indices - MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false); - selectedIndices->one(); - selectedIndices->mulScalar(-1.); - real* indicesData = selectedIndices->getData(); - - for (int i = 0; i < seqNum; ++i) { - int subSeqNum = 1 + (rand() % maxSeqNum); - for (int j = 0; j < subSeqNum; ++j) { - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % maxSeqLen))); - } - vector<real> selSeqs = - randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum)); - memcpy(indicesData + (i * beamSize), - selSeqs.data(), - selSeqs.size() * sizeof(real)); - seqStartPos[i + 1] = subSeqStartPos.back(); - } - - MatrixPtr seqInputPtr = - Matrix::create(seqStartPos.back(), layerSize, false, false); - seqInputPtr->randomizeUniform(); - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - "nested_seq_input", - seqInputPtr, - seqStartPos, - subSeqStartPos}); - config.layerConfig.add_inputs(); - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices}); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, - "sub_nested_seq", - /* batchSize */ seqNum, - /* trans */ false, - /* useGpu*/ useGpu, - /* useWeight */ false); - } -} - -TEST(Layer, ClipLayer) { - const size_t batchSize = 128; - const size_t size = 512; - TestConfig config; - config.layerConfig.set_type("clip"); - config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ClipConfig* layerConf = input->mutable_clip_conf(); - double p1 = std::rand() / (double)RAND_MAX; - double p2 = std::rand() / (double)RAND_MAX; - layerConf->set_min(std::min(p1, p2)); - layerConf->set_max(std::max(p1, p2)); - for (auto useGpu : {false, true}) { - testLayerGrad(config, "clip", batchSize, false, useGpu, false); - } -} - -TEST(Layer, RowL2NormLayer) { - const size_t batchSize = 128; - const size_t size = 512; - TestConfig config; - config.layerConfig.set_type("row_l2_norm"); - config.layerConfig.set_size(size); - config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); - config.layerConfig.add_inputs(); - for (auto useGpu : {false, true}) { - testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false); - } -} - -void test3DConvLayer(const string& type, bool trans, bool useGpu) { - // filter size - const int NUM_FILTERS = 6; - // const int CHANNELS = 3; - const int FILTER_SIZE = 3; - const int FILTER_SIZE_Y = 3; - const int FILTER_SIZE_Z = 3; - - // input image - const int CHANNELS = 3; - const int IMAGE_SIZE = 9; - const int IMAGE_SIZE_Y = 9; - const int IMAGE_SIZE_Z = 9; - - TestConfig config; - config.biasSize = NUM_FILTERS; - config.layerConfig.set_type(type); - config.layerConfig.set_num_filters(NUM_FILTERS); - config.layerConfig.set_partial_sum(1); -
config.layerConfig.set_shared_biases(true); - - // Setting up conv3D-trans layer - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - - conv->set_channels(CHANNELS); - conv->set_filter_size(FILTER_SIZE); - conv->set_filter_size_y(FILTER_SIZE_Y); - conv->set_filter_size_z(FILTER_SIZE_Z); - conv->set_padding(0); - conv->set_padding_y(0); - conv->set_padding_z(0); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_stride_z(2); - conv->set_img_size(IMAGE_SIZE); - conv->set_img_size_y(IMAGE_SIZE_Y); - conv->set_img_size_z(IMAGE_SIZE_Z); - conv->set_output_x(outputSize(conv->img_size(), - conv->filter_size(), - conv->padding(), - conv->stride(), - /* caffeMode */ true)); - conv->set_output_y(outputSize(conv->img_size_y(), - conv->filter_size_y(), - conv->padding_y(), - conv->stride_y(), - /* caffeMode */ true)); - conv->set_output_z(outputSize(conv->img_size_z(), - conv->filter_size_z(), - conv->padding_z(), - conv->stride_z(), - /* caffeMode */ true)); - - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - conv->output_z() * NUM_FILTERS); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - config.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z, - conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z * - NUM_FILTERS}); - - testLayerGrad(config, "conv3D", 10, trans, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02); -} - -TEST(Layer, test3DConvLayer) { - test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true); -#endif -} - -void test3DDeConvLayer(const string& type, bool trans, bool useGpu) { - // filter size - const int NUM_FILTERS = 6; - // const int CHANNELS = 3; - const int FILTER_SIZE = 3; - const int FILTER_SIZE_Y = 3; - const int FILTER_SIZE_Z = 3; - - // input image - const int CHANNELS = 3; - const int IMAGE_SIZE = 4; - const int IMAGE_SIZE_Y = 6; - const int IMAGE_SIZE_Z = 6; - - // Setting up conv-trans layer - TestConfig config; - config.biasSize = NUM_FILTERS; - config.layerConfig.set_type("deconv3d"); - config.layerConfig.set_num_filters(NUM_FILTERS); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - LayerInputConfig* input = config.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - - conv->set_channels(CHANNELS); - conv->set_filter_size(FILTER_SIZE); - conv->set_filter_size_y(FILTER_SIZE_Y); - conv->set_filter_size_z(FILTER_SIZE_Z); - conv->set_padding(0); - conv->set_padding_y(0); - conv->set_padding_z(0); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_stride_z(2); - conv->set_output_x(IMAGE_SIZE); - conv->set_output_y(IMAGE_SIZE_Y); - conv->set_output_z(IMAGE_SIZE_Z); - - conv->set_img_size(imageSize(conv->output_x(), - conv->filter_size(), - conv->padding(), - conv->stride(), - true)); - conv->set_img_size_y(imageSize(conv->output_y(), - conv->filter_size_y(), - conv->padding_y(), - conv->stride_y(), - true)); - conv->set_img_size_z(imageSize(conv->output_z(), - conv->filter_size_z(), - conv->padding_z(), - conv->stride_z(), - true)); - config.layerConfig.set_size(conv->img_size() * conv->img_size_y() * - conv->img_size_z() * NUM_FILTERS); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); 
- config.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z, - conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z * - NUM_FILTERS}); - - testLayerGrad(config, "deconv3D", 10, trans, useGpu); - // Use small batch_size and useWeight=true to test biasGrad - testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02); -} - -TEST(Layer, test3DDeConvLayer) { - test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false); -#ifdef PADDLE_WITH_CUDA - test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true); -#endif -} - -TEST(Layer, ScaleShiftLayer) { - // FIXME: Disable ScaleShiftLayer because it is not stable. - // https://github.com/PaddlePaddle/Paddle/issues/7781 - return; - // const size_t batchSize = 16; - // const size_t size = 32; - // TestConfig config; - // config.layerConfig.set_type("scale_shift"); - // config.layerConfig.set_size(size); - // config.biasSize = 1; - // config.inputDefs.push_back( - // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); - // config.layerConfig.add_inputs(); - // for (auto useGpu : {false, true}) { - // testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); - // } -} - -TEST(Layer, ScaleSubRegionLayer) { - const size_t batchSize = 64; - const size_t size = 4096; - TestConfig config; - config.layerConfig.set_type("scale_sub_region"); - config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); - MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false); - auto* data = indicesV->getData(); - for (size_t i = 0; i < batchSize; ++i) { - data[i * 2] = 2; - data[i * 2 + 1] = 4; - data[i * 2 + 2] = 16; - data[i * 2 + 3] = 32; - data[i * 2 + 4] = 16; - data[i * 2 + 5] = 32; - } - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - ScaleSubRegionConfig* scaleSubRegionConf = - input->mutable_scale_sub_region_conf(); - ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf(); - imgConf->set_img_size(32); - imgConf->set_img_size_y(32); - imgConf->set_channels(4); - scaleSubRegionConf->set_value(2.0); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false); - } -} - -TEST(Layer, L2DistanceLayer) { - TestConfig config; - config.layerConfig.set_type("l2_distance"); - config.layerConfig.set_size(1); - config.biasSize = 0; - - const size_t input_dim = 27; - const size_t batch_size = 11; - - config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0}); - config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - for (auto useGpu : {false, true}) { - testLayerGrad(config, "l2_distance", batch_size, false, useGpu); - } -} - -void testFactorizationMachineLayer(InputType type, bool useGpu) { - const int FACTOR_SIZE = 10; - TestConfig config; - config.layerConfig.set_type("factorization_machine"); - config.layerConfig.set_factor_size(FACTOR_SIZE); - config.layerConfig.set_size(1); - config.biasSize = 0; - config.inputDefs.push_back({type, "layer_0", 128, 1280}); - config.layerConfig.add_inputs(); - testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); -} - -TEST(Layer, FactorizationMachineLayer) { - for (auto useGpu : {false, true}) { - testFactorizationMachineLayer(INPUT_DATA, useGpu); - } - testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, 
false); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp deleted file mode 100644 index 423c31e27d7ca223f1cbff8f030b006d3889f0bb..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_LinearChainCRF.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/gserver/layers/LinearChainCRF.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static inline bool getNextSequence(vector& seq, int numClasses) { - for (auto& v : seq) { - if (++v < numClasses) { - return true; - } - v = 0; - } - return false; -} - -TEST(LinearChainCRF, decoding) { - const int numClasses = 4; - CpuVector para(numClasses * (numClasses + 2)); - real* a = para.getData(); - real* b = para.getData() + numClasses; - real* w = para.getData() + 2 * numClasses; - LinearChainCRF crf(4, para.getData()); - for (int length : {1, 2, 3, 10}) { - for (int tries = 0; tries < 10; ++tries) { - CpuMatrix x(length, numClasses); - x.randomizeUniform(); - para.randnorm(0, 2); - vector decodingResult(length); - vector bestResult(length); - vector testResult(length, 0); - crf.decode(x.getData(), &decodingResult[0], length); - real bestScore = -std::numeric_limits::max(); - do { - real score = a[testResult.front()] + b[testResult.back()]; - score += x.getElement(0, testResult.front()); - for (int k = 1; k < length; ++k) { - score += x.getElement(k, testResult[k]) + - w[numClasses * testResult[k - 1] + testResult[k]]; - } - if (score > bestScore) { - bestScore = score; - bestResult = testResult; - } - } while (getNextSequence(testResult, numClasses)); - for (int k = 0; k < length; ++k) { - EXPECT_EQ(decodingResult[k], bestResult[k]); - } - } - } -} diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp deleted file mode 100644 index a34a3f6206171fb1e0563ab9ef8550bc890359ce..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ /dev/null @@ -1,448 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include "MKLDNNTester.h" -#include "ModelConfig.pb.h" -#include "paddle/gserver/activations/MKLDNNActivation.h" -#include "paddle/math/MathUtils.h" - -using namespace paddle; // NOLINT - -DECLARE_bool(thread_local_rand_use_global_seed); -DECLARE_bool(use_gpu); -DECLARE_bool(use_mkldnn); - -#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC) \ - MKLDNNTester tester; \ - for (auto bs : {DESC.bs, 1}) { \ - tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \ - } - -#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \ - TestConfig ref = DNN_CONFIG; \ - ref.layerConfig.set_type(REF_TYPE); \ - RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC) - -struct testFcDesc { - int bs; - int ic; - int ih, iw; // oh == ow == 1 - int oc; -}; - -static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) { - cfg.layerConfig.set_type("mkldnn_fc"); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.set_size(pm.oc); - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)}); - cfg.layerConfig.add_inputs(); -} - -void testFcLayer(const testFcDesc& pm) { - TestConfig dnnConfig; - getMKLDNNFcConfig(dnnConfig, pm); - for (auto biasSize : {pm.oc, 0}) { - dnnConfig.biasSize = biasSize; - RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm) - } -} - -TEST(MKLDNNLayer, FcLayer) { - /* bs, ic, ih, iw, oc */ - testFcLayer({2, 2, 1, 1, 3}); - testFcLayer({3, 7, 1, 1, 19}); - testFcLayer({8, 16, 13, 13, 32}); - testFcLayer({4, 12, 13, 13, 18}); - testFcLayer({2, 64, 16, 16, 32}); - testFcLayer({15, 3, 16, 16, 6}); -} - -struct testConvDesc { - int bs, gp; - int ic, ih, iw; - int oc, oh, ow; - int fh, fw; - int ph, pw; - int sh, sw; - int dh, dw; -}; - -static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) { - cfg.layerConfig.set_type("mkldnn_conv"); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.set_num_filters(pm.oc); - cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow); - cfg.layerConfig.set_shared_biases(true); - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_groups(pm.gp); - conv->set_img_size(pm.iw); - conv->set_img_size_y(pm.ih); - conv->set_output_x(pm.ow); - conv->set_output_y(pm.oh); - conv->set_filter_size(pm.fw); - conv->set_filter_size_y(pm.fh); - conv->set_channels(pm.ic); - conv->set_padding(pm.pw); - conv->set_padding_y(pm.ph); - conv->set_stride(pm.sw); - conv->set_stride_y(pm.sh); - conv->set_dilation(pm.dw); - conv->set_dilation_y(pm.dh); - conv->set_caffe_mode(true); - conv->set_filter_channels(conv->channels() / conv->groups()); - CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels()) - << "it is indivisible"; - - int fh = (pm.fh - 1) * pm.dh + 1; - int fw = (pm.fw - 1) * pm.dw + 1; - int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true); - int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true); - CHECK_EQ(ow, pm.ow) << "output size check failed"; - CHECK_EQ(oh, pm.oh) << "output size check failed"; -} - -void testConvLayer(const testConvDesc& pm) { - TestConfig dnnConfig; - getMKLDNNConvConfig(dnnConfig, pm); - for (auto biasSize : {pm.oc, 0}) { - dnnConfig.biasSize = biasSize; - RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm) - } -} - 
-TEST(MKLDNNLayer, ConvLayer) { - /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */ - testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1}); - testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1}); - testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1}); - // with groups - testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); - testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1}); -} - -struct testPoolDesc { - int bs, ic; // input channel and output channel are the same - int ih, iw; - int oh, ow; - int fh, fw; - int ph, pw; - int sh, sw; -}; - -static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) { - cfg.layerConfig.set_type("mkldnn_pool"); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow); - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - pool->set_pool_type("avg-projection"); - pool->set_channels(pm.ic); - pool->set_img_size(pm.iw); - pool->set_img_size_y(pm.ih); - pool->set_output_x(pm.ow); - pool->set_output_y(pm.oh); - pool->set_size_x(pm.fw); - pool->set_size_y(pm.fh); - pool->set_padding(pm.pw); - pool->set_padding_y(pm.ph); - pool->set_stride(pm.sw); - pool->set_stride_y(pm.sh); - - int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false); - int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false); - CHECK_EQ(ow, pm.ow) << "output size check failed"; - CHECK_EQ(oh, pm.oh) << "output size check failed"; -} - -void testPoolLayer(const testPoolDesc& pm) { - TestConfig dnnConfig; - getMKLDNNPoolConfig(dnnConfig, pm); - LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0); - PoolConfig* pool = input->mutable_pool_conf(); - for (auto type : {"max-projection", "avg-projection"}) { - pool->set_pool_type(type); - RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm) - } -} - -TEST(MKLDNNLayer, PoolLayer) { - /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */ - testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2}); - testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2}); - testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2}); - testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2}); - testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2}); - testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1}); - testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1}); - testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2}); -} - -struct testBatchNormDesc { - int bs; - int ic; - int ih, iw; -}; - -static void getMKLDNNBatchNormConfig(TestConfig& cfg, - const testBatchNormDesc& pm) { - cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw); - cfg.layerConfig.set_type("mkldnn_batch_norm"); - cfg.biasSize = pm.ic; - cfg.inputDefs.push_back( - {INPUT_DATA, - "layer_0", - /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), - /* size of weight= */ size_t(pm.ic)}); - cfg.inputDefs.push_back( - {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)}); - cfg.inputDefs.back().isStatic = true; - cfg.inputDefs.push_back({INPUT_DATA, 
"layer_2_moving_var", 1, size_t(pm.ic)}); - cfg.inputDefs.back().isStatic = true; - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - cfg.layerConfig.set_active_type("relu"); - cfg.layerConfig.add_inputs(); - cfg.layerConfig.add_inputs(); - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(pm.ic); - img_conf->set_img_size_y(pm.ih); - img_conf->set_img_size(pm.iw); -} - -void testBatchNormLayer(const testBatchNormDesc& pm) { - TestConfig dnnConfig; - getMKLDNNBatchNormConfig(dnnConfig, pm); - TestConfig refConfig = dnnConfig; - refConfig.layerConfig.set_type("batch_norm"); - // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1 - VLOG(MKLDNN_TESTS) << "check train phase"; - dnnConfig.layerConfig.set_use_global_stats(false); - refConfig.layerConfig.set_use_global_stats(false); - MKLDNNTester tester; - tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN); - // for PASS_TEST, check use_global_stats true and false, and batchsize 1 - VLOG(MKLDNN_TESTS) << "check test phase"; - for (auto useGS : {false, true}) { - dnnConfig.layerConfig.set_use_global_stats(useGS); - refConfig.layerConfig.set_use_global_stats(useGS); - MKLDNNTester tester; - for (auto bs : {pm.bs, 1}) { - tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST); - } - } -} - -TEST(MKLDNNLayer, BatchNormLayer) { - testBatchNormLayer({4, 10, 6, 6}); - testBatchNormLayer({16, 32, 16, 16}); - testBatchNormLayer({4, 16, 8, 10}); -} - -struct testLRNDesc { - int bs, ic, ih, iw; - float scale, pow; - int localSize; -}; - -void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { - cfg.layerConfig.set_type("mkldnn_lrn"); - cfg.layerConfig.set_active_type("relu"); - size_t layerSize = pm.ic * pm.ih * pm.iw; - cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - NormConfig* norm = input->mutable_norm_conf(); - norm->set_channels(pm.ic); - norm->set_size(pm.localSize); - norm->set_scale(pm.scale); - norm->set_pow(pm.pow); - norm->set_blocked(0); - norm->set_img_size(pm.iw); - norm->set_img_size_y(pm.ih); - norm->set_output_x(norm->img_size()); - norm->set_output_y(norm->img_size_y()); - cfg.layerConfig.set_size(layerSize); - cfg.biasSize = 0; -} - -void testLRNLayer(const testLRNDesc& pm) { - TestConfig dnnConfig; - getMKLDNNLRNConfig(dnnConfig, pm); - // mkldnn_lrn <==> norm with cmrnorm-projection type - TestConfig refConfig = dnnConfig; - refConfig.layerConfig.set_type("norm"); - LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); - NormConfig* norm = input->mutable_norm_conf(); - norm->set_norm_type("cmrnorm-projection"); - norm->set_scale(norm->scale() / norm->size()); - RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) -} - -TEST(MKLDNNLayer, LRNLayer) { - testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); - testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); - testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); -} - -struct testImageDesc { - int bs, ic, ih, iw; -}; - -static void getAddtoConfig(TestConfig& cfg, - const testImageDesc& pm, - const size_t nInputs = 1) { - cfg.biasSize = 0; - cfg.layerConfig.set_type("addto"); - size_t layerSize = pm.ic * pm.ih * pm.iw; - cfg.layerConfig.set_size(layerSize); - cfg.layerConfig.set_active_type("relu"); - for (size_t i = 0; i < nInputs; ++i) { - std::stringstream ss; - ss << "layer_" << i; - cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - ImageConfig* img_conf = 
input->mutable_image_conf(); - img_conf->set_channels(pm.ic); - img_conf->set_img_size_y(pm.ih); - img_conf->set_img_size(pm.iw); - } -} - -void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) { - CHECK_GE(nInputs, 1UL); - TestConfig dnnConfig; - getAddtoConfig(dnnConfig, pm, nInputs); - dnnConfig.layerConfig.set_type("mkldnn_addto"); - for (auto withBias : {false, true}) { - dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0; - RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm) - } -} - -TEST(MKLDNNLayer, AddtoLayer) { - testAddtoLayer({16, 5, 14, 14}, 1); - testAddtoLayer({8, 10, 8, 8}, 2); - testAddtoLayer({4, 12, 1, 1}, 3); -} - -static void getMKLDNNConcatConfig(TestConfig& cfg, - const std::vector& inputs) { - CHECK_GE(inputs.size(), 2UL) << "at least two inputs"; - int oc = inputs[0].ic; - for (size_t i = 1; i < inputs.size(); ++i) { - CHECK_EQ(inputs[i].bs, inputs[0].bs); - CHECK_EQ(inputs[i].ih, inputs[0].ih); - CHECK_EQ(inputs[i].iw, inputs[0].iw); - oc += inputs[i].ic; - } - cfg.biasSize = 0; - cfg.layerConfig.set_type("mkldnn_concat"); - cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw); - cfg.layerConfig.set_active_type("relu"); - for (size_t i = 0; i < inputs.size(); ++i) { - std::stringstream ss; - ss << "layer_" << i; - cfg.inputDefs.push_back( - {INPUT_DATA, - ss.str(), - (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw, - 0}); - LayerInputConfig* input = cfg.layerConfig.add_inputs(); - ImageConfig* img_conf = input->mutable_image_conf(); - img_conf->set_channels(inputs[i].ic); - img_conf->set_img_size_y(inputs[i].ih); - img_conf->set_img_size(inputs[i].iw); - } -} - -void testConcatLayer(const std::vector& inputs) { - TestConfig dnnConfig; - getMKLDNNConcatConfig(dnnConfig, inputs); - RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0]) -} - -TEST(MKLDNNLayer, ConcatLayer) { - testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}}); - testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}}); -} - -void testActivation(std::string actType, const testImageDesc& pm) { - // TODO(TJ): remove me when paddle support elu activation - if (actType == "mkldnn_elu") { - return; - } - const std::string compareTypes[] = {actType, actType.erase(0, 7)}; - TestConfig cfg; - getAddtoConfig(cfg, pm); - TestConfig ref = cfg; - cfg.layerConfig.set_active_type(compareTypes[0]); - ref.layerConfig.set_active_type(compareTypes[1]); - RUN_MKLDNN_TEST(cfg, ref, pm) -} - -TEST(MKLDNNActivation, Activations) { - auto types = MKLDNNActivation::getAllRegisteredTypes(); - for (auto type : types) { - /* bs, c, h, w*/ - testActivation(type, {16, 64, 32, 32}); - testActivation(type, {2, 8, 1, 1}); - } -} - -DECLARE_string(config_args); -TEST(MKLDNNNet, net) { - std::vector cases = {"simple", "branch"}; - for (auto name : cases) { - std::string config = "./gserver/tests/mkldnn_" + name + "_net.conf"; - for (auto channels : {2, 32}) { - std::ostringstream oss; - oss << "channels=" << channels; - FLAGS_config_args = oss.str(); - MKLDNNTester::runNetTest(config); - } - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - FLAGS_use_gpu = false; - FLAGS_use_mkldnn = true; - initMain(argc, argv); - initPython(argc, argv); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp deleted file mode 100644 index 5188d2abed899a210de66084109034ee381cd078..0000000000000000000000000000000000000000 --- 
a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "LayerGradUtil.h" -#include "paddle/math/MathUtils.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; - -void setPoolConfig(TestConfig* config, - PoolConfig* pool, - const string& poolType) { - (*config).biasSize = 0; - (*config).layerConfig.set_type("pool"); - (*config).layerConfig.set_num_filters(1); - - int kw = 3, kh = 3; - int pw = 0, ph = 0; - int sw = 2, sh = 2; - pool->set_pool_type(poolType); - pool->set_channels(1); - pool->set_size_x(kw); - pool->set_size_y(kh); - pool->set_start(0); - pool->set_padding(pw); - pool->set_padding_y(ph); - pool->set_stride(sw); - pool->set_stride_y(sh); - - int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); - int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); - pool->set_output_x(ow); - pool->set_output_y(oh); -} - -void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat, - const string& poolType, - bool use_gpu, - MatrixPtr& maskMat) { - TestConfig config; - config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0}); - LayerInputConfig* input = config.layerConfig.add_inputs(); - PoolConfig* pool = input->mutable_pool_conf(); - - pool->set_img_size(5); - pool->set_img_size_y(5); - setPoolConfig(&config, pool, poolType); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - config.layerConfig.set_name("MaxPoolWithMask"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - - initDataLayer(config, - &dataLayers, - &datas, - &layerMap, - "MaxPoolWithMask", - 1, - false, - use_gpu); - - dataLayers[0]->getOutputValue()->copyFrom(*inputMat); - - FLAGS_use_gpu = use_gpu; - std::vector parameters; - LayerPtr maxPoolingWithMaskOutputLayer; - initTestLayer(config, &layerMap, ¶meters, &maxPoolingWithMaskOutputLayer); - maxPoolingWithMaskOutputLayer->forward(PASS_GC); - - checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value, - maskMat); -} - -TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { - bool useGpu = false; - MatrixPtr inputMat; - MatrixPtr maskMat; - real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1, - 0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8, - 0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0}; - real maskData[] = {12, 4, 22, 24}; - - inputMat = Matrix::create(1, 25, false, useGpu); - maskMat = Matrix::create(1, 4, false, useGpu); - inputMat->setData(inputData); - maskMat->setData(maskData); - doOneMaxPoolingWithMaskOutputTest( - inputMat, "max-pool-with-mask", useGpu, maskMat); -#ifdef PADDLE_WITH_CUDA - useGpu = true; - inputMat = Matrix::create(1, 25, false, useGpu); - maskMat = Matrix::create(1, 4, false, useGpu); - inputMat->copyFrom(inputData, 25); - maskMat->copyFrom(maskData, 4); - doOneMaxPoolingWithMaskOutputTest( - inputMat, "max-pool-with-mask", useGpu, maskMat); -#endif -} diff 
--git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp deleted file mode 100644 index 043025239e744601cbef3ca5c241509872963bd8..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_MultinomialSampler.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include -#include - -#undef PADDLE_DISABLE_TIMER -#include "paddle/utils/Stat.h" - -#include "paddle/gserver/layers/MultinomialSampler.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -class MultinomialSamplerTester : public MultinomialSampler { - public: - MultinomialSamplerTester(real* prob, int size) - : MultinomialSampler(prob, size) {} - - template - int testGen(Rand1 rand1) { - return gen1(rand1); - } -}; - -TEST(MultinomialSampler, gen) { - int numGrids = 1024 * 1024; - int size = 1024 * 4; - default_random_engine reng; - - for (size_t iter = 0; iter < 256; ++iter) { - uniform_int_distribution rand(1, numGrids / size * 1.8); - vector prob; - int sum = 0; - for (int i = 0; i < size; ++i) { - prob.push_back(rand(reng)); - sum += prob.back(); - } - - CHECK_LE(sum, numGrids); - prob.back() += numGrids - sum; - - vector counts(size); - MultinomialSamplerTester sampler(&prob[0], size); - counts.assign(size, 0); - { - double s = (double)size / (double)numGrids; - REGISTER_TIMER("MultinomialSampler"); - for (double i = 0; i < numGrids; ++i) { - int ret = sampler.testGen([i, s]() { return s * i; }); - if (ret < 0 || ret >= size) { - EXPECT_GE(ret, 0); - EXPECT_LT(ret, size); - break; - } - ++counts[ret]; - } - } - for (int i = 0; i < size; ++i) { - if (prob[i] != counts[i]) { - EXPECT_EQ(prob[i], counts[i]); - LOG(INFO) << iter; - break; - } - } - } -} - -void benchmarkRandom() { - int n = 1024 * 1024; - - int sum; - double sum1; - - sum = 0; - unsigned int seed = 1; - { - REGISTER_TIMER("crand"); - for (int i = 0; i < n; ++i) { - sum += rand_r(&seed) % 1000; - } - } - LOG(INFO) << "sum=" << sum; - - default_random_engine reng; - uniform_int_distribution rand(1, 1000); - sum = 0; - { - REGISTER_TIMER("stdrand"); - for (int i = 0; i < n; ++i) { - sum += rand(reng); - } - } - LOG(INFO) << "sum=" << sum; - - sum = 0; - { - REGISTER_TIMER("default_random_engine"); - for (int i = 0; i < n; ++i) { - sum += reng(); - } - } - LOG(INFO) << "sum=" << sum; - - uniform_real_distribution rand1(0, 1); - sum1 = 0; - { - REGISTER_TIMER("stdrand1"); - for (int i = 0; i < n; ++i) { - sum1 += rand1(reng); - } - } - LOG(INFO) << "sum1=" << sum1; - - sum1 = 0; - { - real a = 1.0f / (real)RAND_MAX; - REGISTER_TIMER("crand1"); - for (int i = 0; i < n; ++i) { - sum1 += a * rand_r(&seed); - } - } - LOG(INFO) << "sum1=" << sum1; -} - -int main(int argc, char** argv) { - initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - benchmarkRandom(); - int ret = RUN_ALL_TESTS(); - globalStat.printSegTimerStatus(); - return ret; -} diff --git 
a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp deleted file mode 100644 index fda3f2f7934adde09303f443ca5e8de6a7d077cd..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#undef PADDLE_DISABLE_TIMER -#include -#include -#include -#include - -#include "paddle/testing/TestUtil.h" -#include "paddle/trainer/Trainer.h" -#include "paddle/utils/Stat.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_double(checkgrad_eps); -DEFINE_bool(use_label, true, "input label or sequence label"); -DEFINE_bool(static_para, false, "static parameter"); - -struct DataIn { - std::vector inArgs; - std::vector outGrads; - std::vector paraValues; -}; - -struct DataOut { - std::vector outValues; - std::vector paraGrads; -}; - -void initArgument(DataIn& data, - const std::string& configPath, - bool useGpu = FLAGS_use_gpu) { - TrainerConfigHelper config(configPath); - size_t batchSize = config.getOptConfig().batch_size(); - - for (const auto& layer_name : config.getModelConfig().input_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - Argument arg; - arg.value = Matrix::create(batchSize, layerSize, false, useGpu); - arg.grad = Matrix::create(batchSize, layerSize, false, useGpu); - arg.value->randomizeUniform(); - arg.value->add(-0.5); - arg.value->sigmoid(*arg.value); - arg.grad->zeroMem(); - if (FLAGS_use_label) { - arg.ids = VectorT::create(batchSize, useGpu); - arg.ids->rand(layerSize); - } - generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); - data.inArgs.push_back(arg); - } - - for (const auto& layer_name : config.getModelConfig().output_layer_names()) { - auto layer_config = std::find_if(config.getModelConfig().layers().begin(), - config.getModelConfig().layers().end(), - [=](const LayerConfig& layer_config) { - return layer_config.name() == layer_name; - }); - CHECK(layer_config != config.getModelConfig().layers().end()); - - size_t layerSize = layer_config->size(); - MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu); - grad->randomizeUniform(); - data.outGrads.push_back(grad); - } - - for (const auto& para_config : config.getModelConfig().parameters()) { - VectorPtr value = Vector::create(para_config.size(), useGpu); - value->randnorm(0, 2); - data.paraValues.push_back(value); - } -} - -void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { - *ThreadLocalRand::getSeed() = 0; - srand(0); - - Trainer trainer; - auto config = std::make_shared(configPath); - trainer.init(config, false); - - std::vector 
parameters; - vector outArgs; - - auto gradientMachine = trainer.getGradientMachine(); - parameters = gradientMachine->getParameters(); - if (FLAGS_static_para) { - for (size_t i = 0; i < parameters.size(); i++) { - parameters[i]->getBuf(PARAMETER_VALUE)->one(); - } - } else { - for (size_t i = 0; i < in.paraValues.size(); i++) { - parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); - } - } - gradientMachine->start(); - gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN); - for (size_t i = 0; i < in.outGrads.size(); i++) { - // If the all the layers in the config have no parameters, also - // not set NeedGradient(), the outArgs[i] will be nullptr. - outArgs[i].grad->copyFrom(*in.outGrads[i]); - } - gradientMachine->backward(); - for (size_t i = 0; i < in.outGrads.size(); i++) { - MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(), - outArgs[i].value->getWidth(), - false, - false); - value->copyFrom(*outArgs[i].value); - out.outValues.push_back(value); - } - for (size_t i = 0; i < in.paraValues.size(); i++) { - VectorPtr grad = Vector::create( - parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false); - grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT)); - out.paraGrads.push_back(grad); - } - - for (int i = 0; i < 20; i++) { - REGISTER_TIMER("forward"); - gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN); - } - for (int i = 0; i < 20; i++) { - REGISTER_TIMER("backward"); - gradientMachine->backward(); - } - - gradientMachine->finish(); -} - -void checkBuffer(real* A, - const char* desA, - real* B, - const char* desB, - size_t len, - size_t width = 1) { - int nNum = 0; - for (size_t i = 0; i < len; ++i) { - real diff = fabs(A[i] - B[i]); - if (diff > 0.0f && - diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) { - nNum++; - LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i] - << " " << desB << " : " << B[i]; - } - } - EXPECT_EQ(0, nNum); -} - -void compareGradient(DataOut& outA, DataOut& outB) { - LOG(INFO) << "------------------------------" - << " Check Network Output " - << "------------------------------"; - for (size_t i = 0; i < outA.outValues.size(); ++i) { - LOG(INFO) << "OUTPUT VALUE: " << i; - checkBuffer(outA.outValues[i]->getData(), - "network A output", - outB.outValues[i]->getData(), - "network B output", - outA.outValues[i]->getElementCnt(), - outA.outValues[i]->getWidth()); - } - - if (!FLAGS_static_para) { - LOG(INFO) << "------------------------------" - << " Check Parameters " - << "------------------------------"; - for (size_t i = 0; i < outA.paraGrads.size(); ++i) { - LOG(INFO) << "PARAMETER GRADIENT: " << i; - checkBuffer(outA.paraGrads[i]->getData(), - "Network A", - outB.paraGrads[i]->getData(), - "Network B", - outA.paraGrads[i]->getSize()); - } - } -} - -void compareNetwork(const std::string& config_file_a, - const std::string& config_file_b) { - DataIn in; - initArgument(in, config_file_a); - - DataOut dataA; - calcGradient(in, dataA, config_file_a); - LOG(INFO) << "forwardBackward of Network A is finished"; - globalStat.printSegTimerStatus(); - globalStat.reset(); - LOG(INFO) << "\n\n"; - - DataOut dataB; - calcGradient(in, dataB, config_file_b); - LOG(INFO) << "forwardBackward of the Network B is finished"; - globalStat.printSegTimerStatus(); - globalStat.reset(); - LOG(INFO) << "\n\n"; - - compareGradient(dataA, dataB); -} - -TEST(Compare, concat_dotmul) { - std::string config_file_a = "./gserver/tests/concat_dotmul_a.conf"; - std::string config_file_b = 
"./gserver/tests/concat_dotmul_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -TEST(Compare, concat_fullmatrix) { - std::string config_file_a = "./gserver/tests/concat_fullmatrix_a.conf"; - std::string config_file_b = "./gserver/tests/concat_fullmatrix_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -TEST(Compare, concat_table) { - std::string config_file_a = "./gserver/tests/concat_table_a.conf"; - std::string config_file_b = "./gserver/tests/concat_table_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -TEST(Compare, concat_slice) { - std::string config_file_a = "./gserver/tests/concat_slice_a.conf"; - std::string config_file_b = "./gserver/tests/concat_slice_b.conf"; - compareNetwork(config_file_a, config_file_b); -} - -#ifdef PADDLE_WITH_CUDA -TEST(Compare, img_pool) { - std::string config_file_a = "./gserver/tests/img_pool_a.conf"; - std::string config_file_b = "./gserver/tests/img_pool_b.conf"; - bool useGpu = FLAGS_use_gpu; - FLAGS_use_gpu = true; - compareNetwork(config_file_a, config_file_b); - FLAGS_use_gpu = useGpu; -} - -TEST(Compare, img_conv) { - std::string config_file_a = "./gserver/tests/img_conv_a.conf"; - std::string config_file_b = "./gserver/tests/img_conv_b.conf"; - bool useGpu = FLAGS_use_gpu; - FLAGS_use_gpu = true; - compareNetwork(config_file_a, config_file_b); - FLAGS_use_gpu = useGpu; -} - -// Test cudnn_conv and exconv give the same result -TEST(Compare, img_conv2) { - std::string config_file_a = "./gserver/tests/img_conv_cudnn.py"; - std::string config_file_b = "./gserver/tests/img_conv_exconv.py"; - bool useGpu = FLAGS_use_gpu; - double eps = FLAGS_checkgrad_eps; - FLAGS_use_gpu = true; - // Sometimes, this unit test will fail with 1e-2 - FLAGS_checkgrad_eps = 4e-2; - compareNetwork(config_file_a, config_file_b); - FLAGS_use_gpu = useGpu; - FLAGS_checkgrad_eps = eps; -} -#endif - -DEFINE_string(config_file_a, "", "config of one network to compare"); -DEFINE_string(config_file_b, "", "config of another network to compare"); -TEST(Compare, network) { - if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") { - compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - paddle::initMain(argc, argv); - initPython(argc, argv); - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp deleted file mode 100644 index a1dee9795077b835392469b5085e9728679a1664..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_PyDataProvider.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include - -#include "paddle/gserver/dataproviders/PyDataProvider.h" -#include "paddle/utils/Util.h" - -#include "paddle/testing/TestUtil.h" - -using namespace std; // NOLINT -using namespace paddle; // NOLINT - -void simpleValueCheck(const vector& argumentList, bool useGpu); -void simpleSequenceCheck(const vector& argumentList, int sample_num); - -TEST(PyDataProvider, py_fill_slots) { - DataConfig config; - config.set_type("py"); - config.set_async_load_data(false); - config.set_load_data_module(std::string("pyDataProvider")); - config.set_load_data_object(std::string("SimpleDataProvider")); - config.clear_files(); - std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList"; - config.set_files(dataFile); -#ifndef PADDLE_WITH_CUDA - bool useGpu = false; -#else - bool useGpu = true; -#endif - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - DataBatch dataBatch; - dataProvider->getNextBatchInternal(2, &dataBatch); - const std::vector& argumentList = dataBatch.getStreams(); - // Check size - EXPECT_EQ(argumentList.size(), 3UL); - EXPECT_EQ(argumentList[0].value->getWidth(), 3UL); - EXPECT_EQ(argumentList[0].value->getHeight(), 2UL); - EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL); - EXPECT_EQ(argumentList[1].value->getWidth(), 7UL); - EXPECT_EQ(argumentList[1].value->getHeight(), 2UL); - EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL); - EXPECT_EQ(argumentList[2].ids->getSize(), 2UL); - // Check value - simpleValueCheck(argumentList, useGpu); - // Check sequenceStartPositions - simpleSequenceCheck(argumentList, 2); -} - -TEST(PyDataProvider, py_fill_nest_slots) { - DataConfig config; - config.set_type("py"); - config.set_async_load_data(false); - config.set_load_data_module(std::string("pyDataProvider")); - config.set_load_data_object(std::string("SimpleNestDataProvider")); - config.clear_files(); - std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList"; - config.set_files(dataFile); - EXPECT_EQ(config.IsInitialized(), true); -#ifndef PADDLE_WITH_CUDA - bool useGpu = false; -#else - bool useGpu = true; -#endif - unique_ptr dataProvider(DataProvider::create(config, useGpu)); - DataBatch dataBatch; - dataProvider->getNextBatchInternal(2, &dataBatch); - const std::vector& argumentList = dataBatch.getStreams(); - // Check size - EXPECT_EQ(argumentList.size(), 3UL); - EXPECT_EQ(argumentList[0].value->getWidth(), 3UL); - EXPECT_EQ(argumentList[0].value->getHeight(), 4UL); - EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL); - EXPECT_EQ(argumentList[1].value->getWidth(), 7UL); - EXPECT_EQ(argumentList[1].value->getHeight(), 4UL); - EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL); - EXPECT_EQ(argumentList[2].ids->getSize(), 4UL); - // Check value - simpleValueCheck(argumentList, useGpu); - // Check sequenceStartPositions - simpleSequenceCheck(argumentList, 4); - // Check subSequenceStartPositions - EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL); - EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL); - EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL); - for (size_t i = 0; i < argumentList.size(); i++) { - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0); - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1); - if (i != 1) { - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2); - EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4); - } else { - 
EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4); - } - } -} - -void simpleValueCheck(const vector& argumentList, bool useGpu) { - // Dense - real* data; - if (useGpu) { - MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(), - argumentList[0].value->getWidth(), - 0, - 0); - cpuMatrixPtr->copyFrom(*argumentList[0].value); - data = cpuMatrixPtr->getData(); - } else { - data = argumentList[0].value->getData(); - } - for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) { - EXPECT_EQ(*(data + i), (float)(i % 3 + 1)); - } - // Sparse without value - GpuSparseMatrixPtr matGpu; - CpuSparseMatrixPtr matCpu; - if (useGpu) { - matGpu = dynamic_pointer_cast(argumentList[1].value); - ASSERT_TRUE(matGpu != NULL); - } else { - data = argumentList[0].value->getData(); - matCpu = dynamic_pointer_cast(argumentList[1].value); - ASSERT_TRUE(matCpu != NULL); - } - for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) { - size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i); - EXPECT_EQ(colNum, (size_t)2); - const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i); - for (size_t j = 0; j < colNum; ++j) { - EXPECT_EQ((size_t)buf[j], (size_t)(j + 1)); - } - } - // Index - for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) { - EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL); - } -} - -void simpleSequenceCheck(const vector& argumentList, int sample_num) { - EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL); - EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL); - EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL); - for (size_t i = 0; i < argumentList.size(); i++) { - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0); - if (i != 1) { - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1); - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2), - sample_num); - } else { - EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), - sample_num); - } - } -} - -int main(int argc, char** argv) { - initMain(argc, argv); - initPython(argc, argv); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp deleted file mode 100644 index b39fb3534509ebde2702c02e35800fe3ed6016c3..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ /dev/null @@ -1,409 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_NO_PYTHON -#include -#include -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Util.h" - -DEFINE_string(train_list, "unittest.list", "file list for unittest"); - -namespace paddle { -namespace unittest { -namespace pydp2 { -extern void setOnPoolFilledHook(const std::function &func); -extern void clearOnPoolFilledHook(); - -} // namespace pydp2 -} // namespace unittest -} // namespace paddle - -const paddle::real epsilon = 1e-5; - -static inline int64_t readDataBatch(paddle::DataBatch *batch, - const std::string &funcName, - int64_t batchSize = 65535) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object(funcName); - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->setSkipShuffle(); - provider->reset(); - return provider->getNextBatchInternal(batchSize, batch); -} - -TEST(PyDataProvider2, dense_no_seq) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_dense_no_seq"); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - - provider->setSkipShuffle(); // skip shuffle for unittest. - - paddle::DataBatch batch; - for (size_t pass = 0; pass < 2; ++pass) { // read 2 passes - provider->reset(); - int64_t num = provider->getNextBatchInternal(100, &batch); - ASSERT_NE(num, 0); - ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1); - ASSERT_EQ((size_t)batch.getSize(), (size_t)100); - // Check batch data. - for (size_t i = 0; i < 100; ++i) { - for (size_t j = 0; j < 200; ++j) { - paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0); - ASSERT_NEAR( - batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); - } - } - - num = provider->getNextBatchInternal(100, &batch); - ASSERT_NE(num, 0); - ASSERT_EQ(batch.getStreams().size(), (size_t)1); - ASSERT_EQ((size_t)batch.getSize(), (size_t)100); - // Check batch data. - for (size_t i = 0; i < 100; ++i) { - size_t ii = i + 100; - for (size_t j = 0; j < 200; ++j) { - paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0); - ASSERT_NEAR( - batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); - } - } - num = provider->getNextBatchInternal(100, &batch); - ASSERT_EQ(num, 0); - } -} - -TEST(PyDataProvider2, index_no_seq) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_index_no_seq"); - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - - provider->setSkipShuffle(); // skip shuffle for unittest. 
- paddle::DataBatch batch; - for (size_t pass = 0; pass < 2; ++pass) { - provider->reset(); - int64_t num = provider->getNextBatchInternal(10000, &batch); - CHECK_EQ(num, 200); - for (int i = 0; i < 200; ++i) { - CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]); - } - } -} - -TEST(PyDataProvider2, init_hook) { - paddle::PyObjectPtr pickle = paddle::py::import("pickle"); - paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__"))); - PyDict_SetItemString(globals.get(), "pickle", pickle.get()); - paddle::PyObjectPtr locals(PyDict_New()); - paddle::PyObjectPtr mdl(PyRun_String( - "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})", - Py_file_input, - globals.get(), - locals.get())); - CHECK_PY(mdl) << "Error!"; - paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps")); - CHECK_PY(dps) << "Error!"; - - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_init_hook"); - config.set_load_data_args(PyString_AsString(dps.get())); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->setSkipShuffle(); // skip shuffle for unittest. - provider->reset(); - paddle::DataBatch batch; - int64_t num = provider->getNextBatchInternal(100000, &batch); - ASSERT_EQ(num, 200); - auto &mat = batch.getStreams()[0].value; - ASSERT_EQ((size_t)mat->getWidth(), (size_t)20); - for (size_t i = 0; i < 200; ++i) { - for (size_t j = 0; j < 20; ++j) { - ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon); - } - } -} - -TEST(PyDataProvider2, sparse_no_value_no_seq) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_sparse_non_value_no_seq"); - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batch; - int64_t num = provider->getNextBatchInternal(10000, &batch); - CHECK_EQ(num, 200); - auto csm = std::dynamic_pointer_cast( - batch.getStreams()[0].value); - CHECK(csm != nullptr); - for (int i = 0; i < 200; ++i) { - CHECK_EQ(csm->getColNum(i), (size_t)10); - int *cols = csm->getRowCols(i); - for (int j = 0; j < 10; ++j) { - CHECK_EQ(cols[j], (i + 1) * (j + 1)); - } - } -} - -TEST(PyDataProvider2, sparse_value_no_seq) { - paddle::DataBatch batch; - CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200); - auto csm = std::dynamic_pointer_cast( - batch.getStreams()[0].value); - CHECK(csm != nullptr); - for (int i = 0; i < 200; ++i) { - CHECK_EQ(csm->getColNum(i), (size_t)10); - int *cols = csm->getRowCols(i); - real *dat = csm->getRowValues(i); - for (int j = 0; j < 10; ++j) { - EXPECT_EQ(cols[j], (i + 1) * (j + 1)); - EXPECT_EQ(dat[j], real(j) / real(i + 1)); - } - } -} - -TEST(PyDataProvider2, index_seq) { - paddle::DataBatch batch; - CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200); - auto &arg = batch.getStreams()[0]; - CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2); - size_t tmp = 0; - for (size_t i = 0; i < 200; ++i) { // CHECK DATA CORRECT - for (size_t j = 0; j < i + 1; ++j) { - ASSERT_EQ((size_t)arg.ids->getData()[tmp], j); - ++tmp; - } - } - ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201); - tmp = 0; - for (size_t i = 0; i < 200; ++i) { - tmp += i; - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp); 
- } - tmp += 200; - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp); -} - -TEST(PyDataProvider2, index_sub_seq) { - paddle::DataBatch batch; - ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200); - auto &arg = batch.getStreams()[0]; - size_t tmp = 0; - for (size_t i = 0; i < 200; ++i) { - for (size_t j = 0; j < i + 1; ++j) { - for (size_t k = 0; k < j + 1; ++k) { - CHECK_EQ((size_t)arg.ids->getData()[tmp++], k); - } - } - } - - CHECK_EQ(tmp, arg.ids->getSize()); - - ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201); - ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0); - ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0); - size_t idx = 1; - tmp = 0; - for (size_t i = 0; i < 200; ++i) { - for (size_t j = 0; j < i + 1; ++j) { - tmp += j + 1; - ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx], - (size_t)tmp); - ++idx; - } - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp); - } -} - -TEST(PyDataProvider2, min_pool_size) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_min_pool_size"); - config.set_load_data_args(""); - size_t totalData = 1 << 14; - constexpr size_t batchSize = 100; - constexpr size_t minPoolSize = 1000; - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - - paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { - if (totalData > batchSize) { - CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize)); - } - }); - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); - if (realBatchSize) { - totalData -= realBatchSize; - } else { - break; - } - } - paddle::unittest::pydp2::clearOnPoolFilledHook(); -} - -TEST(PyDataProvider2, can_over_batch_size) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_can_over_batch_size"); - config.set_load_data_args(""); - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - constexpr size_t batchSize = 100; - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); - if (realBatchSize) { - CHECK_LE(static_cast(realBatchSize), batchSize); - } else { - break; - } - } -} - -TEST(PyDataProvider2, input_order) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_input_order"); - config.set_load_data_args(""); - - paddle::ModelConfig modelConfig; - *modelConfig.add_input_layer_names() = "input1"; - *modelConfig.add_input_layer_names() = "input2"; - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, modelConfig, false)); - provider->reset(); - constexpr size_t batchSize = 100; - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); - if (!realBatchSize) { - break; - } - ASSERT_EQ(batch.getStreams().size(), static_cast(2)); - for (int64_t i = 0; i < realBatchSize; ++i) { - ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); - ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); - } - } -} - 
-TEST(PyDataProvider2, test_check) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_check"); - config.set_load_data_args(""); - paddle::DataBatch batch; - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - while (true) { - int64_t realBatchSize = provider->getNextBatchInternal(100, &batch); - if (!realBatchSize) { - break; - } else { - auto &ivec = batch.getStream(0).ids; - for (size_t i = 0; i < ivec->getSize(); ++i) { - CHECK_LT(ivec->getData()[i], 10); - } - } - } -} - -TEST(PyDataProvider2, multiThread) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_dense_no_seq"); - config.set_async_load_data(true); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - provider->reset(); - paddle::DataBatch batch; - provider->getNextBatch(100, &batch); - provider->reset(); - provider.reset(); -} - -TEST(PyDataProvider2, minPoolSizeWithCache) { - paddle::DataConfig config; - config.set_type("py2"); - config.set_files(FLAGS_train_list.c_str()); - config.set_load_data_module("test_PyDataProvider2"); - config.set_load_data_object("test_min_pool_size_with_cache"); - config.set_async_load_data(true); - - std::unique_ptr provider( - paddle::DataProvider::create(config, false)); - - paddle::DataBatch batch; - - for (int i = 0; i < 10; ++i) { - provider->reset(); - int64_t sum = 0; - while (int64_t actualNum = provider->getNextBatch(100, &batch)) { - sum += actualNum; - } - ASSERT_EQ(1 << 20, sum); - } -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - paddle::initMain(argc, argv); - paddle::initPython(argc, argv); - - std::ofstream fout(FLAGS_train_list); - CHECK(fout.is_open()); - fout << "stub file name" << std::endl; // in unittest, filename is not used. - fout.close(); - - return RUN_ALL_TESTS(); -} - -#endif diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp deleted file mode 100644 index 9770567b88a2af946b30439300540ed61694ba10..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -DECLARE_int32(seed); - -using namespace paddle; // NOLINT -using namespace std; // NOLINT -class TrainerForTest : public paddle::Trainer { - public: - void startTrain() { - GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); - gm.start(); - } - - void finishTrain() { - GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); - gm.finish(); - } - - /** - * Get total dimension of all parameters. - * - * @return the total dimension of all parameters - */ - size_t getTotalParameterSize() const { - auto p = const_cast(this); - auto& params = p->getGradientMachine()->getParameters(); - return std::accumulate( - params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) { - return a + p->getSize(); - }); - } -}; - -void CalCost(const string& conf, - const string& dir, - real* cost, - int num_passes) { - auto config = std::make_shared(conf); - TrainerForTest trainer; - trainer.init(config); - mkDir(dir.c_str()); - config->setSaveDir(dir); - auto dataProvider = trainer.getDataProvider(); - int32_t batchSize = config->getOptConfig().batch_size(); - real learningRate = config->getOptConfig().learning_rate(); - real momentum = 0; - real decayRate = 0; - int64_t dim = trainer.getTotalParameterSize(); - CpuVector vecW(dim); - CpuVector vecGradient(dim); - CpuVector vecMomentum(dim); - - // vecW needs to be assigned, otherwise the variable is an uncertain value. - - *ThreadLocalRand::getSeed() = FLAGS_seed; - vecW.randnorm(0, 0.1); - vecMomentum.randnorm(0, 0.1); - - trainer.startTrain(); - for (int i = 0; i < num_passes; ++i) { - real totalCost = 0; - dataProvider->reset(); - while (true) { - DataBatch dataBatch; - int num = dataProvider->getNextBatch(batchSize, &dataBatch); - if (num == 0) break; - totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient); - sgdUpdate( - learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum); - } - cost[i] = totalCost; - } - trainer.finishTrain(); - rmDir(dir.c_str()); -} - -void test(const string& conf1, const string& conf2, double eps, bool useGpu) { - if (!paddle::version::isWithGpu() && useGpu) { - return; - } - FLAGS_use_gpu = useGpu; - int num_passes = 5; - real* cost1 = new real[num_passes]; - const string dir1 = "gserver/tests/t1"; - CalCost(conf1, dir1, cost1, num_passes); - - real* cost2 = new real[num_passes]; - const string dir2 = "gserver/tests/t2"; - CalCost(conf2, dir2, cost2, num_passes); - - for (int i = 0; i < num_passes; i++) { - LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i] - << ", cost2=" << cost2[i] - << ", diff=" << std::abs(cost1[i] - cost2[i]); - ASSERT_NEAR(cost1[i], cost2[i], eps); - } - delete[] cost1; - delete[] cost2; -} - -TEST(RecurrentGradientMachine, HasSubSequence) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_layer_group.conf", - "gserver/tests/sequence_nest_layer_group.conf", - 1e-5, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn.conf", - "gserver/tests/sequence_nest_rnn.conf", - 1e-6, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn_multi_input) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn_multi_input.conf", - "gserver/tests/sequence_nest_rnn_multi_input.conf", - 1e-6, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { - for (bool useGpu : {false, true}) { - 
test("gserver/tests/sequence_rnn_multi_unequalength_inputs.py", - "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py", - 1e-6, - useGpu); - } -} - -TEST(RecurrentGradientMachine, rnn_mixed_input) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn_mixed_inputs.py", - "gserver/tests/sequence_rnn_matched_inputs.py", - 1e-6, - useGpu); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - - if (paddle::version::isWithPyDataProvider()) { - if (!paddle::version::isWithGpu()) { - FLAGS_use_gpu = false; - } - initMain(argc, argv); - initPython(argc, argv); - return RUN_ALL_TESTS(); - } else { - return 0; - } -} diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp deleted file mode 100644 index b54e37b7dbf8bffeb949f709e6a4f9ec86ea13c3..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ /dev/null @@ -1,571 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/Layer.h" - -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT -DECLARE_bool(use_gpu); -DECLARE_bool(rnn_use_batch); -DECLARE_int32(fixed_seq_length); - -void checkError(const Matrix& matrix1, const Matrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void checkError(const CpuVector& vector1, const CpuVector& vector2) { - CHECK(vector1.getSize() == vector2.getSize()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int size = vector1.getSize(); - const real* data1 = vector1.getData(); - const real* data2 = vector2.getData(); - int count = 0; - for (int i = 0; i < size; i++) { - if (fabs(data1[i] - data2[i]) > err) { - count++; - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -LayerPtr creatDataLayer(string name, - size_t batchSize, - int layerSize, - bool useGpu) { - LayerConfig dataConfig; - dataConfig.set_name(name); - dataConfig.set_type("data"); - dataConfig.set_size(layerSize); - LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); - - Argument data; - data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu); - data.grad = Matrix::create(batchSize, layer->getSize(), false, useGpu); - 
data.value->randomizeUniform(); - data.value->add(-0.5); - data.value->sigmoid(*data.value); - data.grad->zeroMem(); - - generateSequenceStartPositions(batchSize, data.sequenceStartPositions); - - DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); - dataLayer->setData(data); - dataLayer->forward(PASS_GC); - - return layer; -} - -ParameterPtr creatParameter(string name, - int pid, - size_t paraSize, - bool useGpu) { - ParameterConfig paraConfig; - paraConfig.set_name(name); - paraConfig.set_size(paraSize); - - ParameterPtr parameter = - std::make_shared(paraConfig, useGpu, /*initialize */ false); - parameter->enableType(PARAMETER_VALUE); - parameter->enableType(PARAMETER_GRADIENT); - parameter->randomize(); - parameter->setID(pid); - - return parameter; -} - -ParameterPtr creatParameterBias(string name, - int pid, - size_t paraSize, - bool useGpu) { - ParameterConfig paraConfig; - paraConfig.set_name(name); - paraConfig.set_size(paraSize); - paraConfig.set_initial_std(1); - - ParameterPtr parameter = - std::make_shared(paraConfig, useGpu, /*initialize */ true); - parameter->randomize(); - parameter->setID(pid); - - return parameter; -} - -LayerPtr initRecurrentLayer(LayerConfig layerConfig, - size_t batchSize, - int layerSize, - bool useGpu) { - FLAGS_use_gpu = useGpu; - LayerMap layerMap; - ParameterMap parameterMap; - LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu); - layerMap[dataLayer->getName()] = dataLayer; - - ParameterPtr para = - creatParameter("para_0", 0, layerSize * layerSize, useGpu); - parameterMap[para->getName()] = para; - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - LayerPtr testLayer = Layer::create(layerConfig); - layerMap[testLayer->getName()] = testLayer; - - testLayer->init(layerMap, parameterMap); - testLayer->setNeedGradient(true); - - return testLayer; -} - -void checkRecurrentLayer(LayerPtr testLayer) { - const VectorPtr& weightGrad = - (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT); - const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad(); - CpuVector seqPara(weightGrad->getSize()); - CpuVector batPara(weightGrad->getSize()); - CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - - CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - outputGrad.randomizeUniform(); - - /* use sequence calculate */ - FLAGS_rnn_use_batch = false; - weightGrad->zero(); - inputGrad->zero(); - testLayer->forward(PASS_GC); - testLayer->getOutputGrad()->copyFrom(outputGrad); - testLayer->backward(); - seqPara.copyFrom(*weightGrad); - seqInputGrad.copyFrom(*inputGrad); - - /* use batch calculate */ - FLAGS_rnn_use_batch = true; - weightGrad->zero(); - inputGrad->zero(); - testLayer->forward(PASS_GC); - testLayer->getOutputGrad()->copyFrom(outputGrad); - testLayer->backward(); - batPara.copyFrom(*weightGrad); - batInputGrad.copyFrom(*inputGrad); - - /* check */ - checkError(seqInputGrad, batInputGrad); - checkError(seqPara, batPara); -} - -TEST(Layer, RecurrentLayer) { - LayerConfig layerConfig; - layerConfig.set_name("rnn"); - layerConfig.set_type("recurrent"); - layerConfig.set_active_type("tanh"); - for (auto layerSize : {1, 10, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 20, 100, 128}) { - for (auto useGpu : {false, true}) { - for (auto reversed : {false, true}) { - LOG(INFO) 
<< " layerSize=" << layerSize << " batchSize=" << batchSize - << " useGpu=" << useGpu << " reversed=" << reversed; - layerConfig.set_size(layerSize); - layerConfig.set_reversed(reversed); - LayerPtr testLayer = - initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu); - checkRecurrentLayer(testLayer); - } - } - } - } -} - -#define protected public -#include "paddle/gserver/layers/GatedRecurrentLayer.h" -#include "paddle/gserver/layers/LstmLayer.h" -#include "paddle/gserver/layers/RecurrentLayer.h" -template -class TestRecurrentLayer { - public: - LayerConfig config_; - bool useGpu_; - bool useBatch_; - LayerPtr testLayer_; - LayerPtr dataLayer_; - ParameterPtr para_; - ParameterPtr bias_; - LayerMap layerMap_; - ParameterMap parameterMap_; - TestRecurrentLayer(const LayerConfig& config, - bool useGpu, - bool useBatch = false) - : config_(config), useGpu_(useGpu), useBatch_(useBatch) {} - void init(size_t batchSize) { - FLAGS_use_gpu = useGpu_; - testLayer_ = Layer::create(config_); - if (typeid(T) == typeid(GatedRecurrentLayer)) { - dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - batchSize, - config_.size() * 3, - useGpu_); - para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(), - 0, - config_.size() * config_.size() * 3, - useGpu_); - bias_ = creatParameterBias( - config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_); - } else if (typeid(T) == typeid(LstmLayer)) { - dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - batchSize, - config_.size() * 4, - useGpu_); - para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(), - 0, - config_.size() * config_.size() * 4, - useGpu_); - bias_ = creatParameterBias( - config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_); - } - layerMap_[dataLayer_->getName()] = dataLayer_; - parameterMap_[para_->getName()] = para_; - parameterMap_[bias_->getName()] = bias_; - - layerMap_[testLayer_->getName()] = testLayer_; - testLayer_->init(layerMap_, parameterMap_); - testLayer_->setNeedGradient(true); - (dynamic_cast(testLayer_.get()))->useBatch_ = useBatch_; - } - void forward() { - FLAGS_use_gpu = useGpu_; - testLayer_->forward(PASS_GC); - } - void backward() { - FLAGS_use_gpu = useGpu_; - testLayer_->backward(nullptr); - } -}; - -template -void checkRecurrentLayer(LayerConfig layerConfig, - size_t batchSize, - bool cpuBatch, - bool gpuBatch) { - TestRecurrentLayer testCpu(layerConfig, false, cpuBatch); - TestRecurrentLayer testGpu(layerConfig, true, gpuBatch); - testCpu.init(batchSize); - testGpu.init(batchSize); - auto checkError = []( - MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) { - CpuMatrix check(gpu->getHeight(), gpu->getWidth()); - check.copyFrom(*gpu); - int height = cpu->getHeight(); - int width = cpu->getWidth(); - const real* data1 = cpu->getData(); - const real* data2 = check.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences > - 1e-4) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "[" << str << "]" - << "There are " << count << " different element."; - }; - T* cpuLayer = dynamic_cast(testCpu.testLayer_.get()); - T* gpuLayer = dynamic_cast(testGpu.testLayer_.get()); - - Argument& cpuInput = testCpu.dataLayer_->getOutput(); - Argument& gpuInput = testGpu.dataLayer_->getOutput(); - gpuInput.resizeAndCopyFrom(cpuInput, true); - - const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE); 
- const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE); - gpuVec->copyFrom(*cpuVec); - - const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE); - const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE); - gpuBiasVec->copyFrom(*cpuBiasVec); - - /* check forward */ - testCpu.forward(); - testGpu.forward(); - - checkError( - cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue"); - - /* check backward */ - cpuLayer->getOutputGrad()->randomizeUniform(); - gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad()); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - - testCpu.backward(); - testGpu.backward(); - - // check input grad - checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad"); - // check weight grad - int numSequences = cpuInput.getNumSequences(); - checkError(cpuLayer->weight_->getWGrad(), - gpuLayer->weight_->getWGrad(), - numSequences, - "weightGrad"); - // check bias grad - checkError(cpuLayer->bias_->getWGrad(), - gpuLayer->bias_->getWGrad(), - numSequences, - "biasGrad"); -} - -TEST(Layer, GatedRecurrentLayer) { - LayerConfig layerConfig; - layerConfig.set_type("gated_recurrent"); - layerConfig.set_active_type("sigmoid"); - layerConfig.set_active_gate_type("sigmoid"); - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - layerConfig.set_bias_parameter_name("bias"); - - for (auto frameSize : {32, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 100, 500}) { - for (auto reversed : {false, true}) { - for (auto cpuBatch : {false, true}) { - for (auto gpuBatch : {false, true}) { - LOG(INFO) << " batchSize=" << batchSize - << " frameSize=" << frameSize << " reversed=" << reversed - << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; - layerConfig.set_size(frameSize); - layerConfig.set_reversed(reversed); - checkRecurrentLayer( - layerConfig, batchSize, cpuBatch, gpuBatch); - } - } - } - } - } -} - -TEST(Layer, LstmLayer) { - LayerConfig layerConfig; - layerConfig.set_type("lstmemory"); - layerConfig.set_active_type("relu"); - layerConfig.set_active_state_type("tanh"); - layerConfig.set_active_gate_type("sigmoid"); - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - layerConfig.set_bias_parameter_name("bias"); - - for (auto frameSize : {32, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 100, 500}) { - for (auto reversed : {false, true}) { - for (auto cpuBatch : {false, true}) { - for (auto gpuBatch : {false, true}) { - LOG(INFO) << " batchSize=" << batchSize - << " frameSize=" << frameSize << " reversed=" << reversed - << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; - layerConfig.set_size(frameSize); - layerConfig.set_reversed(reversed); - checkRecurrentLayer( - layerConfig, batchSize, cpuBatch, gpuBatch); - } - } - } - } - } -} - -#ifdef PADDLE_WITH_MKLML - -#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h" - -LayerPtr initMKLPackedLayer(LayerConfig layerConfig, - bool reversed, - int layerSize, - LayerPtr dataLayer, - ParameterPtr para, - ParameterPtr bias = nullptr) { - LayerMap layerMap; - ParameterMap parameterMap; - layerMap[dataLayer->getName()] = dataLayer; - parameterMap[para->getName()] = para; - if (bias) { - parameterMap[bias->getName()] = bias; - layerConfig.set_bias_parameter_name("bias_0"); - } - - 
layerConfig.set_size(layerSize); - layerConfig.set_reversed(reversed); - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name("layer_0"); - input.set_input_parameter_name("para_0"); - - LayerPtr testLayer = Layer::create(layerConfig); - layerMap[testLayer->getName()] = testLayer; - - testLayer->init(layerMap, parameterMap); - testLayer->setNeedGradient(true); - - return testLayer; -} - -void checkMKLPackedLayer(LayerConfig layerConfig1, - LayerConfig layerConfig2, - bool reversed, - int layerSize, - int batchSize, - bool useBatch1, - bool useBatch2) { - LayerPtr dataLayer; - ParameterPtr para, bias; - - if (layerConfig1.type() == "recurrent") { - dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); - para = creatParameter("para_0", 0, layerSize * layerSize, false); - bias = nullptr; - } else if (layerConfig1.type() == "gated_recurrent") { - dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); - para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); - bias = creatParameterBias("bias_0", 1, layerSize * 3, false); - } - - LayerPtr testLayer1 = initMKLPackedLayer( - layerConfig1, reversed, layerSize, dataLayer, para, bias); - LayerPtr testLayer2 = initMKLPackedLayer( - layerConfig2, reversed, layerSize, dataLayer, para, bias); - - const VectorPtr& weightGrad = - (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); - const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); - CpuVector wgt_grad1(weightGrad->getSize()); - CpuVector wgt_grad2(weightGrad->getSize()); - CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); - - for (int i = 0; i < 2; i++) { - FLAGS_rnn_use_batch = useBatch1; - - testLayer1->forward(PASS_GC); - - FLAGS_rnn_use_batch = useBatch2; - testLayer2->forward(PASS_GC); - - testLayer1->getOutputGrad()->randomizeUniform(); - testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); - - weightGrad->zero(); - inputGrad->zero(); - FLAGS_rnn_use_batch = useBatch1; - testLayer1->backward(nullptr); - - wgt_grad1.copyFrom(*weightGrad); - input_grad1.copyFrom(*inputGrad); - - weightGrad->zero(); - inputGrad->zero(); - FLAGS_rnn_use_batch = useBatch2; - testLayer2->backward(nullptr); - - wgt_grad2.copyFrom(*weightGrad); - input_grad2.copyFrom(*inputGrad); - - checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); - checkError(wgt_grad1, wgt_grad2); - checkError(input_grad1, input_grad2); - } -} - -TEST(MKLPackedLayer, RecurrentLayer) { - LayerConfig layerConfig1; - LayerConfig layerConfig2; - - layerConfig1.set_name("paddle-rnn"); - layerConfig1.set_type("recurrent"); - layerConfig1.set_active_type("relu"); - - layerConfig2.set_name("mkl-packed-rnn"); - layerConfig2.set_type("mkl_packed_recurrent"); - layerConfig2.set_active_type("relu"); - - FLAGS_use_gpu = false; - - for (auto layerSize : {32, 64, 128, 256, 512}) { - for (auto batchSize : {1, 5, 100, 500}) { - for (auto reversed : {true, false}) { - for (auto paddle_use_batch : {true, false}) { - for (auto MKLPacked_use_batch : {true, false}) { - LOG(INFO) << " layerSize=" << layerSize - << " batchSize=" << batchSize << " reversed=" << reversed - << " paddle_use_batch=" << paddle_use_batch - << " MKLPacked_use_batch=" << MKLPacked_use_batch; - - checkMKLPackedLayer(layerConfig1, - layerConfig2, - reversed, - layerSize, - batchSize, - paddle_use_batch, - MKLPacked_use_batch); - } - } - } 
- } - } -} -#endif - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - if (!version::isWithGpu()) { - testing::GTEST_FLAG(filter) = "-Layer.*"; - } - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp deleted file mode 100644 index 583e3bc545a3b5eb158490a8ccc5ea7060c7c6ab..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp +++ /dev/null @@ -1,471 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/FullyConnectedLayer.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h" -#include "paddle/math/CpuSparseMatrix.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); -DECLARE_int32(num_passes); -DECLARE_string(config); -DECLARE_string(init_model_path); -DECLARE_string(config_args); - -size_t fcLayerWidth = 1024; - -struct ComData { - vector outArgs; - vector parameters; -}; - -int randint(int* data, size_t int_max, size_t size) { - srand((size_t)(time(NULL))); - if (int_max < size) { - return -1; - } - size_t count = 0; - std::map tmp; - int this_int = 0; - - while (count < size) { - this_int = std::rand() % int_max; // NOLINT - if (tmp.find(this_int) == tmp.end()) { - tmp[this_int] = 0; - count += 1; - } - } - - if (tmp.size() != size) { - return -1; - } - count = 0; - for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) { - data[count] = itr->first; - count += 1; - } - return 0; -} - -void calcOutput(ComData& comData, - const string configFile, - const string configArgs, - bool useGpu) { - FLAGS_config = configFile; - FLAGS_config_args = configArgs; - FLAGS_use_gpu = useGpu; - FLAGS_init_model_path = "gserver/tests/SelectiveFcTest/model"; - *ThreadLocalRand::getSeed() = 0; - srand(0); - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlags(), false); - - comData.parameters = trainer.getGradientMachine()->getParameters(); - - auto dataProvider = trainer.getDataProvider(); - int32_t batchSize = trainer.getConfig().opt_config().batch_size(); - DataBatch dataBatch; - dataProvider->setSkipShuffle(); - dataProvider->reset(); - dataProvider->getNextBatch(batchSize, &dataBatch); - CHECK(dataBatch.getSize()) << "No data from data provider"; - - vector& inArgs = dataBatch.getStreams(); - trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); - trainer.getGradientMachine()->forwardBackward( - inArgs, &comData.outArgs, PASS_TRAIN); - trainer.getGradientMachine()->finish(); -} - -void checkMatrix(real* A, real* B, size_t matSize) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - int diffNum = 0; - for (size_t i = 0; i < matSize; ++i) { - if 
(std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) || - std::isnan(B[i])) { - } else if (fabs(A[i] - B[i]) > err) { - diffNum++; - } - } - EXPECT_EQ(0, diffNum); -} - -void checkTranspose(real* matrix, - real* transpose, - size_t width, - size_t matSize) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - size_t height = matSize / width; - int diffNum = 0; - size_t rowId = 0; - size_t colId = 0; - for (size_t i = 0; i < matSize; ++i) { - if (i % width == 0 && i) { - rowId++; - } - colId = i % width; - if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) { - diffNum++; - LOG(INFO) << i << " diff : " << matrix[i] << "\t" - << transpose[colId * height + rowId]; - } - } - EXPECT_EQ(0, diffNum); -} - -void compareOutput(ComData& fcData, ComData& selFcData) { - vector outArgsFc = fcData.outArgs; - vector outArgsSelfc = selFcData.outArgs; - - // check cost - LOG(INFO) << "Check cost"; - CpuMatrix fcCost(outArgsFc[0].value->getHeight(), - outArgsFc[0].value->getWidth()); - CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(), - outArgsSelfc[0].value->getWidth()); - fcCost.copyFrom(*outArgsFc[0].value); - selfcCost.copyFrom(*outArgsSelfc[0].value); - checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt()); - - // check selective fc output and fc output - LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " - << "with FullyConectedLayer"; - CpuMatrix fcOut(outArgsFc[1].value->getHeight(), - outArgsFc[1].value->getWidth()); - CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(), - outArgsSelfc[1].value->getWidth()); - - fcOut.copyFrom(*outArgsFc[1].value); - selfcOut.copyFrom(*outArgsSelfc[1].value); - checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt()); - - // check gradient math - vector& fcParam = fcData.parameters; - vector& selfcParam = selFcData.parameters; - for (size_t i = 0; i < fcParam.size(); ++i) { - ParameterPtr p1, p2; - p1 = fcParam[i]; - p2 = selfcParam[i]; - - string paramName = p1->getName(); - LOG(INFO) << "check parameter : " << paramName; - - // check parameter value - CpuVector paraValue1(p1->getSize()); - CpuVector paraValue2(p2->getSize()); - paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE)); - paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE)); - - // check gradient - CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT)); - CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT)); - if (paramName == "rand_fc_param.bias") { - checkMatrix( - paraValue1.getData(), paraValue2.getData(), paraValue1.getSize()); - checkMatrix( - paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize()); - } else { - checkTranspose(paraValue1.getData(), - paraValue2.getData(), - fcLayerWidth, - paraValue1.getSize()); - checkTranspose(paraGrad1.getData(), - paraGrad2.getData(), - fcLayerWidth, - paraGrad1.getSize()); - } - } -} - -void compareSparseMulOutput( - real* fcOutput, - real* selOutput, - size_t nnz, - const std::shared_ptr>>& selCols) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - size_t nnzCount = - std::accumulate(selCols->begin(), - selCols->end(), - 0UL, - [](size_t a, const std::pair& arr) { - return a + arr.second; - }); - EXPECT_EQ(nnz, nnzCount); - - size_t sampleNum = selCols->size(); - int diffNum = 0; - size_t count = 0; - for (size_t i = 0; i < sampleNum; ++i) { - for (size_t j = 0; j < (*selCols)[i].second; ++j) { - size_t selIdx = (*selCols)[i].first[j]; - if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) { - 
diffNum++; - LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx] - << "\t" << selOutput[count]; - } - count++; - } - } - EXPECT_EQ(0, diffNum); -} - -LayerPtr creatDataLayer(string name, - size_t batchSize, - size_t layerSize, - std::vector& values, - bool useGpu) { - LayerConfig dataConfig; - dataConfig.set_name(name); - dataConfig.set_type("data"); - dataConfig.set_size(layerSize); - LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); - - Argument data; - data.value = Matrix::create(batchSize, layerSize, false, useGpu); - data.value->copyFrom(values.data(), batchSize * layerSize); - - DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); - dataLayer->setData(data); - dataLayer->forward(PASS_TEST); - return layer; -} - -ParameterPtr creatParameter( - string name, int pid, size_t paraSize, string paramFile, bool useGpu) { - ParameterConfig paraConfig; - paraConfig.set_name(name); - paraConfig.set_size(paraSize); - - ParameterPtr parameter = - std::make_shared(paraConfig, useGpu, /*initialize */ false); - parameter->enableType(PARAMETER_VALUE); - parameter->randomize(); - parameter->setID(pid); - parameter->load(paramFile); - return parameter; -} - -LayerPtr initFcLayer(LayerPtr dataLayer, - LayerConfig layerConfig, - int dataLayerSize, - int fcLayerSize, - string paraName, - string paraFile, - bool useGpu) { - LayerMap layerMap; - ParameterMap parameterMap; - - layerMap[dataLayer->getName()] = dataLayer; - ParameterPtr para = creatParameter( - paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu); - parameterMap[para->getName()] = para; - - layerConfig.add_inputs(); - LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); - input.set_input_layer_name(dataLayer->getName()); - input.set_input_parameter_name(paraName); - - LayerPtr testLayer = Layer::create(layerConfig); - layerMap[testLayer->getName()] = testLayer; - - testLayer->setNeedGradient(false); - testLayer->init(layerMap, parameterMap); - return testLayer; -} - -#ifndef PADDLE_TYPE_DOUBLE -// The parameter file used in fc.conf and selective_fc.conf is float -TEST(Layer, SelectiveFcLayer_train_dense_mul) { - const string& fcConfig = "gserver/tests/SelectiveFcTest/conf/fc.conf"; - const string& fcConfigArgs = - "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; - const string& selFcConfig = - "gserver/tests/SelectiveFcTest/conf/selective_fc.conf"; - const string& selConfigArgs = - "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; - - for (auto useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) { - break; - } -#endif - LOG(INFO) << "FullyConnectedLayer forwardBackward()"; - ComData fcData; - calcOutput(fcData, fcConfig, fcConfigArgs, useGpu); - - LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()"; - ComData selFcData; - calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu); - compareOutput(fcData, selFcData); - } -} -#endif // PADDLE_TYPE_DOUBLE - -void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, - bool useGpu) { - FLAGS_use_gpu = useGpu; - size_t batchSize = 100; - size_t dataLayerSize = 512; - std::vector values(batchSize * dataLayerSize); - for (size_t j = 0; j < batchSize * dataLayerSize; ++j) { - values[j] = std::rand() / real(RAND_MAX); - } - LayerPtr dataLayer = - creatDataLayer("data", batchSize, dataLayerSize, values, useGpu); - - const string& selfcParaFile = - "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose"; - const string& selfcParaName = "rand_fc_param.w.transpose"; - - std::shared_ptr selfcLayer = - 
std::dynamic_pointer_cast( - initFcLayer(dataLayer, - config, - dataLayerSize, - fcLayerWidth, - selfcParaName, - selfcParaFile, - useGpu)); - - // create selected columns - std::shared_ptr>> selCols( - new std::vector>(batchSize)); - size_t maxNNZ = 30; - srand((size_t)(time(NULL))); - int total = 0; - while (total == 0) { - for (size_t i = 0; i < batchSize; ++i) { - size_t num = std::rand() % maxNNZ; - int* data = new int[num]; - randint(data, fcLayerWidth, num); - (*selCols)[i] = std::make_pair(data, num); - total += num; - } - } - selfcLayer->fillSelectiveData(selCols); - selfcLayer->forward(PASS_TEST); - - MatrixPtr outMatSelfc = selfcLayer->getOutputValue(); - CpuSparseMatrixPtr cpuOutMatSelfc( - new CpuSparseMatrix(outMatSelfc->getHeight(), - outMatSelfc->getWidth(), - outMatSelfc->getElementCnt())); - cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT); -#ifdef PADDLE_WITH_CUDA - if (useGpu) { - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } -#endif - real* outValueSelfc = cpuOutMatSelfc->getValue(); - size_t nnz = cpuOutMatSelfc->getElementCnt(); - - const string& fcParaFile = - "gserver/tests/SelectiveFcTest/model/rand_fc_param.w"; - const string& fcParaName = "rand_fc_param.w"; - LayerConfig fcLayerConfig; - fcLayerConfig.set_name("fc_layer"); - fcLayerConfig.set_type("fc"); - fcLayerConfig.set_active_type("linear"); - fcLayerConfig.set_size(fcLayerWidth); - - LayerPtr fcLayer = initFcLayer(dataLayer, - fcLayerConfig, - dataLayerSize, - fcLayerWidth, - fcParaName, - fcParaFile, - useGpu); - fcLayer->forward(PASS_TEST); - - MatrixPtr outMatFc = fcLayer->getOutputValue(); - MatrixPtr cpuOutMatFc( - new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth())); - cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT); -#ifdef PADDLE_WITH_CUDA - if (useGpu) { - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - } -#endif - real* outValueFc = cpuOutMatFc->getData(); - - compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols); - for (size_t i = 0; i < batchSize; ++i) { - delete[](*selCols)[i].first; - } -} - -#ifndef PADDLE_TYPE_DOUBLE -// The parameter file used in testSelectiveFcLayerTrainSparseMul is float -TEST(Layer, SelectiveFcLayer_train_sparse_mul) { - LayerConfig selLayerConfig; - selLayerConfig.set_name("sel_fc"); - selLayerConfig.set_type("selective_fc"); - selLayerConfig.set_active_type("linear"); - selLayerConfig.set_has_selected_colums(false); - selLayerConfig.set_selective_fc_pass_generation(true); - selLayerConfig.set_size(fcLayerWidth); - - testSelectiveFcLayerTrainSparseMul(selLayerConfig, false); -#ifdef PADDLE_WITH_CUDA - testSelectiveFcLayerTrainSparseMul(selLayerConfig, true); -#endif -} -#endif // PADDLE_TYPE_DOUBLE - -// TODO(dangqingqing) test multi threads after support in matrix -// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) { -// LayerConfig selLayerConfig; -// selLayerConfig.set_name("sel_fc"); -// selLayerConfig.set_type("selective_fc"); -// selLayerConfig.set_active_type("linear"); -// selLayerConfig.set_has_selected_colums(false); -// selLayerConfig.set_selective_fc_pass_generation(true); -// selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10); -// selLayerConfig.set_selective_fc_full_mul_ratio(1000); -// selLayerConfig.set_size(fcLayerWidth); -// SelectiveFcLayer_test(selLayerConfig, false); -// } - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - int ret = RUN_ALL_TESTS(); - return ret; -} diff --git 
a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp deleted file mode 100644 index 406ca63b6ee030a0882e05294d8d355d84531385..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/DataLayer.h" - -#include "LayerGradUtil.h" -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(gpu_id); -DECLARE_bool(thread_local_rand_use_global_seed); - -const int MAX_SEQ_NUM = 17; -const int MAX_SEQ_LEN = 23; -const int MAX_BEAM_SIZE = 13; - -const size_t SEED = (size_t)(time(NULL)); - -vector randSampling(real range, int n) { - CHECK_GE(range, n); - vector num(range); - iota(begin(num), end(num), 0.); - if (range == n) return num; - - random_shuffle(begin(num), end(num)); - num.resize(n); - sort(begin(num), end(num)); - return num; -} - -void genSeqInfo(vector& seqStartPos, vector& subSeqStartPos) { - seqStartPos.resize(1, 0); - subSeqStartPos.resize(1, 0); - - srand(SEED); - int seqNum = 1 + (rand() % MAX_SEQ_NUM); - for (int i = 0; i < seqNum; ++i) { - int subSeqNum = 1 + (rand() % MAX_SEQ_NUM); - for (int j = 0; j < subSeqNum; ++j) - subSeqStartPos.push_back(subSeqStartPos.back() + - (1 + (rand() % MAX_SEQ_LEN))); - seqStartPos.push_back(subSeqStartPos.back()); - } -} - -/* - generate start indices according to sequence start positions. - */ -void genStarts(vector& seqStartPos, - vector>& starts, - size_t beamSize) { - starts.clear(); - starts.resize(seqStartPos.size() - 1, vector(beamSize, -1.)); - - for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { - int seqLen = seqStartPos[i + 1] - seqStartPos[i]; - vector randStarts = - randSampling(seqLen, min(seqLen, static_cast(beamSize))); - copy(begin(randStarts), end(randStarts), begin(starts[i])); - } -} - -/* - generate end indices according to sequence start positions and start indices. - */ -void genEnds(vector& seqStartPos, - vector>& starts, - vector>& ends, - size_t beamSize) { - CHECK_EQ(seqStartPos.size() - 1, starts.size()); - ends.clear(); - ends.resize(seqStartPos.size() - 1, vector(beamSize, -1.)); - - for (size_t i = 0; i < starts.size(); ++i) { - for (size_t j = 0; j < starts[i].size(); ++j) { - int seqLen = seqStartPos[i + 1] - seqStartPos[i]; - CHECK_GE(seqLen - 1, starts[i][j]); - if (starts[i][j] == -1.) break; - if (starts[i][j] == (seqLen - 1)) { - ends[i][j] = starts[i][j]; - } else { - ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0]; - } - } - } -} - -void genTestData(vector& seqStartPos, - vector& subSeqStartPos, - vector>& starts, - vector>& ends, - bool hasSubseq) { - size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE); - genSeqInfo(seqStartPos, subSeqStartPos); - - genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize); - genEnds(hasSubseq ? 
subSeqStartPos : seqStartPos, starts, ends, beamSize); -} - -template -void flatten2dVector(vector>& inVec, vector& outVec) { - size_t totalSize{0}; - for (auto const& items : inVec) totalSize += items.size(); - outVec.reserve(totalSize); - - for (auto& items : inVec) - move(items.begin(), items.end(), back_inserter(outVec)); -} - -void testSeqSliceLayer(bool hasSubseq, - bool useGpu, - vector& seqStartPos, - vector& subSeqStartPos, - vector>& starts, - vector>& ends) { - // layer size is not crutial for this layer, - // so here use a small layer size in the unittest. - const size_t layerSize{4}; - TestConfig config; - config.layerConfig.set_type("seq_slice"); - config.layerConfig.set_size(layerSize); - - // add the first input - MatrixPtr seqInputPtr = - Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(), - layerSize, - false, - false); - seqInputPtr->randomizeUniform(); - - if (hasSubseq) { - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, - "seq_input", - seqInputPtr, - seqStartPos, - subSeqStartPos}); - } else { - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos}); - } - config.layerConfig.add_inputs(); - - // add start indices - if (starts.size()) { - vector startsToVec; - flatten2dVector(starts, startsToVec); - - MatrixPtr startMatrixPtr = - Matrix::create(starts.size(), starts[0].size(), false, false); - startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size()); - - config.inputDefs.push_back( - {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr}); - config.layerConfig.add_inputs(); - config.layerConfig.set_select_first(true); - } - - // add end indices - if (ends.size()) { - vector endsToVec; - flatten2dVector(ends, endsToVec); - - MatrixPtr endMatrixPtr = - Matrix::create(ends.size(), ends[0].size(), false, false); - endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size()); - - config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr}); - config.layerConfig.add_inputs(); - config.layerConfig.set_select_first(false); - } - - testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false); -} - -TEST(Layer, SeqSliceLayer) { - vector seqStartPos; - vector subSeqStartPos; - vector> starts; - vector> ends; - - std::vector mode = {false}; -#ifdef PADDLE_WITH_CUDA - mode.push_back(true); -#endif - genSeqInfo(seqStartPos, subSeqStartPos); - for (bool hasSubseq : {true, false}) { - LOG(INFO) << "hasSubSeq : " << hasSubseq; - genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq); - for (bool useGpu : mode) { - vector> tmp; - testSeqSliceLayer( - hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends); - testSeqSliceLayer( - hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp); - testSeqSliceLayer( - hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends); - } - } -} - -int main(int argc, char** argv) { - initMain(argc, argv); - hl_start(); - hl_init(FLAGS_gpu_id); - FLAGS_thread_local_rand_use_global_seed = true; - srand(1); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/gserver/tests/test_Upsample.cpp b/paddle/gserver/tests/test_Upsample.cpp deleted file mode 100644 index 39b902fcc75e71007f855e4e258e54ed8d40f16b..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_Upsample.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "LayerGradUtil.h" -#include "paddle/math/MathUtils.h" -#include "paddle/testing/TestUtil.h" - -void setPoolConfig(paddle::TestConfig* config, - paddle::PoolConfig* pool, - const string& poolType) { - (*config).biasSize = 0; - (*config).layerConfig.set_type("pool"); - (*config).layerConfig.set_num_filters(1); - - int kw = 2, kh = 2; - int pw = 0, ph = 0; - int sw = 2, sh = 2; - pool->set_pool_type(poolType); - pool->set_channels(2); - pool->set_size_x(kw); - pool->set_size_y(kh); - pool->set_start(0); - pool->set_padding(pw); - pool->set_padding_y(ph); - pool->set_stride(sw); - pool->set_stride_y(sh); - - int ow = - paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); - int oh = - paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); - pool->set_output_x(ow); - pool->set_output_y(oh); -} - -paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat, - const string& poolType, - bool use_gpu, - real* tempGradData) { - /* prepare maxPoolWithMaskLayer */ - paddle::TestConfig config; - config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0}); - paddle::LayerInputConfig* input = config.layerConfig.add_inputs(); - paddle::PoolConfig* pool = input->mutable_pool_conf(); - - pool->set_img_size(8); - pool->set_img_size_y(8); - setPoolConfig(&config, pool, "max-pool-with-mask"); - config.layerConfig.set_size(pool->output_x() * pool->output_y() * - pool->channels()); - - config.layerConfig.set_name("MaxPoolWithMask"); - - std::vector dataLayers; - paddle::LayerMap layerMap; - vector datas; - - initDataLayer(config, - &dataLayers, - &datas, - &layerMap, - "MaxPoolWithMask", - 1, - false, - use_gpu); - - dataLayers[0]->getOutputValue()->copyFrom(*inputMat); - - FLAGS_use_gpu = use_gpu; - std::vector parameters; - paddle::LayerPtr maxPoolingWithMaskOutputLayer; - initTestLayer(config, &layerMap, ¶meters, &maxPoolingWithMaskOutputLayer); - maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC); - - /* prepare the upsample layer */ - paddle::LayerConfig upsampleLayerConfig; - upsampleLayerConfig.set_type("upsample"); - paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs(); - upsampleLayerConfig.add_inputs(); - - paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf(); - upsampleConfig->set_scale(2); - paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf(); - imageConfig->set_channels(2); - imageConfig->set_img_size(4); - imageConfig->set_img_size_y(4); - upsampleLayerConfig.set_size(2 * 8 * 8); - upsampleLayerConfig.set_name("upsample"); - - for (size_t i = 0; i < 2; i++) { - paddle::LayerInputConfig& inputTemp = - *(upsampleLayerConfig.mutable_inputs(i)); - inputTemp.set_input_layer_name("MaxPoolWithMask"); - } - - paddle::LayerPtr upsampleLayer; - paddle::ParameterMap parameterMap; - upsampleLayer = paddle::Layer::create(upsampleLayerConfig); - layerMap[upsampleLayerConfig.name()] = upsampleLayer; - 
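[editor's note, not part of the patch] The deleted upsample test pairs a max-pool-with-mask layer (2 channels, 8x8 pooled down to 4x4 with a 2x2 kernel and stride 2) with an upsample layer of scale 2 that restores the 8x8 shape by scattering each pooled value back to the location recorded by the mask. A rough standalone sketch of that scatter step, assuming the mask stores flat output indices as floats (an illustration only, not the layer's actual implementation):

    #include <algorithm>
    #include <cstddef>

    // Scatter pooled values to the positions recorded by the pooling mask;
    // all other output positions stay zero.
    void upsampleWithMask(const float* pooled, const float* mask, float* out,
                          size_t pooledLen, size_t outLen) {
      std::fill(out, out + outLen, 0.0f);
      for (size_t i = 0; i < pooledLen; ++i) {
        size_t idx = static_cast<size_t>(mask[i]);
        if (idx < outLen) {
          out[idx] = pooled[i];
        }
      }
    }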
upsampleLayer->init(layerMap, parameterMap); - upsampleLayer->setNeedGradient(true); - upsampleLayer->forward(paddle::PASS_GC); - upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); - upsampleLayer->backward(); - - return upsampleLayer; -} - -TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { - bool useGpu = false; - paddle::MatrixPtr inputMat; - paddle::MatrixPtr inputGPUMat; - paddle::MatrixPtr tempGradMat; - - inputMat = paddle::Matrix::create(1, 128, false, useGpu); - inputMat->randomizeUniform(); - - tempGradMat = paddle::Matrix::create(1, 128, false, useGpu); - tempGradMat->randomizeUniform(); - real* tempGradData = tempGradMat->getData(); - - paddle::LayerPtr upsampleLayerCPU = - doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); - -#ifdef PADDLE_WITH_CUDA - useGpu = true; - real* data = inputMat->getData(); - inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu); - inputGPUMat->copyFrom(data, 128); - paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest( - inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); - paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value, - upsampleLayerGPU->getOutput("").value); - - paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), - upsampleLayerGPU->getPrev(0)->getOutputGrad()); -#endif -} diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp deleted file mode 100644 index f2299d7da2a51e4015793ae531af002aed1f6b2f..0000000000000000000000000000000000000000 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "ModelConfig.pb.h" -#include "paddle/gserver/layers/CTCLayer.h" -#include "paddle/gserver/layers/DataLayer.h" -#include "paddle/gserver/layers/Layer.h" -#include "paddle/gserver/layers/WarpCTCLayer.h" - -#include "paddle/testing/TestUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_bool(use_gpu); - -const real* getData(const Matrix& matrix) { - if (matrix.useGpu()) { - MatrixPtr cpuMatrix = Matrix::create( - matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); - cpuMatrix->copyFrom(matrix); - return cpuMatrix->getData(); - } else { - return matrix.getData(); - } -} - -int checkError(const Matrix& matrix1, const Matrix& matrix2) { - CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); - CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); - CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - - const real* data1 = getData(matrix1); - const real* data2 = getData(matrix2); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; - return count; -} - -void initArgument(size_t batchSize, - int layerSize, - bool useGpu, - Argument& data) { - data.value = Matrix::create(batchSize, layerSize, false, useGpu); - data.grad = Matrix::create(batchSize, layerSize, false, useGpu); - data.value->randomizeUniform(); - data.value->add(-0.5); - data.grad->zeroMem(); - - generateSequenceStartPositions(batchSize, data.sequenceStartPositions); -} - -LayerPtr createDataLayer( - string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("data"); - layerConfig.set_size(layerSize); - LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); - - DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); - dataLayer->setData(data); - dataLayer->forward(PASS_GC); - - return layer; -} - -LayerPtr createLabelLayer(string name, - size_t batchSize, - size_t numClasses, - bool useGpu) { - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("data"); - layerConfig.set_size(1); - LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); - - Argument data; - data.ids = IVector::create(batchSize, useGpu); - data.ids->rand(numClasses - 1); - - generateSequenceStartPositions(batchSize, data.sequenceStartPositions); - - DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); - labelLayer->setData(data); - labelLayer->forward(PASS_GC); - - return layer; -} - -LayerPtr createCTCLayer(string name, - size_t numClasses, - bool useGpu, - bool normByTimes, - LayerPtr dataLayer, - LayerPtr labelLayer) { - LayerMap layerMap; - layerMap[dataLayer->getName()] = dataLayer; - layerMap[labelLayer->getName()] = labelLayer; - - ParameterMap parameterMap; - - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("ctc"); - layerConfig.set_size(numClasses); - layerConfig.set_norm_by_times(normByTimes); - - layerConfig.add_inputs(); - LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); - input0.set_input_layer_name(dataLayer->getName()); - - layerConfig.add_inputs(); - LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); - 
input1.set_input_layer_name(labelLayer->getName()); - - LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); - layerMap[layer->getName()] = layer; - layer->init(layerMap, parameterMap); - - ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); - - softmaxActivation->forward(dataLayer->getOutput()).check(); - layer->forward(PASS_GC); - - layer->backward(); - softmaxActivation->backward(dataLayer->getOutput()).check(); - - return layer; -} - -LayerPtr createWarpCTCLayer(string name, - size_t numClasses, - bool useGpu, - bool normByTimes, - LayerPtr dataLayer, - LayerPtr labelLayer) { - LayerMap layerMap; - layerMap[dataLayer->getName()] = dataLayer; - layerMap[labelLayer->getName()] = labelLayer; - - ParameterMap parameterMap; - - LayerConfig layerConfig; - layerConfig.set_name(name); - layerConfig.set_type("warp_ctc"); - layerConfig.set_size(numClasses); - layerConfig.set_blank(numClasses - 1); - layerConfig.set_norm_by_times(normByTimes); - - layerConfig.add_inputs(); - LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); - input0.set_input_layer_name(dataLayer->getName()); - - layerConfig.add_inputs(); - LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); - input1.set_input_layer_name(labelLayer->getName()); - - LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); - layerMap[layer->getName()] = layer; - layer->init(layerMap, parameterMap); - - layer->forward(PASS_GC); - layer->backward(); - - return layer; -} - -TEST(Layer, WarpCTCLayer) { - for (auto layerSize : {10, 64}) { - for (auto batchSize : {1, 10, 32}) { - for (auto normByTimes : {false, true}) { - for (auto useGpu : {false, true}) { -#ifndef PADDLE_WITH_CUDA - if (useGpu) continue; -#endif - LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize - << " normByTimes = " << normByTimes << " useGpu=" << useGpu; - - FLAGS_use_gpu = useGpu; - - Argument data0; - initArgument(batchSize, layerSize, useGpu, data0); - - Argument data1; - data1.resizeAndCopyFrom(data0); - - LayerPtr dataLayer0 = - createDataLayer("data", batchSize, layerSize, useGpu, data0); - LayerPtr dataLayer1 = - createDataLayer("data", batchSize, layerSize, useGpu, data1); - - LayerPtr labelLayer = - createLabelLayer("label", batchSize, layerSize, useGpu); - - LayerPtr warpctcLayer = createWarpCTCLayer( - "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); - LayerPtr ctcLayer = createCTCLayer( - "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); - - /// Check cost - LOG(INFO) << "Check cost: " - << checkError(*(warpctcLayer->getOutput().value), - *(ctcLayer->getOutput().value)) - << " different elements."; - - /// Check gradients - LOG(INFO) << "Check gradients: " - << checkError(*(dataLayer0->getOutput().grad), - *(dataLayer1->getOutput().grad)) - << " different elements"; - } - } - } - } -} diff --git a/paddle/legacy/api/Arguments.cpp b/paddle/legacy/api/Arguments.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7bb5a6f75b9a8ab800fc74c6cc01c0b104ccdd5e --- /dev/null +++ b/paddle/legacy/api/Arguments.cpp @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" + +#include "paddle/legacy/parameter/Argument.h" + +size_t Arguments::getSlotNum() const { return m->outputs.size(); } + +Arguments* Arguments::createArguments(size_t slotNum) { + auto args = new Arguments(); + args->m->outputs.resize(slotNum); + return args; +} + +void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); } + +Arguments::Arguments() : m(new ArgumentsPrivate()) {} + +Arguments::~Arguments() { delete m; } + +Arguments* Arguments::createByPaddleArgumentVector(void* ptr) { + auto p = (std::vector*)(ptr); + auto args = new Arguments(); + args->m->outputs = *p; + return args; +} + +Arguments* Arguments::createByPaddleArgument(const void* ptr) { + auto p = (paddle::Argument*)(ptr); + auto args = new Arguments(); + args->m->outputs.push_back(*p); + return args; +} + +Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return Matrix::createByPaddleMatrixPtr(&a.value); +} + +Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return Matrix::createByPaddleMatrixPtr(&a.grad); +} + +IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return IVector::createByPaddleVectorPtr(&a.ids); +} + +Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return Matrix::createByPaddleMatrixPtr(&a.in); +} + +void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) { + auto& a = m->getArg(idx); + a.value = m->cast(mat->getSharedPtr()); +} + +void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) { + auto& a = m->getArg(idx); + a.grad = m->cast(mat->getSharedPtr()); +} + +void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) { + auto& a = m->getArg(idx); + a.in = m->cast(mat->getSharedPtr()); +} + +void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) { + auto& a = m->getArg(idx); + auto& v = m->cast(vec->getSharedPtr()); + a.ids = v; +} + +template +static inline void doCopyFromSafely(std::shared_ptr& dest, + std::shared_ptr& src) { + if (src) { + if (dest) { + dest->copyFrom(*src); + } else { + dest = src; + } + } +} + +IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const + throw(RangeError) { + auto& a = m->getArg(idx); + if (a.sequenceStartPositions) { + return IVector::createByPaddleVectorPtr( + &a.sequenceStartPositions->getMutableVector(false)); + } else { + return nullptr; + } +} + +IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const + throw(RangeError) { + auto& a = m->getArg(idx); + if (a.subSequenceStartPositions) { + return IVector::createByPaddleVectorPtr( + &a.subSequenceStartPositions->getMutableVector(false)); + } else { + return nullptr; + } +} + +void Arguments::setSlotSequenceStartPositions(size_t idx, + IVector* vec) throw(RangeError) { + auto& a = m->getArg(idx); + auto& v = m->cast(vec->getSharedPtr()); + a.sequenceStartPositions = std::make_shared(v); +} + +void Arguments::setSlotSubSequenceStartPositions( + size_t idx, IVector* vec) 
throw(RangeError) { + auto& a = m->getArg(idx); + auto& v = m->cast(vec->getSharedPtr()); + a.subSequenceStartPositions = std::make_shared(v); +} + +IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims); +} + +void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { + auto& a = m->getArg(idx); + a.cpuSequenceDims = m->cast(vec->getSharedPtr()); +} + +float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } + +int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return a.getBatchSize(); +} + +void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) { + auto& a = m->getArg(idx); + a.setFrameHeight(h); +} + +void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) { + auto& a = m->getArg(idx); + a.setFrameWidth(w); +} + +size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return a.getFrameHeight(); +} + +size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) { + auto& a = m->getArg(idx); + return a.getFrameWidth(); +} + +void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; } diff --git a/paddle/api/CMakeLists.txt b/paddle/legacy/api/CMakeLists.txt similarity index 100% rename from paddle/api/CMakeLists.txt rename to paddle/legacy/api/CMakeLists.txt diff --git a/paddle/legacy/api/ConfigParser.cpp b/paddle/legacy/api/ConfigParser.cpp new file mode 100644 index 0000000000000000000000000000000000000000..016d6da4e2e4ce888527fe9b61a163056d7729eb --- /dev/null +++ b/paddle/legacy/api/ConfigParser.cpp @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
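[editor's note, not part of the patch] The Arguments wrapper added above is a thin view over paddle's vector of Argument objects: slots are addressed by index and filled with Matrix/IVector wrappers. A hypothetical round trip through that API (values are made up, ownership and error handling omitted):

    #include <vector>
    #include "PaddleAPI.h"   // the header added later in this patch

    void fillAndReadSlot() {
      Arguments* args = Arguments::createArguments(1);        // one slot
      std::vector<float> data = {1, 2, 3, 4, 5, 6};
      Matrix* value = Matrix::createDense(data, /*height=*/2, /*width=*/3,
                                          /*useGpu=*/false);
      args->setSlotValue(0, value);            // slot 0 now shares this 2x3 value
      Matrix* readBack = args->getSlotValue(0);  // new wrapper around the same data
      int64_t batch = args->getBatchSize(0);     // 2, the value's height
      (void)readBack;
      (void)batch;
    }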
*/ + +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" +#include "paddle/legacy/trainer/Trainer.h" + +struct ParameterConfigPrivate { + paddle::ParameterPtr parameter; + paddle::ParameterConfig config; + + inline paddle::ParameterConfig* getConfigPtr() { + if (parameter != nullptr) { + auto& conf = parameter->getConfig(); + return const_cast(&conf); + } else { + return &config; + } + } +}; + +TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {} + +TrainerConfig::~TrainerConfig() { delete m; } + +TrainerConfig* TrainerConfig::createFromTrainerConfigFile( + const std::string& confPath) { + LOG(INFO) << "load trainer config from " << confPath; + auto conf = std::make_shared(confPath); + auto retv = new TrainerConfig(); + retv->m->conf = conf; + return retv; +} + +TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) { + auto retv = new TrainerConfig(); + paddle::TrainerConfig trainerConfigProto; + auto conf = std::make_shared(trainerConfigProto); + CHECK(conf->getMutableConfig().ParseFromString(str)); + retv->m->conf = conf; + return retv; +} + +ModelConfig::ModelConfig() : m(new ModelConfigPrivate()) {} + +ModelConfig::~ModelConfig() { delete m; } + +ModelConfig* TrainerConfig::getModelConfig() const { + auto retv = new ModelConfig(); + retv->m->conf = m->conf; + return retv; +} + +ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} + +ParameterConfig::~ParameterConfig() { delete m; } + +ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( + void* ptr) { + auto& p = *(paddle::ParameterPtr*)(ptr); + if (p != nullptr) { + auto conf = new ParameterConfig(); + conf->m->parameter = p; + return conf; + } else { + return nullptr; + } +} + +ParameterConfig* ParameterConfig::createParameterConfigFromParameterPtr( + void* ptr) { + auto& p = *(paddle::Parameter*)(ptr); + auto conf = new ParameterConfig(); + conf->m->config = p.getConfig(); + return conf; +} + +std::string ParameterConfig::toProtoString() const { + return m->getConfigPtr()->SerializeAsString(); +} + +void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } + +OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} + +OptimizationConfig::~OptimizationConfig() { delete m; } + +std::string OptimizationConfig::toProtoString() { + return m->getConfig().SerializeAsString(); +} + +OptimizationConfig* TrainerConfig::getOptimizationConfig() const { + auto opt_config = new OptimizationConfig(); + opt_config->m->trainer_config = m->conf; + return opt_config; +} + +OptimizationConfig* OptimizationConfig::createFromProtoString( + const std::string& str) { + auto conf = new OptimizationConfig(); + conf->m->config.ParseFromString(str); + return conf; +} diff --git a/paddle/api/Evaluator.cpp b/paddle/legacy/api/Evaluator.cpp similarity index 100% rename from paddle/api/Evaluator.cpp rename to paddle/legacy/api/Evaluator.cpp diff --git a/paddle/legacy/api/GradientMachine.cpp b/paddle/legacy/api/GradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5ad2fe11a4c668a318f76492f57091f386183986 --- /dev/null +++ b/paddle/legacy/api/GradientMachine.cpp @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" + +#include "Internal.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" + +std::vector GradientMachine::defaultParamTypes = { + PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}; + +GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {} + +GradientMachine::~GradientMachine() { delete m; } + +GradientMachine* GradientMachine::createFromPaddleModelPtr( + const void* confPtr, + GradientMatchineCreateMode mode, + const std::vector& types) { + auto& conf = *(const paddle::ModelConfig*)(confPtr); + std::vector realTypes; + staticCastVector(&realTypes, types); + auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes); + auto machinePtr = std::shared_ptr(machineRawPtr); + if (machinePtr != nullptr) { + auto machine = new GradientMachine(); + machine->m->machine = machinePtr; + return machine; + } else { + return nullptr; + } +} + +GradientMachine* GradientMachine::createByConfigProtoStr( + const std::string& protoStr, + GradientMatchineCreateMode mode, + const std::vector& types) { + paddle::ModelConfig conf; + conf.ParseFromString(protoStr); + if (conf.IsInitialized()) { + return GradientMachine::createFromPaddleModelPtr(&conf, mode, types); + } else { + return nullptr; + } +} + +GradientMachine* GradientMachine::createByModelConfig( + ModelConfig* conf, + GradientMatchineCreateMode mode, + const std::vector& types) { + auto confPtr = &conf->m->conf->getModelConfig(); + return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); +} + +void GradientMachine::start() { m->machine->start(); } + +void GradientMachine::finish() { m->machine->finish(); } + +void GradientMachine::onPassEnd() { m->machine->onPassEnd(); } + +void GradientMachine::prefetch(const Arguments& inArgs) { + auto& in = + m->cast>(inArgs.getInternalArgumentsPtr()); + m->machine->prefetch(in); +} + +void GradientMachine::forward(const Arguments& inArgs, + Arguments* outArgs, + PassType passType) { + auto& in = + m->cast>(inArgs.getInternalArgumentsPtr()); + auto& out = m->cast>( + outArgs->getInternalArgumentsPtr()); + paddle::PassType pt = (paddle::PassType)(passType); + m->machine->forward(in, &out, pt); +} + +UpdateCallback::~UpdateCallback() {} + +void UpdateCallback::apply(Parameter* p) { + // UNUSED(p); +} + +class UpdateCallbackWrapper { + public: + explicit UpdateCallbackWrapper(const UpdateCallback& callback) + : callback(const_cast(callback)) {} + + void operator()(paddle::Parameter* param) { + auto p = Parameter::createFromRawPtr(¶m); + // @TODO Use Stack variable instead. 
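[editor's note, not part of the patch] GradientMachine above is created from a parsed ModelConfig and driven with Arguments for input and output. A hedged sketch of a single test-mode forward pass through these wrappers: the config path is made up, the single-argument createByModelConfig call assumes the defaults declared in PaddleAPI.h (not shown in this hunk), PASS_TEST comes from the imported paddle enumerations, and input slot filling plus cleanup are omitted.

    #include "PaddleAPI.h"

    void runOneForwardPass() {
      TrainerConfig* tc =
          TrainerConfig::createFromTrainerConfigFile("trainer_config.conf");
      GradientMachine* gm =
          GradientMachine::createByModelConfig(tc->getModelConfig());

      Arguments* in = Arguments::createArguments(1);   // fill slot 0 before forward()
      Arguments* out = Arguments::createArguments(0);
      gm->start();
      gm->forward(*in, out, PASS_TEST);                // populates out's slots
      gm->finish();
    }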
+ callback.apply(p); + delete p; + } + + private: + UpdateCallback& callback; +}; + +void GradientMachine::backward(const UpdateCallback& callback) { + m->machine->backward(UpdateCallbackWrapper(callback)); +} + +void GradientMachine::forwardBackward(const Arguments& inArgs, + Arguments* outArgs, + PassType passType, + const UpdateCallback& callback) { + auto& in = + m->cast>(inArgs.getInternalArgumentsPtr()); + auto& out = m->cast>( + outArgs->getInternalArgumentsPtr()); + paddle::PassType pt = (paddle::PassType)(passType); + m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback)); +} + +void GradientMachine::loadParameters(const std::string& path) { + m->machine->loadParameters(path); +} + +size_t GradientMachine::getParameterSize() const { + return m->machine->getParameters().size(); +} + +Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { + auto params = m->machine->getParameters(); + if (i < params.size()) { + return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]); + } else { + throw RangeError(); + } +} + +size_t GradientMachine::getNonStaticParameterSize() const { + return m->machine->getNonStaticParameters().size(); +} + +Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) { + auto params = m->machine->getNonStaticParameters(); + if (i < params.size()) { + return Parameter::createFromSharedPtr( + &m->machine->getNonStaticParameters()[i]); + } else { + throw RangeError(); + } +} + +void GradientMachine::randParameters() { m->machine->randParameters(); } + +Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const + throw(UnsupportError) { + auto nn = m->machine; + if (nn) { + auto arg = nn->getLayerOutput(layerName); + return Arguments::createByPaddleArgument(&arg); + } else { + throw UnsupportError(); + } +} + +SequenceGenerator* GradientMachine::asSequenceGenerator( + const std::vector& dict, + size_t begin_id, + size_t end_id, + size_t max_length, + size_t beam_size) { + SequenceGenerator* r = + SequenceGenerator::createByGradientMachineSharedPtr(&m->machine); + r->setDict(dict); + r->setBos(begin_id); + r->setEos(end_id); + r->setMaxLength(max_length); + r->setBeamSize(beam_size); + return r; +} + +Evaluator* GradientMachine::makeEvaluator() { + auto ev = new Evaluator(); + ev->m->rawPtr = m->machine->makeEvaluator(); + return ev; +} + +void GradientMachine::eval(Evaluator* evaluator) { + m->machine->eval(evaluator->m->rawPtr); +} diff --git a/paddle/api/Internal.h b/paddle/legacy/api/Internal.h similarity index 100% rename from paddle/api/Internal.h rename to paddle/legacy/api/Internal.h diff --git a/paddle/legacy/api/Matrix.cpp b/paddle/legacy/api/Matrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8862d0ea92c92a2608b49c6b1315badae9e9fd98 --- /dev/null +++ b/paddle/legacy/api/Matrix.cpp @@ -0,0 +1,317 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/math/Matrix.h" +#include +#include +#include "PaddleAPI.h" +#include "paddle/legacy/math/CpuSparseMatrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +struct MatrixPrivate { + std::shared_ptr mat; +}; + +Matrix::Matrix() : m(new MatrixPrivate()) {} + +Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) { + auto* mat = reinterpret_cast(sharedPtr); + if ((*mat) != nullptr) { + auto m = new Matrix(); + m->m->mat = *mat; + return m; + } else { + return nullptr; + } +} + +Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { + auto m = new Matrix(); + m->m->mat = paddle::Matrix::create(height, width, useGpu); + m->m->mat->zero(); + return m; +} + +Matrix* Matrix::createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu) { + auto m = new Matrix(); + m->m->mat = paddle::Matrix::create(height, width, useGpu); + m->m->mat->copyFrom(data.data(), data.size()); + return m; +} + +Matrix* Matrix::createDenseFromNumpy(float* data, + int dim1, + int dim2, + bool copy, + bool useGpu) throw(UnsupportError) { + if (useGpu) { + /// Gpu mode only supports copy=True + if (!copy) { + throw UnsupportError("Gpu mode only supports copy=True"); + } + return Matrix::createGpuDenseFromNumpy(data, dim1, dim2); + } else { + return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy); + } +} + +Matrix* Matrix::createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, + bool copy) { + auto m = new Matrix(); + if (copy) { + m->m->mat = paddle::Matrix::create(dim1, dim2); + m->m->mat->copyFrom(data, dim1 * dim2); + } else { + m->m->mat = paddle::Matrix::create(data, dim1, dim2, false); + } + return m; +} + +Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { + auto m = new Matrix(); + m->m->mat = paddle::Matrix::create(dim1, dim2, false, true); + m->m->mat->copyFrom(data, dim1 * dim2); + return m; +} + +Matrix* Matrix::createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal, + bool isTrans, + bool useGpu) { + auto m = new Matrix(); + m->m->mat = paddle::Matrix::createSparseMatrix( + height, + width, + nnz, + isNonVal ? 
paddle::NO_VALUE : paddle::FLOAT_VALUE, + isTrans, + useGpu); + return m; +} + +Matrix::~Matrix() { delete m; } + +size_t Matrix::getHeight() const { return m->mat->getHeight(); } + +size_t Matrix::getWidth() const { return m->mat->getWidth(); } + +float Matrix::get(size_t x, size_t y) const throw(RangeError) { + if (x > this->getWidth() || y > this->getHeight()) { + RangeError e; + throw e; + } + return m->mat->getElement(x, y); +} + +void Matrix::set(size_t x, size_t y, float val) throw(RangeError, + UnsupportError) { + if (x > this->getWidth() || y > this->getHeight()) { + RangeError e; + throw e; + } + auto rawMat = m->mat.get(); + if (auto cDenseMat = dynamic_cast(rawMat)) { + *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val; + } else { + UnsupportError e; + throw e; + } +} + +bool Matrix::isSparse() const { + auto raw_mat = m->mat.get(); + return dynamic_cast(raw_mat) != nullptr || + dynamic_cast(raw_mat) != nullptr; +} + +SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) { + auto cpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (cpuSparseMat != nullptr) { + return (SparseValueType)cpuSparseMat->getValueType(); + } else { + auto gpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (gpuSparseMat != nullptr) { + return (SparseValueType)gpuSparseMat->getValueType(); + } else { + UnsupportError e; + throw e; + } + } +} + +SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) { + auto cpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (cpuSparseMat != nullptr) { + return (SparseFormatType)cpuSparseMat->getFormat(); + } else { + auto gpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (gpuSparseMat != nullptr) { + return SPARSE_CSR; + } else { + UnsupportError e; + throw e; + } + } +} + +IntArray Matrix::getSparseRowCols(size_t i) const + throw(UnsupportError, RangeError) { + auto cpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (cpuSparseMat != nullptr && + cpuSparseMat->getFormat() == paddle::SPARSE_CSR) { + if (i < cpuSparseMat->getHeight()) { + // cpuSparseMat->print(std::cout); + size_t len = cpuSparseMat->getColNum(i); + return IntArray(cpuSparseMat->getRowCols(i), len); + } else { + RangeError e; + throw e; + } + } else { + UnsupportError e; + throw e; + } +} + +IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const + throw(UnsupportError, RangeError) { + auto cpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (cpuSparseMat != nullptr && + cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) { + if (i < cpuSparseMat->getHeight()) { + return IntWithFloatArray(cpuSparseMat->getRowValues(i), + cpuSparseMat->getRowCols(i), + cpuSparseMat->getColNum(i)); + } else { + RangeError e; + throw e; + } + } else { + UnsupportError e; + throw e; + } +} + +FloatArray Matrix::getData() const { + auto rawMat = m->mat.get(); + if (dynamic_cast(rawMat->getMemoryHandle().get())) { + // is gpu. 
then copy data + float* data = rawMat->getData(); + size_t len = rawMat->getElementCnt(); + float* cpuData = new float[len]; + hl_memcpy_device2host(cpuData, data, len * sizeof(float)); + FloatArray ret_val(cpuData, len); + ret_val.needFree = true; + return ret_val; + } else { + FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt()); + return ret_val; + } +} + +void Matrix::sparseCopyFrom( + const std::vector& rows, + const std::vector& cols, + const std::vector& vals) throw(UnsupportError) { + auto cpuSparseMat = + std::dynamic_pointer_cast(m->mat); + if (cpuSparseMat != nullptr) { + // LOG(INFO) <<"RowSize = "<isSparse()) { + throw UnsupportError(); + } else { + *dim1 = m->mat->getHeight(); + *dim2 = m->mat->getWidth(); + *view_m_data = new float[(*dim1) * (*dim2)]; + if (auto cpuMat = dynamic_cast(m->mat.get())) { + auto src = cpuMat->getData(); + auto dest = *view_m_data; + std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); + } else if (auto gpuMat = dynamic_cast(m->mat.get())) { + auto src = gpuMat->getData(); + auto dest = *view_m_data; + hl_memcpy_device2host( + dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); + } else { + LOG(WARNING) << "Unexpected Situation"; + throw UnsupportError(); + } + } +} + +void Matrix::copyFromNumpyMat(float* data, + int dim1, + int dim2) throw(UnsupportError, RangeError) { + if (isSparse()) { + throw UnsupportError(); + } else { + if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) { + if (m->mat->getData() != data) { + m->mat->copyFrom(data, dim1 * dim2); + } + } else { + throw RangeError(); + } + } +} + +bool Matrix::isGpu() const { + auto rawPtr = m->mat.get(); + return dynamic_cast(rawPtr) != nullptr || + dynamic_cast(rawPtr) != nullptr; +} diff --git a/paddle/legacy/api/Paddle.i b/paddle/legacy/api/Paddle.i new file mode 100644 index 0000000000000000000000000000000000000000..7a1456a5c065821caa54fbf4a10f7ceda08780c0 --- /dev/null +++ b/paddle/legacy/api/Paddle.i @@ -0,0 +1,202 @@ +%module(directors="1") swig_paddle +%include "std_string.i" +%{ +#define SWIG_FILE_WITH_INIT +#include "legacy/api/PaddleAPI.h" +%} + +%include "exception.i" +%typemap(throws) UnsupportError %{ + SWIG_exception(SWIG_RuntimeError, $1.what()); + SWIG_fail; +%} + +%include "std_vector.i" +%include "std_pair.i" +#ifdef SWIGPYTHON +%include "numpy.i" +#endif + +%init %{ +#ifdef SWIGPYTHON +import_array(); +#endif +%} + + +namespace std { +%template(vector_int) vector; +%template(vector_uint) vector; +%template(vector_float) vector; +%template(vector_string) vector; +%template(vector_vec_star) vector; +} +#ifdef SWIGPYTHON +%typemap(in) (int argc, char** argv) { + int i = 0; + if (!PyList_Check($input)) { + PyErr_SetString(PyExc_ValueError, "Expecting a list"); + return NULL; + } + $1 = PyList_Size($input); + $2 = (char **) malloc(($1+1)*sizeof(char *)); + for (i = 0; i < $1; i++) { + PyObject *s = PyList_GetItem($input,i); + if (!PyString_Check(s)) { + free($2); + PyErr_SetString(PyExc_ValueError, "List items must be strings"); + return NULL; + } + $2[i] = PyString_AsString(s); + } + $2[i] = 0; +} +%typemap(freearg) (int argc, char** argv) { + if ($2) free($2); +} + +%typemap(out) FloatArray { + $result = PyList_New($1.length); + for (size_t i=0; i<$1.length; ++i) { + PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i])); + } + if($1.needFree) { + delete [] $1.buf; + } +} + +%typemap(out) IntArray { + $result = PyList_New($1.length); + for (size_t i=0; i<$1.length; ++i) { + PyList_SetItem($result, i, 
PyInt_FromLong($1.buf[i])); + } + if ($1.needFree) { + delete [] $1.buf; + } +} + +%typemap(out) IntWithFloatArray { + $result = PyList_New($1.length); + for (size_t i=0; i<$1.length; ++i) { + PyList_SetItem($result, i, PyTuple_Pack(2, + PyInt_FromLong($1.idxBuf[i]), + PyFloat_FromDouble($1.valBuf[i]) + )); + } + if ($1.needFree) { + delete [] $1.idxBuf; + delete [] $1.valBuf; + } +} + + +%rename(__getitem__) IVector::get; +%rename(__setitem__) IVector::set; +%rename(__len__) IVector::getSize; +%rename(__getitem__) Vector::get; +%rename(__setitem__) Vector::set; +%rename(__len__) Vector::getSize; +%rename(__len__) Parameter::getSize; +%rename(__call__) ParameterTraverseCallback::apply; +%rename(__repr__) Evaluator::toString; + +%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { + (float* data, int dim1, int dim2) +} + +%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { + (float** view_data, int* dim1, int* dim2) +} + +%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) { + (float** view_m_data, int* dim1, int* dim2) +} + +%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) { + (int** view_m_data, int* dim1) +} + +%apply (int* INPLACE_ARRAY1, int DIM1) { + (int* data, int dim) +} + +%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) { + (int** view_data, int* dim1) +} + +%apply (float* INPLACE_ARRAY1, int DIM1) { + (float* data, int dim) +} + +%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) { + (float** view_data, int* dim1) +} + +%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) { + (float** view_m_data, int* dim1) +} + +#endif +// The below functions internally create object by "new", so it should use +// use SWIG to handle gc. There are hints for SWIG to handle GC. +%newobject Matrix::createZero; +%newobject Matrix::createSparse; +%newobject Matrix::createDense; +%newobject Matrix::createDenseFromNumpy; +%newobject Matrix::createCpuDenseFromNumpy; +%newobject Matrix::createGpuDenseFromNumpy; +%newobject Vector::createZero; +%newobject Vector::create; +%newobject Vector::createVectorFromNumpy; +%newobject Vector::createCpuVectorFromNumpy; +%newobject Vector::createGpuVectorFromNumpy; +%newobject IVector::createZero; +%newobject IVector::create; +%newobject IVector::createVectorFromNumpy; +%newobject IVector::createCpuVectorFromNumpy; +%newobject IVector::createGpuVectorFromNumpy; +%newobject Trainer::createByCommandLine; +%newobject Trainer::getForwardOutput; +%newobject Trainer::getLayerOutput; +%newobject Arguments::getSlotValue; +%newobject Arguments::getSlotIds; +%newobject Arguments::getSlotIn; +%newobject Arguments::getSlotSequenceStartPositions; +%newobject Arguments::getSlotSequenceDim; +%newobject Arguments::createArguments; +%newobject GradientMachine::createByConfigProtoStr; +%newobject GradientMachine::createByModelConfig; +%newobject GradientMachine::asSequenceGenerator; +%newobject GradientMachine::getParameter; +%newobject GradientMachine::getLayerOutput; +%newobject GradientMachine::makeEvaluator; +%newobject TrainerConfig::createFromTrainerConfigFile; +%newobject TrainerConfig::getModelConfig; +%newobject TrainerConfig::getOptimizationConfig; +%newobject Parameter::getBuf; +%newobject Parameter::getConfig; +%newobject ParameterOptimizer::create; +%newobject ParameterOptimizer::needSpecialTraversal; +%newobject ParameterUpdater::createLocalUpdater; +%newobject ParameterUpdater::createRemoteUpdater; +%newobject ParameterUpdater::createNewRemoteUpdater; + +%feature("director") UpdateCallback; +%feature("autodoc", 1); // To generate method stub, for code hint in ide + +// Ignore 
many private class, and method cannot be handled by swig. +%ignore MatrixPrivate; +%ignore TrainerPrivate; +%ignore IVector::operator[]; +%ignore ArgumentsPrivate; +%ignore GradientMachinePrivate; +%ignore TrainerConfigPrivate; +%ignore ModelConfigPrivate; +%ignore ParameterPrivate; +%ignore SequenceGeneratorPrivate; +%ignore VectorPrivate; +%ignore ParameterConfigPrivate; +%ignore OptimizationConfigPrivate; +%ignore ParameterTraverseCallbackPrivate; +%include "legacy/utils/GlobalConstants.h" +%include "legacy/api/PaddleAPI.h" diff --git a/paddle/legacy/api/PaddleAPI.h b/paddle/legacy/api/PaddleAPI.h new file mode 100644 index 0000000000000000000000000000000000000000..475984a3d57ebc25d5d071c33b7e6562ac78c503 --- /dev/null +++ b/paddle/legacy/api/PaddleAPI.h @@ -0,0 +1,1054 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/GlobalConstants.h" + +/// Import PaddlePaddle's enumeration into global namespace. +using namespace paddle::enumeration_wrapper; // NOLINT + +/** + * @brief Initialize paddle. + * + * In python, this method should be invoked as + * @code + * import sys + * import paddle + * paddle.initPaddle(sys.argv) + * or you can change arguments as any list of str. + * @endcode + */ +void initPaddle(int argc, char** argv); + +/// Return FLAGS_use_gpu +bool isUsingGpu(); + +/// Set the Flags_use_gpu to the given parameter +void setUseGpu(bool useGpu); + +/// Return true if this py_paddle is compiled in GPU Version +bool isGpuVersion(); + +/// Return FLAGS_trainer_count +int getTrainerCount(); + +/// The Error of IO Operation. Such as file not found, etc. +class IOError {}; + +/// Out of range error +class RangeError {}; + +/// Not support Error, such as access GPU memory directly, etc. +class UnsupportError : public std::runtime_error { + public: + UnsupportError() : std::runtime_error(" ") {} + explicit UnsupportError(const std::string& message) + : std::runtime_error(message) {} +}; + +/// This type will map to python's list of float. +struct FloatArray { + const float* buf; + const size_t length; + bool needFree; // true if the buf is dynamic alloced. + FloatArray(const float* b, const size_t l); +}; + +/// This type will map to python's list of int +struct IntArray { + const int* buf; + const size_t length; + bool needFree; + IntArray(const int* b, const size_t l, bool f = false); +}; + +/// This type will map to python's list of (int, float) +struct IntWithFloatArray { + const float* valBuf; + const int* idxBuf; + const size_t length; + bool needFree; + IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false); +}; + +enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 }; + +enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 }; + +/** + * In Python, -1UL is hard to write. 
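[editor's note, not part of the patch] FloatArray, IntArray and IntWithFloatArray are lightweight views: buf and length describe a buffer owned elsewhere, and needFree tells the consumer (here, the SWIG typemaps in Paddle.i) whether the buffer was heap-allocated on the caller's behalf and must be deleted after conversion. Matrix::getData earlier in this patch uses exactly this pattern when it copies GPU data to a temporary host buffer. A minimal sketch of the convention; copyToFloatArray is a made-up helper, not part of the API:

    #include <algorithm>
    #include <cstddef>

    // Expose a host copy of 'src' and flag the buffer so the consumer
    // knows to run delete[] on it when done.
    FloatArray copyToFloatArray(const float* src, size_t len) {
      float* buf = new float[len];
      std::copy(src, src + len, buf);
      FloatArray arr(buf, len);
      arr.needFree = true;   // e.g. the SWIG typemap frees arr.buf after building the list
      return arr;
    }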
So define a const value used by python + * side. + */ +const size_t NO_SPARSE_ID = -1UL; + +struct MatrixPrivate; +class Matrix { + Matrix(); // User Cannot Create Matrix. + DISABLE_COPY(Matrix); + static Matrix* createByPaddleMatrixPtr(void* sharedPtr); + + public: + virtual ~Matrix(); + + /** + * Create A Matrix with height,width, which is filled by zero. + */ + static Matrix* createZero(size_t height, + size_t width, + bool useGpu = isUsingGpu()); + + /** + * Create Sparse Matrix. + * + * After create sparse, sparseCopyFrom can be used to fill matrix. + * + * @param nnz Number of non zero values. + * + * @note the default sparse type is SPARSE_CSR. + */ + static Matrix* createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal = true, + bool trans = false, + bool useGpu = isUsingGpu()); + + /** + * Create Dense Matrix. + * + * @param data list of float should be passed in python. + * @note the value will be copy into a new matrix. + */ + static Matrix* createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu = isUsingGpu()); + + static Matrix* createDenseFromNumpy( + float* data, + int dim1, + int dim2, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); + + /** + * Create Cpu Dense Matrix from numpy matrix, dtype=float32 + * + * @param data a numpy matrix. + * @param dim1 dimension of data. + * @param dim2 dimension of data. + * @param copy true if copy into a new matrix, false will create + * matrix inplace. copy = false should be used with extreme + * care because Matrix will share the memory with the given + * numpy array. If the numpy array object is no longer valid, + * the memory space will not be usable. + */ + static Matrix* createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, + bool copy = true); + + /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 + static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2); + + /** + * Cast to numpy matrix. + * + * @note This method take no parameter in python. + * @note This method in python will return a numpy matrix, not void. + * @note Only CpuDenseMatrix is supported. + * + * Example: + * @code + * import paddle + * m = paddle.Matrix.createZero(10,2) + * numpy_mat = m.toNumpyMat() + * @endcode + */ + void toNumpyMatInplace(float** view_data, + int* dim1, + int* dim2) throw(UnsupportError); + + /// Copy To numpy mat. + void copyToNumpyMat(float** view_m_data, + int* dim1, + int* dim2) throw(UnsupportError); + + /// Copy From Numpy Mat + void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, + RangeError); + + /// return true if this matrix is sparse. + bool isSparse() const; + + SparseValueType getSparseValueType() const throw(UnsupportError); + + SparseFormatType getSparseFormat() const throw(UnsupportError); + + IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError); + + IntWithFloatArray getSparseRowColsVal(size_t i) const + throw(UnsupportError, RangeError); + + size_t getHeight() const; + + size_t getWidth() const; + + float get(size_t x, size_t y) const throw(RangeError); + + void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError); + + /// return type is list of float + FloatArray getData() const; + + /** + * Copy from rows, cols, values. 
+ * + * if sparse_nonvalue, the values should be [] + */ + void sparseCopyFrom(const std::vector& rows, + const std::vector& cols, + const std::vector& values = + std::vector()) throw(UnsupportError); + + bool isGpu() const; + + private: + void* getSharedPtr() const; + + MatrixPrivate* m; + friend class Trainer; + friend class GradientMachine; + friend class Arguments; +}; + +struct VectorPrivate; +class Vector { + DISABLE_COPY(Vector); + Vector(); + static Vector* createByPaddleVectorPtr(void* ptr); + + void* getSharedPtr(); + + public: + ~Vector(); + + /// Create Vector filled with zero. + static Vector* createZero(size_t sz, bool useGpu = isUsingGpu()); + + /** + * Create Vector from list of float. + * + * It will create a new vector, and copy data into it. + */ + static Vector* create(const std::vector& data, + bool useGpu = isUsingGpu()); + + static Vector* createVectorFromNumpy( + float* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); + /** + * Create Cpu Vector from numpy array, which dtype=float32 + * + * If copy is false, it will create vector inplace. + */ + static Vector* createCpuVectorFromNumpy(float* data, + int dim, + bool copy = true); + + /// Create Gpu Vector from numpy array, which dtype=float32 + static Vector* createGpuVectorFromNumpy(float* data, int dim); + + /** + * copy from another vector + * throw(RangeError) if size of src vector is different from size of this + * vector + */ + void copyFrom(Vector* src) throw(RangeError); + + /// Cast to numpy array inplace. + void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError); + + /// Copy to numpy array. + void copyToNumpyArray(float** view_m_data, int* dim1); + + /// Copy from numpy array. + void copyFromNumpyArray(float* data, int dim); + + /// __getitem__ in python + float get(const size_t idx) const throw(RangeError, UnsupportError); + + /// __setitem__ in python + void set(const size_t idx, float val) throw(RangeError, UnsupportError); + + /// Return is GPU vector or not. + bool isGpu() const; + + /// Return a list of float, the memory is alloced and copied. + FloatArray getData() const; + + /// __len__ in python + size_t getSize() const; + + private: + VectorPrivate* m; + + private: + friend class Parameter; + friend class ParameterOptimizer; + friend struct ParameterTraverseCallbackPrivate; +}; + +struct IVectorPrivate; +class IVector { + IVector(); + DISABLE_COPY(IVector); + static IVector* createByPaddleVectorPtr(void* ptr); + + public: + /// Create IVector filled with zero + static IVector* createZero(size_t sz, bool useGpu = isUsingGpu()); + + /** + * Create IVector from list of int. + * It will create a new vector, and copy data into it. + */ + static IVector* create(const std::vector& data, + bool useGpu = isUsingGpu()); + + static IVector* createVectorFromNumpy( + int* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); + + /** + * Create Cpu IVector from numpy array, which dtype=int32 + * + * If copy is false, it will create vector inplace + */ + static IVector* createCpuVectorFromNumpy(int* data, + int dim, + bool copy = true); + /** + * Create Gpu IVector from numpy array, which dtype=int32 + */ + static IVector* createGpuVectorFromNumpy(int* data, int dim); + + /// Cast to numpy array inplace. + void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError); + + /// Copy to numpy array. + void copyToNumpyArray(int** view_m_data, int* dim1); + + /// Copy from numpy array. 
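+  /// Resizes this IVector to dim elements and copies dim ints from data.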
+ void copyFromNumpyArray(int* data, int dim); + + virtual ~IVector(); + + /// Return a list of int, the memory is alloced and copied. + IntArray getData() const; + + /// This method will map to python [] method. + int& operator[](const size_t idx) throw(RangeError, UnsupportError); + + const int& operator[](const size_t idx) const + throw(RangeError, UnsupportError); + + inline int get(const size_t idx) const throw(RangeError, UnsupportError) { + return (*this)[idx]; + } + + inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) { + (*this)[idx] = val; + } + + /// Return true if it is gpu vector. + bool isGpu() const; + + /// This method will map to python __len__(); + size_t getSize() const; + + private: + void* getSharedPtr() const; + + friend class Arguments; + IVectorPrivate* m; +}; + +struct ArgumentsPrivate; + +/// The Arguments is actual a std::vector in paddle. +class Arguments { + private: + Arguments(); // Internal Create. + DISABLE_COPY(Arguments); + + public: + /** + * Create a arguments with size. + * Note that it can be zero. + */ + static Arguments* createArguments(size_t slotNum); + + void resize(size_t slotNum); + + virtual ~Arguments(); + + /** + * Return the slot number that aguments contains. + * + * It is actually the vector's size + */ + size_t getSlotNum() const; + + /** + * The get functions of Arguments + * + * the param idx is the slot id + */ + Matrix* getSlotValue(size_t idx) const throw(RangeError); + Matrix* getSlotGrad(size_t idx) const throw(RangeError); + IVector* getSlotIds(size_t idx) const throw(RangeError); + Matrix* getSlotIn(size_t idx) const throw(RangeError); + IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError); + IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError); + IVector* getSlotSequenceDim(size_t idx) const throw(RangeError); + // End Of get functions of Arguments + + int64_t getBatchSize(size_t idx = 0) const throw(RangeError); + + /** + * The set functions of Arguments. + * + * The param idx is the slot id. + * The other param is the input Matrix or vector. + */ + void setSlotValue(size_t idx, Matrix* mat) throw(RangeError); + void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError); + void setSlotIn(size_t idx, Matrix* mat) throw(RangeError); + void setSlotIds(size_t idx, IVector* vec) throw(RangeError); + void setSlotSequenceStartPositions(size_t idx, + IVector* vec) throw(RangeError); + void setSlotSubSequenceStartPositions(size_t idx, + IVector* vec) throw(RangeError); + void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); + + /** + * Set the frame height of the idx-th Argument. + * + * @param ids The index of which Argument. + * @param h The height value. + */ + void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError); + + /** + * Set the frame height of the idx-th Argument. + * + * @param ids The index of which Argument. + * @param h The height value. 
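+   *
+   * (This overload sets the frame width; the value parameter is w, the width.)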
+ */ + void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError); + + size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError); + size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError); + + float sum() const; + + private: + static Arguments* createByPaddleArgumentVector(void* ptr); + static Arguments* createByPaddleArgument(const void* ptr); + void* getInternalArgumentsPtr() const; + + private: + ArgumentsPrivate* m; + friend class Trainer; + friend class GradientMachine; + friend class SequenceGenerator; +}; + +enum GradientMatchineCreateMode { + CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal, + CREATE_MODE_SGD_SPARSE_CPU_TRAINING = + paddle::GradientMachine::kSgdSparseCpuTraining, + CREATE_MODE_TESTING = paddle::GradientMachine::kTesting +}; + +struct ParameterConfigPrivate; +class ParameterConfig { + DISABLE_COPY(ParameterConfig); + ParameterConfig(); + + /** + * Internal methods + */ + static ParameterConfig* createParameterConfigFromParameterSharedPtr( + void* ptr); + static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr); + void* getRawPtr(); + + public: + ~ParameterConfig(); + + /** + * return proto buf string. + */ + std::string toProtoString() const; + + private: + ParameterConfigPrivate* m; + + private: + friend class Parameter; + friend class ParameterOptimizer; + friend struct ParameterTraverseCallbackPrivate; +}; + +struct OptimizationConfigPrivate; +class OptimizationConfig { + DISABLE_COPY(OptimizationConfig); + OptimizationConfig(); + + public: + static OptimizationConfig* createFromProtoString(const std::string& str); + ~OptimizationConfig(); + + /** + * return protobuf string. + */ + std::string toProtoString(); + + private: + OptimizationConfigPrivate* m; + + friend class TrainerConfig; + friend class ParameterOptimizer; + friend class ParameterUpdater; + friend class Trainer; +}; + +struct ParameterPrivate; +class Parameter { + private: + Parameter(); + DISABLE_COPY(Parameter); + + public: + virtual ~Parameter(); + + /** + * get parameter name + */ + std::string getName() const; + + /** + * get buf in Parameter + */ + Vector* getBuf(ParameterType type); + + /** + * get id + */ + size_t getID() const; + + ParameterConfig* getConfig(); + void setValueUpdated(); + + bool save(const std::string& filename) const; + + bool load(const std::string& filename) const; + + size_t getSize() const; + + private: + static Parameter* createFromRawPtr(void* ptr); + static Parameter* createFromSharedPtr(void* ptr); + + private: + ParameterPrivate* m; + friend class UpdateCallbackWrapper; + friend class GradientMachine; + friend class ParameterUpdater; +}; + +struct ModelConfigPrivate; +/** + * You can only get model config from TrainerConfig. + * + * It is used by GradientMachine. + */ +class ModelConfig { + private: + ModelConfig(); + DISABLE_COPY(ModelConfig); + + public: + virtual ~ModelConfig(); + + private: + ModelConfigPrivate* m; + friend class TrainerConfig; + friend struct TrainerConfigPrivate; + friend class GradientMachine; +}; + +struct TrainerConfigPrivate; +/** + * To get TrainerConfig from file. + * + * It is used by GradientMachine. 
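+ *
+ * A minimal sketch (hypothetical config path; assumes the swig-generated
+ * py_paddle module):
+ * @code
+ * conf = paddle.TrainerConfig.createFromTrainerConfigFile('./trainer_config.conf')
+ * model_conf = conf.getModelConfig()
+ * opt_conf = conf.getOptimizationConfig()
+ * @endcode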
+ */ +class TrainerConfig { + private: + TrainerConfig(); + DISABLE_COPY(TrainerConfig); + + public: + virtual ~TrainerConfig(); + + static TrainerConfig* createFromTrainerConfigFile( + const std::string& configPath); + static TrainerConfig* createFromProtoString(const std::string& str); + + ModelConfig* getModelConfig() const; + + OptimizationConfig* getOptimizationConfig() const; + + private: + TrainerConfigPrivate* m; + friend class Trainer; +}; + +/** + * The callback in backword. + * + * You can inherit this class in python. + * + * @code + * class UpdateCallbackInPython(paddle.UpdateCallback): + * def __init__(self): + * paddle.UpdateCallback.__init__(self) + * + * def apply(self, param): + * assert isinstance(param, paddle.Parameter) + * @endcode + */ +class UpdateCallback { + public: + virtual ~UpdateCallback(); + virtual void apply(Parameter* p); +}; + +struct ParameterTraverseCallbackPrivate; +class ParameterTraverseCallback { + DISABLE_COPY(ParameterTraverseCallback); + ParameterTraverseCallback(); + + public: + ~ParameterTraverseCallback(); + + void apply(const std::vector& vecs, + const ParameterConfig& config, + size_t sparseId); + + private: + ParameterTraverseCallbackPrivate* m; + friend class ParameterOptimizer; +}; + +/** + * The ParameterOptimizer Wrapper Class. + * + * Basically same as common/ParameterOptimizer.h + */ +struct ParameterOptimizerPrivate; +class ParameterOptimizer { + DISABLE_COPY(ParameterOptimizer); + ParameterOptimizer(); + + public: + static ParameterOptimizer* create(OptimizationConfig* config); + + ~ParameterOptimizer(); + + void init(size_t numRows, const ParameterConfig* config); + + void startPass(); + + void finishPass(); + + void startBatch(size_t numSamplesProcessed); + + void finishBatch(); + + void update(const std::vector& vecs, + const ParameterConfig& conf, + size_t sparseId = NO_SPARSE_ID); + + std::vector getParameterTypes() const; + + ParameterTraverseCallback* needSpecialTraversal( + const ParameterConfig& config) const; + + private: + ParameterOptimizerPrivate* m; +}; + +class SequenceGenerator; +class Evaluator; +struct GradientMachinePrivate; +class GradientMachine { + private: + GradientMachine(); + DISABLE_COPY(GradientMachine); + + public: + virtual ~GradientMachine(); + + /** + * Create By ProtoStr. + * + * The ProtoStr can be generate by python's protobuf code. + */ + static GradientMachine* createByConfigProtoStr( + const std::string& protoStr, + GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, + const std::vector& parameterTypes = defaultParamTypes); + + /** + * Create by ModelConfig object. + * + * To get ModelConfig, you can get TrainerConfig from config file, then get + * model config by TrainerConfig + */ + static GradientMachine* createByModelConfig( + ModelConfig* conf, + GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, + const std::vector& parameterTypes = defaultParamTypes); + + /** + * @brief finish + */ + void finish(); + + void start(); + + /** + * Prefetch row ids of sparse parameter. + */ + void prefetch(const Arguments& inArgs); + + /** + * Do some thing when train pass ended. + */ + void onPassEnd(); + + /** + * The forward stage of GradientMachine. + * + * @note the outArgs could be zero length arguemnts. + * @note THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL. + */ + void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType); + + /** + * The backward stage of GradientMachine. 
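+   * Propagates gradients back through the network for the most recent
+   * forward() pass.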
+ * + * @note Currently the ParameterUpdater is not wrapped in SWIG, so backward + * cannot actually train a network. But you can write a update callback to + * change the parameter or implement a ParameterUpdater in python side. + */ + void backward(const UpdateCallback& callback = UpdateCallback()); + + /** + * Combine forward/backward + */ + void forwardBackward(const Arguments& inArgs, + Arguments* outArgs, + PassType passType, + const UpdateCallback& callback = UpdateCallback()); + + void loadParameters(const std::string& path); + + size_t getParameterSize() const; + Parameter* getParameter(size_t i) throw(RangeError); + + size_t getNonStaticParameterSize() const; + Parameter* getNonStaticParameter(size_t i) throw(RangeError); + + void randParameters(); + + Arguments* getLayerOutput(const std::string& layerName) const + throw(UnsupportError); + + /** + * Create a sequence generator. + * + * @note It just like a paddle_gen_sequence. + */ + SequenceGenerator* asSequenceGenerator( + const std::vector& dict = std::vector(), + size_t begin_id = 0UL, + size_t end_id = 0UL, + size_t max_length = 100UL, + size_t beam_size = -1UL); + + Evaluator* makeEvaluator(); + + void eval(Evaluator* evaluator); + + private: + GradientMachinePrivate* m; + + static GradientMachine* createFromPaddleModelPtr( + const void* confPtr, + GradientMatchineCreateMode mode, + const std::vector& types); + + // Not to use c++ 11 init-list, so we use static var as function default arg. + static std::vector defaultParamTypes; + friend class Trainer; + friend class ParameterUpdater; +}; + +struct ParameterUpdaterPrivate; +class ParameterUpdater { + private: + ParameterUpdater(); + + public: + static ParameterUpdater* createLocalUpdater(OptimizationConfig* config); + static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config, + int passCount, + bool useSparseUpdater); + static ParameterUpdater* createNewRemoteUpdater( + OptimizationConfig* config, + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError); + ~ParameterUpdater(); + + /** + * @brief initialize Parameter Updater by GradientMachine. + * @param gm + */ + void init(const GradientMachine& gm); + + /** + * @brief begin of a training/testing of one pass. + */ + void startPass(); + + /** + * @brief end of a traning/testing of one pass. + */ + void finishPass(); + + /** + * @brief begin of a training/testing of one batch. + * @param data batch's size + * @return PassType, mostly will be training. + */ + PassType startBatch(size_t batchSize); + + /** + * @brief end of a traning/testing of one batch + * @param cost current batch cost. + */ + void finishBatch(float cost); + + /** + * @brief update a parameter (by local optimizer or by cluster pserver) + * @param param + */ + void update(Parameter* param); + + /** + * @breif only get required sparse rows by default. + * @param fullSize: get full matrix parameter if *fullSize* set + * @param apply: get PARAMETER_APPLY on pserver if *apply* set + */ + void getParametersRemote(bool fullSize = false, bool apply = false); + + /** + * @brief restore the average parameter. + * @note It is only used in AverageOptimizer. Restore will get the current + * PARAMETER_VALUE back. + */ + void restore(); + + /** + * @brief apply. Store the average parameter. + * @note It is only used in AverageOptimizer. Apply will store the current + * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save + * it to PARAMETER_VALUE. 
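+   *
+   * A typical flow is: call apply() before evaluation so the averaged
+   * parameters are used, then restore() afterwards to switch back to the
+   * current PARAMETER_VALUE and continue training.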
+ */ + void apply(); + + /** + * @brief catchUpWith The Regularization will be delayed in many situations( + * pserver, local sparse). Catch Up means catch the regularization up, apply + * regularization to all params. + */ + void catchUpWith(); + + private: + ParameterUpdaterPrivate* m; +}; + +struct EvaluatorPrivate; +class Evaluator { + private: + Evaluator(); + DISABLE_COPY(Evaluator); + + public: + ~Evaluator(); + + /** + * @brief begin an evaluate stage. + */ + void start(); + + /** + * @brief end an evaluate stage. + */ + void finish(); + + /** + * @brief toString will get a evaluate result. + * + * __repr__ method in python + */ + std::string toString(); + + std::vector getNames() const; + + double getValue(const std::string name) const; + + private: + EvaluatorPrivate* m; + + friend class GradientMachine; +}; + +struct TrainerPrivate; +class Trainer { + private: + TrainerPrivate* m; + Trainer(); + Trainer(TrainerConfig* optConfig, GradientMachine* gm); + DISABLE_COPY(Trainer); + + public: + virtual ~Trainer(); + + /// Create A Trainer By TrainerConfig. using paddle command line. + static Trainer* createByCommandLine() throw(IOError); + + static Trainer* create(TrainerConfig* optConfig, + GradientMachine* gm) throw(IOError); + + /// Start training + void startTrain(); + + /// Finish training + void finishTrain(); + + /// Start a pass. + void startTrainPass(); + + /// Finish a pass + void finishTrainPass(); + + /** + * Train one batch, + * + * @return true if all batch finished. + */ + bool trainOneBatch(size_t batchSize); + + void trainOneDataBatch(size_t batchSize, const Arguments& args); + + void startTestPeriod(); + void testOneDataBatch(size_t batchSize, const Arguments& args); + void finishTestPeriod(); + + void forwardOneBatch(size_t batchSize); + + Arguments* getForwardOutput(); + + Arguments* getLayerOutput(const std::string& layerName) const; +}; + +/// the N-Best results generated from one input sequence. +class ISequenceResults { + public: + virtual ~ISequenceResults(); + + /// Number of result. + virtual size_t getSize() const = 0; + + /** + * Get sentence from dictionary. + * + * @param id the index of result. + * @param split if true, the return sentence will be splited with ' ' by + * each word. Default is false. + */ + virtual std::string getSentence(size_t id, bool split = false) const + throw(RangeError) = 0; + virtual std::vector getSequence(size_t id) const throw(RangeError) = 0; + virtual float getScore(size_t id) const throw(RangeError) = 0; +}; + +struct SequenceGeneratorPrivate; +class SequenceGenerator { + DISABLE_COPY(SequenceGenerator); + SequenceGenerator(); + + public: + virtual ~SequenceGenerator(); + + /** + * Generate Sequence by input. + * + * @note The inArgs is just one sequence of data. + * @note The return will get a N-best generate result by inArgs. + * Sort by score. 
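+   *
+   * A minimal sketch (hypothetical dictionary and ids; assumes the
+   * swig-generated py_paddle module and a GradientMachine named machine):
+   * @code
+   * gen = machine.asSequenceGenerator(word_dict, bos_id, eos_id, 100, 5)
+   * results = gen.generateSequence(in_args)
+   * for i in xrange(results.getSize()):
+   *     print results.getScore(i), results.getSentence(i, True)
+   * @endcode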
+ */ + ISequenceResults* generateSequence(const Arguments& inArgs) const; + + void setDict(const std::vector& dict); + void setBos(size_t bos); + void setEos(size_t eos); + void setMaxLength(size_t maxlength); + void setBeamSize(size_t beamSize); + + private: + static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr); + friend class GradientMachine; + + private: + SequenceGeneratorPrivate* m; +}; diff --git a/paddle/legacy/api/PaddleAPIPrivate.h b/paddle/legacy/api/PaddleAPIPrivate.h new file mode 100644 index 0000000000000000000000000000000000000000..3ee192c31d597c4b4575e4a53a4aece09e642831 --- /dev/null +++ b/paddle/legacy/api/PaddleAPIPrivate.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "PaddleAPI.h" +#include "paddle/legacy/gserver/evaluators/Evaluator.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/parameter/ParameterUpdaterBase.h" +#include "paddle/legacy/trainer/TrainerConfigHelper.h" + +struct GradientMachinePrivate { + std::shared_ptr machine; + + template + inline T& cast(void* ptr) { + return *(T*)(ptr); + } +}; + +struct OptimizationConfigPrivate { + std::shared_ptr trainer_config; + paddle::OptimizationConfig config; + + const paddle::OptimizationConfig& getConfig() { + if (trainer_config != nullptr) { + return trainer_config->getOptConfig(); + } else { + return config; + } + } +}; + +struct TrainerConfigPrivate { + std::shared_ptr conf; + TrainerConfigPrivate() {} +}; + +struct ModelConfigPrivate { + std::shared_ptr conf; +}; + +struct ArgumentsPrivate { + std::vector outputs; + + inline paddle::Argument& getArg(size_t idx) throw(RangeError) { + if (idx < outputs.size()) { + return outputs[idx]; + } else { + RangeError e; + throw e; + } + } + + template + std::shared_ptr& cast(void* rawPtr) const { + return *(std::shared_ptr*)(rawPtr); + } +}; + +struct ParameterUpdaterPrivate { + std::unique_ptr updater; +}; + +struct ParameterPrivate { + std::shared_ptr sharedPtr; + paddle::Parameter* rawPtr; // rawPtr only used in ParameterUpdater, + // in other situation sharedPtr should + // contains value. + + ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {} + + paddle::Parameter* getPtr() { + if (sharedPtr) { + return sharedPtr.get(); + } else { + return rawPtr; + } + } +}; + +struct EvaluatorPrivate { + paddle::Evaluator* rawPtr; + + EvaluatorPrivate() : rawPtr(nullptr) {} + ~EvaluatorPrivate() { delete rawPtr; } +}; diff --git a/paddle/legacy/api/Parameter.cpp b/paddle/legacy/api/Parameter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f05740eb750cccd8cfb6cbc826a04585ec06822e --- /dev/null +++ b/paddle/legacy/api/Parameter.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/parameter/Parameter.h" +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" + +Parameter::Parameter() : m(new ParameterPrivate()) {} + +Parameter::~Parameter() { delete m; } + +Parameter* Parameter::createFromRawPtr(void* ptr) { + auto p = new Parameter(); + p->m->rawPtr = *static_cast(ptr); + return p; +} + +Parameter* Parameter::createFromSharedPtr(void* ptr) { + auto& p = *(paddle::ParameterPtr*)(ptr); + if (p == nullptr) { + return nullptr; + } else { + auto retParam = new Parameter(); + retParam->m->sharedPtr = p; + return retParam; + } +} + +std::string Parameter::getName() const { return m->getPtr()->getName(); } + +Vector* Parameter::getBuf(ParameterType type) { + auto buf = m->getPtr()->getBuf(type); + return Vector::createByPaddleVectorPtr(&buf); +} + +ParameterConfig* Parameter::getConfig() { + if (m->sharedPtr) { + return ParameterConfig::createParameterConfigFromParameterSharedPtr( + &m->sharedPtr); + } else { + return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr); + } +} + +size_t Parameter::getID() const { return m->getPtr()->getID(); } + +void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); } + +bool Parameter::save(const std::string& filename) const { + return m->getPtr()->save(filename); +} + +bool Parameter::load(const std::string& filename) const { + return m->getPtr()->load(filename); +} + +size_t Parameter::getSize() const { return m->getPtr()->getSize(); } diff --git a/paddle/legacy/api/ParameterOptimizer.cpp b/paddle/legacy/api/ParameterOptimizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..477d9dae44362f9073639093c3c4d1cf0ac12044 --- /dev/null +++ b/paddle/legacy/api/ParameterOptimizer.cpp @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/parameter/ParameterOptimizer.h" +#include +#include "Internal.h" +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" + +struct ParameterOptimizerPrivate { + std::unique_ptr optimizer; +}; + +struct ParameterTraverseCallbackPrivate { + paddle::ParameterOptimizer::TraverseCallback callback; + + ParameterTraverseCallbackPrivate() {} + + ParameterTraverseCallbackPrivate( + const paddle::ParameterOptimizer::TraverseCallback& callback) + : callback(callback) {} + + void apply(const std::vector& vecs, + const ParameterConfig& conf, + size_t sparseId) { + std::vector real_vecs; + real_vecs.resize(vecs.size()); + std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) { + if (v) { + return *(paddle::VectorPtr*)(v->getSharedPtr()); + } else { + return paddle::VectorPtr(); + } + }); + + paddle::ParameterConfig& real_conf = + *(paddle::ParameterConfig*)(const_cast(conf) + .getRawPtr()); + callback(real_vecs.data(), real_conf, sparseId); + } +}; + +ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} + +ParameterOptimizer::~ParameterOptimizer() { delete m; } + +ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { + CHECK(config != nullptr); + auto retOptimizer = new ParameterOptimizer(); + retOptimizer->m->optimizer.reset( + paddle::ParameterOptimizer::create(config->m->getConfig(), false)); + return retOptimizer; +} + +void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) { + auto& conf = *(paddle::ParameterConfig*)(const_cast(config) + ->getRawPtr()); + m->optimizer->init(numRows, &conf); +} + +void ParameterOptimizer::startPass() { m->optimizer->startPass(); } + +void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); } + +void ParameterOptimizer::startBatch(size_t numSamplesProcessed) { + constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1); + CHECK_EQ(numSamplesProcessed & high_1, 0UL); // Safely cast. 
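+  // high_1 is the most significant bit of size_t; the CHECK above rejects
+  // values that would overflow the signed int64_t narrowing cast below.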
+ m->optimizer->startBatch((int64_t)numSamplesProcessed); +} + +void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); } + +void ParameterOptimizer::update(const std::vector& vecs, + const ParameterConfig& conf, + size_t sparseId) { + ParameterTraverseCallbackPrivate invoker( + [&](const paddle::VectorPtr _vecs[], + const paddle::ParameterConfig& config, + size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); + invoker.apply(vecs, conf, sparseId); +} + +std::vector ParameterOptimizer::getParameterTypes() const { + std::vector returnValue; + staticCastVector(&returnValue, m->optimizer->getParameterTypes()); + return returnValue; +} + +ParameterTraverseCallback::ParameterTraverseCallback() + : m(new ParameterTraverseCallbackPrivate()) {} + +ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } + +void ParameterTraverseCallback::apply(const std::vector& vecs, + const ParameterConfig& conf, + size_t sparseId) { + m->apply(vecs, conf, sparseId); +} + +ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal( + const ParameterConfig& config) const { + auto& param_config = + *(paddle::ParameterConfig*)const_cast(config) + .getRawPtr(); + auto callback = m->optimizer->needSpecialTraversal(param_config); + if (callback) { + auto retCallback = new ParameterTraverseCallback(); + retCallback->m->callback = callback; + return retCallback; + } else { + return nullptr; + } +} diff --git a/paddle/legacy/api/ParameterUpdater.cpp b/paddle/legacy/api/ParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..44af3f4635f2bda07d0079faff0bbc1ec7ed3954 --- /dev/null +++ b/paddle/legacy/api/ParameterUpdater.cpp @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "PaddleAPI.h" + +#include "PaddleAPIPrivate.h" +#ifndef PADDLE_WITHOUT_GOLANG +#include "paddle/legacy/trainer/NewRemoteParameterUpdater.h" +#endif +#include "paddle/legacy/trainer/RemoteParameterUpdater.h" +#include "paddle/legacy/trainer/ThreadParameterUpdater.h" + +ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {} + +ParameterUpdater *ParameterUpdater::createLocalUpdater( + OptimizationConfig *config) { + auto updater = new ParameterUpdater(); + updater->m->updater.reset( + new paddle::SgdThreadUpdater(config->m->getConfig())); + return updater; +} + +ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( + OptimizationConfig *config, + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError) { +#ifndef PADDLE_WITHOUT_GOLANG + auto updater = new ParameterUpdater(); + updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( + config->m->getConfig(), pserverSpec, useEtcd)); + return updater; +#else + throw UnsupportError("not compiled with WITH_GOLANG"); +#endif +} + +ParameterUpdater *ParameterUpdater::createRemoteUpdater( + OptimizationConfig *config, int passCount, bool useSparseUpdater) { + auto updater = new ParameterUpdater(); + auto remoteUpdater = new paddle::RemoteParameterUpdater( + config->m->getConfig(), passCount, nullptr); + if (useSparseUpdater) { + std::unique_ptr remoteUpdaterPtr(remoteUpdater); + auto sparseRemoteUpdater = + new paddle::SparseRemoteParameterUpdaterComposite( + config->m->getConfig(), + passCount, + false, + std::move(remoteUpdaterPtr)); + updater->m->updater.reset(sparseRemoteUpdater); + } else { + updater->m->updater.reset(remoteUpdater); + } + return updater; +} + +ParameterUpdater::~ParameterUpdater() { delete m; } + +void ParameterUpdater::init(const GradientMachine &gm) { + m->updater->init(gm.m->machine->getNonStaticParameters()); +} + +void ParameterUpdater::startPass() { m->updater->startPass(); } + +void ParameterUpdater::finishPass() { m->updater->finishPass(); } + +PassType ParameterUpdater::startBatch(size_t batchSize) { + return m->updater->startBatch((int64_t)batchSize); +} + +void ParameterUpdater::finishBatch(float cost) { + m->updater->finishBatch(cost); +} + +void ParameterUpdater::update(Parameter *param) { + auto paddleParam = param->m->getPtr(); + m->updater->update(paddleParam); +} + +void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) { + m->updater->getParametersRemote(fullSize, apply); +} + +void ParameterUpdater::restore() { m->updater->restore(); } + +void ParameterUpdater::apply() { m->updater->apply(); } + +void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); } diff --git a/paddle/legacy/api/SequenceGenerator.cpp b/paddle/legacy/api/SequenceGenerator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a73228f6d4770d9be31defd7a5dc217fc5c21f2 --- /dev/null +++ b/paddle/legacy/api/SequenceGenerator.cpp @@ -0,0 +1,242 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "PaddleAPI.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/legacy/utils/Flags.h" + +// used to represent partial sequence +struct Path { + std::vector ids; + float logProb; + paddle::MachineState machineState; + + Path() { logProb = 0; } + + Path(std::vector& ids, float logProb, paddle::MachineState& machineState) + : ids(ids), logProb(logProb), machineState(machineState) {} + + bool operator<(const Path& other) const { return (logProb > other.logProb); } +}; + +// Return top k (k == beam_size) optimal paths using beam search. The last +// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer +// as output and outArgs thus stores top k labels and their probabilities per +// position +static void findNBest(paddle::GradientMachine* gradMachine, + std::vector& inArgs, + std::vector& finalPaths, + size_t bos_id, + size_t eos_id, + size_t max_length) { + std::vector paths; + Path emptyPath; + paths.push_back(emptyPath); + finalPaths.clear(); + gradMachine->resetState(); + paddle::Argument feedback = inArgs.back(); + feedback.ids->setElement(0, (int)(bos_id)); + float minFinalPathLogProb = 0; + size_t beam = 0; + int id; + std::vector outArgs; + while (true) { // iterate over each generated word + std::vector newPaths; + paddle::MachineState machineState; + for (size_t j = 0; j < paths.size(); j++) { + Path& path = paths[j]; + if (path.machineState.size() > 0) { + gradMachine->setState(path.machineState); + feedback.ids->setElement(0, path.ids.back()); + } + gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST); + gradMachine->getState(machineState); + beam = outArgs[0].ids->getSize(); + for (size_t k = 0; k < beam; k++) { + id = outArgs[0].ids->getElement(k); + float prob = outArgs[0].in->getElement(0, k); + std::vector nids(path.ids); + nids.push_back(id); + float newLogProb = path.logProb + log(prob); + Path newPath(nids, newLogProb, machineState); + if (id == (int)eos_id || nids.size() >= max_length) { + finalPaths.push_back(newPath); + if (minFinalPathLogProb > newPath.logProb) { + minFinalPathLogProb = newPath.logProb; + } + } else { + newPaths.push_back(newPath); + } + } + } + + if (newPaths.size() == 0) { + break; + } + std::nth_element(newPaths.begin(), + newPaths.begin() + std::min(beam, newPaths.size()), + newPaths.end()); + if (newPaths.size() > beam) { + newPaths.resize(beam); + } + // pathA < pathB means pathA.logProb > pathB.logProb + float maxPathLogProb = + std::min_element(newPaths.begin(), newPaths.end())->logProb; + if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) { + break; + } + paths = newPaths; + } // end while + + std::partial_sort(finalPaths.begin(), + finalPaths.begin() + std::min(beam, finalPaths.size()), + finalPaths.end()); + if (finalPaths.size() > beam) { + finalPaths.resize(beam); + } +} + +struct SequenceGeneratorPrivate { + std::shared_ptr machine; + std::shared_ptr> dict; + size_t beginPos; + size_t endPos; + size_t maxLength; + + paddle::Argument feedback; + + template + inline T& cast(void* ptr) { + return *(T*)(ptr); + } + + inline void findNBest(std::vector& inArgs, + std::vector& path) { + ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength); + } + + SequenceGeneratorPrivate() + : dict(std::make_shared>()), + beginPos(0UL), + endPos(0UL), + 
maxLength(0UL), + feedback(__create_feedback__()) {} + + private: + static paddle::Argument __create_feedback__() { + paddle::Argument feedback; + feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu); + + feedback.sequenceStartPositions = + paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false); + feedback.sequenceStartPositions->getMutableData(false)[0] = 0; + feedback.sequenceStartPositions->getMutableData(false)[1] = 1; + return feedback; + } +}; + +SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {} + +SequenceGenerator::~SequenceGenerator() { delete m; } + +class PathSequenceResults : public ISequenceResults { + // ISequenceResults interface + public: + PathSequenceResults(const std::shared_ptr>& path, + const std::shared_ptr>& dict) + : path_(path), dict_(dict) {} + + size_t getSize() const { return path_->size(); } + std::string getSentence(size_t id, bool split) const throw(RangeError) { + if (id < getSize()) { + Path& p = (*path_)[id]; + std::ostringstream sout; + std::transform(p.ids.begin(), + p.ids.end(), + std::ostream_iterator(sout, split ? " " : ""), + [&](int id) { return (*dict_)[id]; }); + return sout.str(); + } else { + RangeError e; + throw e; + } + } + std::vector getSequence(size_t id) const throw(RangeError) { + if (id < getSize()) { + Path& p = (*path_)[id]; + return p.ids; + } else { + RangeError e; + throw e; + } + } + float getScore(size_t id) const throw(RangeError) { + if (id < getSize()) { + Path& p = (*path_)[id]; + return p.logProb; + } else { + RangeError e; + throw e; + } + } + + private: + std::shared_ptr> path_; + std::shared_ptr> dict_; +}; + +ISequenceResults* SequenceGenerator::generateSequence( + const Arguments& inArgs) const { + auto& in_args = + m->cast>(inArgs.getInternalArgumentsPtr()); + for (auto& arg : in_args) { + arg.sequenceStartPositions = m->feedback.sequenceStartPositions; + } + in_args.push_back(m->feedback); + auto path = std::make_shared>(); + m->findNBest(in_args, *path); + return new PathSequenceResults(path, m->dict); +} + +SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr( + void* ptr) { + SequenceGenerator* r = new SequenceGenerator(); + r->m->machine = r->m->cast>(ptr); + return r; +} + +void SequenceGenerator::setDict(const std::vector& dict) { + *m->dict = dict; +} + +void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; } + +void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; } + +void SequenceGenerator::setMaxLength(size_t maxLength) { + m->maxLength = maxLength; +} + +void SequenceGenerator::setBeamSize(size_t beamSize) { + if (beamSize != -1UL) { + FLAGS_beam_size = beamSize; + } +} + +ISequenceResults::~ISequenceResults() {} diff --git a/paddle/legacy/api/Trainer.cpp b/paddle/legacy/api/Trainer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7c607201b0b946a6d6b2f3da35356e2c4e5e15e --- /dev/null +++ b/paddle/legacy/api/Trainer.cpp @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" + +#include +#include +#include + +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/trainer/ParamUtil.h" +#include "paddle/legacy/trainer/Trainer.h" +#include "paddle/legacy/trainer/TrainerInternal.h" +#include "paddle/legacy/utils/Flags.h" + +using paddle::real; + +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_int32(start_pass); + +struct TrainerPrivate : public paddle::Trainer { + bool _trainOneBatch(size_t batchSize); + bool forwardOneBatch(size_t batchSize); + void forwardOneDataBatch(const std::vector& inArgs); + void setBatchSize(size_t batchSize); + std::vector& getForwardOutput(); + + void startTestPeriod(); + void finishTestPeriod(); + void testOneDataBatch(const paddle::DataBatch& dataBatch); + TrainerPrivate() : paddle::Trainer() {} +}; + +Trainer::Trainer() : m(new TrainerPrivate()) { + auto conf = paddle::TrainerConfigHelper::createFromFlags(); + if (conf != nullptr) { + m->init(conf); + } +} + +Trainer::~Trainer() { delete m; } + +Trainer* Trainer::createByCommandLine() throw(IOError) { + auto retv = new Trainer(); + if (retv->m->getConfig().IsInitialized()) { + return retv; + } else { + throw IOError(); + } +} + +Trainer::Trainer(TrainerConfig* config, GradientMachine* gm) + : m(new TrainerPrivate()) { + m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr); +} + +Trainer* Trainer::create(TrainerConfig* config, + GradientMachine* gm) throw(IOError) { + auto retv = new Trainer(config, gm); + if (retv->m->getConfig().IsInitialized()) { + return retv; + } else { + retv->m->getConfig().CheckInitialized(); + throw IOError(); + } +} + +void Trainer::startTrain() { m->startTrain(); } + +void Trainer::finishTrain() { m->finishTrain(); } + +void Trainer::startTrainPass() { m->startTrainPass(); } + +void Trainer::finishTrainPass() { m->finishTrainPass(); } + +void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) { + paddle::DataBatch dataBatch; + dataBatch.getStreams() = inArgs.m->outputs; + dataBatch.setSize(batchSize); + m->trainOneDataBatch(dataBatch); +} + +bool Trainer::trainOneBatch(size_t batchSize) { + return m->_trainOneBatch(batchSize); +} + +bool TrainerPrivate::_trainOneBatch(size_t batchSize) { + paddle::DataBatch dataBatch; + CHECK(dataProvider_) << "data_provider is not specified"; + int num = dataProvider_->getNextBatch(batchSize, &dataBatch); + if (num == 0) { + return false; + } + trainOneDataBatch(dataBatch); + return false; +} + +void TrainerPrivate::startTestPeriod() { + if (!tester_) { + createTester(); + } + tester_->startTestPeriod(); +} + +void Trainer::startTestPeriod() { m->startTestPeriod(); } + +void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) { + tester_->testOneDataBatch(dataBatch, &forwardOutput_); +} + +void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) { + paddle::DataBatch dataBatch; + dataBatch.getStreams() = args.m->outputs; + dataBatch.setSize(batchSize); + m->testOneDataBatch(dataBatch); +} + +void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); } +void Trainer::finishTestPeriod() { m->finishTestPeriod(); } + +Arguments* Trainer::getLayerOutput(const std::string& layerName) const { + auto nn = this->m->getGradientMachine(); + CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork"; + auto arg = 
nn->getLayerOutput(layerName); + return Arguments::createByPaddleArgument(&arg); +} + +void Trainer::forwardOneBatch(size_t batchSize) { + m->forwardOneBatch(batchSize); +} + +bool TrainerPrivate::forwardOneBatch(size_t batchSize) { + CHECK(dataProvider_) << "data_provider is not specified"; + paddle::DataBatch dataBatch; + int num = dataProvider_->getNextBatch(batchSize, &dataBatch); + if (num == 0) { + return false; + } + + forwardOneDataBatch(dataBatch.getStreams()); + return true; +} + +void TrainerPrivate::forwardOneDataBatch( + const std::vector& inArgs) { + std::vector& outArgs = forwardOutput_; + + if (config_->getOptConfig().use_sparse_remote_updater()) { + trainerInternal_.getGradientMachine()->prefetch(inArgs); + trainerInternal_.getParameterUpdater()->getParametersRemote(); + } + trainerInternal_.getGradientMachine()->forward( + inArgs, &outArgs, paddle::PASS_TEST); +} + +Arguments* Trainer::getForwardOutput() { + return Arguments::createByPaddleArgumentVector(&m->getForwardOutput()); +} + +std::vector& TrainerPrivate::getForwardOutput() { + return forwardOutput_; +} diff --git a/paddle/legacy/api/Util.cpp b/paddle/legacy/api/Util.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b458c4d90ecc7333066f887dcbc93c4da5c43853 --- /dev/null +++ b/paddle/legacy/api/Util.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PaddleAPI.h" + +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Util.h" + +#include +#include +#include + +void initPaddle(int argc, char** argv) { + paddle::initMain(argc, argv); + paddle::initPython(argc, argv); + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); +} + +FloatArray::FloatArray(const float* b, const size_t l) + : buf(b), length(l), needFree(false) {} + +IntArray::IntArray(const int* b, const size_t l, bool f) + : buf(b), length(l), needFree(f) {} + +IntWithFloatArray::IntWithFloatArray(const float* v, + const int* i, + size_t l, + bool f) + : valBuf(v), idxBuf(i), length(l), needFree(f) {} + +bool isUsingGpu() { return FLAGS_use_gpu; } + +void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; } + +bool isGpuVersion() { +#ifndef PADDLE_WITH_CUDA + return false; +#else + return true; +#endif +} + +int getTrainerCount() { return FLAGS_trainer_count; } + +static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES, + "The Parameter Type should be same in core/api and core/common"); diff --git a/paddle/legacy/api/Vector.cpp b/paddle/legacy/api/Vector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..73b6d3a15d6d0ddc80a17846604d9500d8f7e4e3 --- /dev/null +++ b/paddle/legacy/api/Vector.cpp @@ -0,0 +1,304 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PaddleAPI.h" + +#include "paddle/legacy/math/Vector.h" + +#include + +struct IVectorPrivate { + paddle::IVectorPtr vec; +}; + +IVector::IVector() : m(new IVectorPrivate()) {} + +IVector* IVector::createZero(size_t sz, bool useGpu) { + auto v = new IVector(); + v->m->vec = paddle::IVector::create(sz, useGpu); + v->m->vec->zeroMem(); + return v; +} + +IVector* IVector::create(const std::vector& data, bool useGpu) { + auto v = new IVector(); + v->m->vec = paddle::IVector::create(data.size(), useGpu); + v->m->vec->copyFrom(data.data(), data.size()); + return v; +} + +IVector* IVector::createVectorFromNumpy(int* data, + int dim, + bool copy, + bool useGpu) throw(UnsupportError) { + if (useGpu) { + /// if use gpu only copy=true is supported + if (!copy) { + throw UnsupportError("Gpu mode only supports copy=True"); + } + return IVector::createGpuVectorFromNumpy(data, dim); + } else { + return IVector::createCpuVectorFromNumpy(data, dim, copy); + } +} + +IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) { + auto v = new IVector(); + if (copy) { + v->m->vec = paddle::IVector::create(dim, false); + v->m->vec->copyFrom(data, dim); + } else { + v->m->vec = paddle::IVector::create(data, dim, false); + } + return v; +} + +IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) { + auto v = new IVector(); + v->m->vec = paddle::IVector::create(dim, true); + v->m->vec->copyFrom(data, dim); + return v; +} + +bool IVector::isGpu() const { + return dynamic_cast(m->vec.get()) != nullptr; +} + +IntArray IVector::getData() const { + if (this->isGpu()) { + int* src = m->vec->getData(); + size_t len = m->vec->getSize(); + int* dest = new int[len]; + hl_memcpy_device2host(dest, src, len * sizeof(int)); + return IntArray(dest, len, true); + } else { + return IntArray(m->vec->getData(), m->vec->getSize()); + } +} + +int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) { + if (this->isGpu()) { + UnsupportError e; + throw e; + } else { + if (idx >= m->vec->getSize()) { + RangeError e; + throw e; + } + } + return m->vec->getData()[idx]; +} + +const int& IVector::operator[](const size_t idx) const + throw(RangeError, UnsupportError) { + return (*const_cast(this))[idx]; +} + +IVector* IVector::createByPaddleVectorPtr(void* ptr) { + auto* p = (paddle::IVectorPtr*)ptr; + if ((*p) != nullptr) { + IVector* vec = new IVector(); + vec->m->vec = *p; + return vec; + } else { + return nullptr; + } +} + +IVector::~IVector() { delete m; } + +void* IVector::getSharedPtr() const { return &m->vec; } + +size_t IVector::getSize() const { return m->vec->getSize(); } + +void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) { + auto v = std::dynamic_pointer_cast(m->vec); + if (v) { + *data = v->getData(); + *dim1 = v->getSize(); + } else { + throw UnsupportError(); + } +} + +void IVector::copyToNumpyArray(int** view_m_data, int* dim1) { + *dim1 = m->vec->getSize(); + *view_m_data = new int[*dim1]; + if (auto cpuVec = 
dynamic_cast(m->vec.get())) { + std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1)); + } else if (auto gpuVec = dynamic_cast(m->vec.get())) { + hl_memcpy_device2host( + *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1)); + } else { + LOG(INFO) << "Unexpected situation"; + } +} + +void IVector::copyFromNumpyArray(int* data, int dim) { + m->vec->resize(dim); + m->vec->copyFrom(data, dim); +} + +struct VectorPrivate { + paddle::VectorPtr vec; + + void safeAccessData(const size_t idx, + const std::function& func) const + throw(RangeError, UnsupportError) { + auto cpuVec = std::dynamic_pointer_cast(vec); + if (cpuVec != nullptr) { + if (idx < vec->getSize()) { + func(vec->getData()[idx]); + } else { + throw RangeError(); + } + } else { + throw UnsupportError(); + } + } +}; + +Vector::Vector() : m(new VectorPrivate()) {} + +Vector::~Vector() { delete m; } + +Vector* Vector::createZero(size_t sz, bool useGpu) { + auto retVec = new Vector(); + retVec->m->vec = paddle::Vector::create(sz, useGpu); + retVec->m->vec->zero(); + return retVec; +} + +Vector* Vector::create(const std::vector& data, bool useGpu) { + auto retVec = new Vector(); + retVec->m->vec = paddle::Vector::create(data.size(), useGpu); + retVec->m->vec->copyFrom(data.data(), data.size()); + return retVec; +} + +Vector* Vector::createByPaddleVectorPtr(void* ptr) { + auto& v = *(paddle::VectorPtr*)(ptr); + if (v == nullptr) { + return nullptr; + } else { + auto retVec = new Vector(); + retVec->m->vec = v; + return retVec; + } +} + +Vector* Vector::createVectorFromNumpy(float* data, + int dim, + bool copy, + bool useGpu) throw(UnsupportError) { + if (useGpu) { + /// if use gpu only copy=True is supported + if (!copy) { + throw UnsupportError("Gpu mode only supports copy=True"); + } + return Vector::createGpuVectorFromNumpy(data, dim); + } else { + return Vector::createCpuVectorFromNumpy(data, dim, copy); + } +} + +Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { + CHECK_GT(dim, 0); + auto retVec = new Vector(); + if (copy) { + retVec->m->vec = paddle::Vector::create((size_t)dim, false); + retVec->m->vec->copyFrom(data, dim); + } else { + retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false); + } + return retVec; +} + +Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) { + CHECK_GT(dim, 0); + auto retVec = new Vector(); + retVec->m->vec = paddle::Vector::create((size_t)dim, true); + retVec->m->vec->copyFrom(data, (size_t)dim); + return retVec; +} + +void Vector::toNumpyArrayInplace(float** view_data, + int* dim1) throw(UnsupportError) { + auto v = std::dynamic_pointer_cast(m->vec); + if (v != nullptr) { + *view_data = v->getData(); + *dim1 = (int)v->getSize(); + } else { + throw UnsupportError(); + } +} + +void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { + *dim1 = m->vec->getSize(); + *view_m_data = new float[*dim1]; + if (auto cpuVec = dynamic_cast(m->vec.get())) { + std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); + } else if (auto gpuVec = dynamic_cast(m->vec.get())) { + hl_memcpy_device2host( + *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1)); + } else { + LOG(INFO) << "Unexpected situation"; + } +} + +void Vector::copyFromNumpyArray(float* data, int dim) { + m->vec->resize(dim); + m->vec->copyFrom(data, dim); +} + +FloatArray Vector::getData() const { + if (this->isGpu()) { + float* src = m->vec->getData(); + size_t len = m->vec->getSize(); + float* dest = new float[len]; + hl_memcpy_device2host(dest, src, len * 
sizeof(float)); + FloatArray ret_val(dest, len); + ret_val.needFree = true; + return ret_val; + } else { + FloatArray ret_val(m->vec->getData(), m->vec->getSize()); + return ret_val; + } +} + +void Vector::copyFrom(Vector* src) throw(RangeError) { + if (src->m->vec->getSize() != m->vec->getSize()) { + throw RangeError(); + } + m->vec->copyFrom(*src->m->vec); +} + +bool Vector::isGpu() const { + return std::dynamic_pointer_cast(m->vec) != nullptr; +} + +float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { + float r; + m->safeAccessData(idx, [&](float& o) { r = o; }); + return r; +} + +void Vector::set(const size_t idx, float val) throw(RangeError, + UnsupportError) { + m->safeAccessData(idx, [&](float& o) { o = val; }); +} + +size_t Vector::getSize() const { return m->vec->getSize(); } + +void* Vector::getSharedPtr() { return &m->vec; } diff --git a/paddle/api/__init__.py b/paddle/legacy/api/__init__.py similarity index 100% rename from paddle/api/__init__.py rename to paddle/legacy/api/__init__.py diff --git a/paddle/api/numpy.i b/paddle/legacy/api/numpy.i similarity index 100% rename from paddle/api/numpy.i rename to paddle/legacy/api/numpy.i diff --git a/paddle/api/test/.gitignore b/paddle/legacy/api/test/.gitignore similarity index 100% rename from paddle/api/test/.gitignore rename to paddle/legacy/api/test/.gitignore diff --git a/paddle/api/test/CMakeLists.txt b/paddle/legacy/api/test/CMakeLists.txt similarity index 100% rename from paddle/api/test/CMakeLists.txt rename to paddle/legacy/api/test/CMakeLists.txt diff --git a/paddle/api/test/testArguments.py b/paddle/legacy/api/test/testArguments.py similarity index 100% rename from paddle/api/test/testArguments.py rename to paddle/legacy/api/test/testArguments.py diff --git a/paddle/api/test/testGradientMachine.py b/paddle/legacy/api/test/testGradientMachine.py similarity index 100% rename from paddle/api/test/testGradientMachine.py rename to paddle/legacy/api/test/testGradientMachine.py diff --git a/paddle/api/test/testMatrix.py b/paddle/legacy/api/test/testMatrix.py similarity index 100% rename from paddle/api/test/testMatrix.py rename to paddle/legacy/api/test/testMatrix.py diff --git a/paddle/api/test/testTrain.py b/paddle/legacy/api/test/testTrain.py similarity index 100% rename from paddle/api/test/testTrain.py rename to paddle/legacy/api/test/testTrain.py diff --git a/paddle/api/test/testTrainConfig.py b/paddle/legacy/api/test/testTrainConfig.py similarity index 100% rename from paddle/api/test/testTrainConfig.py rename to paddle/legacy/api/test/testTrainConfig.py diff --git a/paddle/api/test/testTrainer.py b/paddle/legacy/api/test/testTrainer.py similarity index 100% rename from paddle/api/test/testTrainer.py rename to paddle/legacy/api/test/testTrainer.py diff --git a/paddle/api/test/testVector.py b/paddle/legacy/api/test/testVector.py similarity index 100% rename from paddle/api/test/testVector.py rename to paddle/legacy/api/test/testVector.py diff --git a/paddle/api/test/util.py b/paddle/legacy/api/test/util.py similarity index 100% rename from paddle/api/test/util.py rename to paddle/legacy/api/test/util.py diff --git a/paddle/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp similarity index 100% rename from paddle/capi/Arguments.cpp rename to paddle/legacy/capi/Arguments.cpp diff --git a/paddle/capi/CMakeLists.txt b/paddle/legacy/capi/CMakeLists.txt similarity index 100% rename from paddle/capi/CMakeLists.txt rename to paddle/legacy/capi/CMakeLists.txt diff --git a/paddle/legacy/capi/Main.cpp 
b/paddle/legacy/capi/Main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..17d8f00a88a9fd0818e6b90f8f6888b7d793a46e --- /dev/null +++ b/paddle/legacy/capi/Main.cpp @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <fenv.h> +#include <stdlib.h> +#include <string.h> +#include <vector> +#include "capi_private.h" +#include "main.h" +#include "paddle/legacy/trainer/TrainerConfigHelper.h" +#include "paddle/legacy/utils/Excepts.h" +#include "paddle/legacy/utils/PythonUtil.h" + +static void initPaddle(int argc, char** argv) { + paddle::initMain(argc, argv); + paddle::initPython(argc, argv); +} + +extern "C" { +paddle_error paddle_init(int argc, char** argv) { + static bool isInit = false; + if (isInit) return kPD_NO_ERROR; + + std::vector<char*> realArgv; + realArgv.reserve(argc + 1); + realArgv.push_back(strdup("")); + for (int i = 0; i < argc; ++i) { + realArgv.push_back(argv[i]); + } + initPaddle(argc + 1, realArgv.data()); + free(realArgv[0]); + isInit = true; + return kPD_NO_ERROR; +} + +paddle_error paddle_init_thread() { + if (FLAGS_use_gpu) { + hl_init(FLAGS_gpu_id); + } + return kPD_NO_ERROR; +} +} diff --git a/paddle/capi/Matrix.cpp b/paddle/legacy/capi/Matrix.cpp similarity index 100% rename from paddle/capi/Matrix.cpp rename to paddle/legacy/capi/Matrix.cpp diff --git a/paddle/capi/Vector.cpp b/paddle/legacy/capi/Vector.cpp similarity index 100% rename from paddle/capi/Vector.cpp rename to paddle/legacy/capi/Vector.cpp diff --git a/paddle/capi/arguments.h b/paddle/legacy/capi/arguments.h similarity index 100% rename from paddle/capi/arguments.h rename to paddle/legacy/capi/arguments.h diff --git a/paddle/capi/capi.h b/paddle/legacy/capi/capi.h similarity index 100% rename from paddle/capi/capi.h rename to paddle/legacy/capi/capi.h diff --git a/paddle/legacy/capi/capi_private.h b/paddle/legacy/capi/capi_private.h new file mode 100644 index 0000000000000000000000000000000000000000..e5f8c8c5c8bd506f9c8f49ee7d03f9b20460efdb --- /dev/null +++ b/paddle/legacy/capi/capi_private.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "capi.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Argument.h" +#pragma once + +namespace paddle { +namespace capi { + +enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE }; + +#define STRUCT_HEADER CType type; + +struct CHeader { + STRUCT_HEADER +}; + +struct CIVector { + STRUCT_HEADER + IVectorPtr vec; + + CIVector() : type(kIVECTOR) {} +}; + +struct CMatrix { + STRUCT_HEADER + MatrixPtr mat; + + CMatrix() : type(kMATRIX) {} +}; + +struct CArguments { + STRUCT_HEADER + std::vector<paddle::Argument> args; + + CArguments() : type(kARGUMENTS) {} + + template <typename T> + paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) { + if (ID >= args.size()) return kPD_OUT_OF_RANGE; + switch (nestedLevel) { + case 0: + callback(args[ID].sequenceStartPositions); + break; + case 1: + callback(args[ID].subSequenceStartPositions); + break; + default: + return kPD_OUT_OF_RANGE; + } + return kPD_NO_ERROR; + } +}; + +struct CGradientMachine { + STRUCT_HEADER + paddle::GradientMachinePtr machine; + + CGradientMachine() : type(kGRADIENT_MACHINE) {} +}; + +template <typename T> +inline T* cast(void* ptr) { + return reinterpret_cast<T*>(ptr); +} +} // namespace capi +} // namespace paddle diff --git a/paddle/capi/config.h.in b/paddle/legacy/capi/config.h.in similarity index 100% rename from paddle/capi/config.h.in rename to paddle/legacy/capi/config.h.in diff --git a/paddle/capi/error.cpp b/paddle/legacy/capi/error.cpp similarity index 100% rename from paddle/capi/error.cpp rename to paddle/legacy/capi/error.cpp diff --git a/paddle/capi/error.h b/paddle/legacy/capi/error.h similarity index 100% rename from paddle/capi/error.h rename to paddle/legacy/capi/error.h diff --git a/paddle/capi/examples/.gitignore b/paddle/legacy/capi/examples/.gitignore similarity index 100% rename from paddle/capi/examples/.gitignore rename to paddle/legacy/capi/examples/.gitignore diff --git a/paddle/capi/examples/README.md b/paddle/legacy/capi/examples/README.md similarity index 100% rename from paddle/capi/examples/README.md rename to paddle/legacy/capi/examples/README.md diff --git a/paddle/capi/examples/model_inference/README.md b/paddle/legacy/capi/examples/model_inference/README.md similarity index 100% rename from paddle/capi/examples/model_inference/README.md rename to paddle/legacy/capi/examples/model_inference/README.md diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/legacy/capi/examples/model_inference/common/common.h similarity index 100% rename from paddle/capi/examples/model_inference/common/common.h rename to paddle/legacy/capi/examples/model_inference/common/common.h diff --git a/paddle/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/dense/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/dense/convert_protobin.sh rename to paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/legacy/capi/examples/model_inference/dense/main.c similarity index 100% rename from 
paddle/capi/examples/model_inference/dense/main.c rename to paddle/legacy/capi/examples/model_inference/dense/main.c diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py similarity index 100% rename from paddle/capi/examples/model_inference/dense/merge_v2_model.py rename to paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py similarity index 100% rename from paddle/capi/examples/model_inference/dense/mnist_v2.py rename to paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/dense/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/dense/trainer_config.py diff --git a/paddle/capi/examples/model_inference/multi_thread/.gitignore b/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/.gitignore rename to paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh rename to paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main.c similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/main.c rename to paddle/legacy/capi/examples/model_inference/multi_thread/main.c diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/main_gpu.c rename to paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c diff --git a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/multi_thread/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py diff --git a/paddle/capi/examples/model_inference/sequence/.gitignore b/paddle/legacy/capi/examples/model_inference/sequence/.gitignore similarity index 100% rename from paddle/capi/examples/model_inference/sequence/.gitignore rename to paddle/legacy/capi/examples/model_inference/sequence/.gitignore diff --git a/paddle/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/sequence/CMakeLists.txt 
rename to paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/sequence/convert_protobin.sh rename to paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/legacy/capi/examples/model_inference/sequence/main.c similarity index 100% rename from paddle/capi/examples/model_inference/sequence/main.c rename to paddle/legacy/capi/examples/model_inference/sequence/main.c diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/sequence/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py diff --git a/paddle/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/.gitignore rename to paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore diff --git a/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt rename to paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt diff --git a/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh rename to paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/main.c rename to paddle/legacy/capi/examples/model_inference/sparse_binary/main.c diff --git a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py similarity index 100% rename from paddle/capi/examples/model_inference/sparse_binary/trainer_config.py rename to paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py diff --git a/paddle/legacy/capi/gradient_machine.cpp b/paddle/legacy/capi/gradient_machine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c5ddd856b5d374ae90d6c8ef898be52aa2e4e89 --- /dev/null +++ b/paddle/legacy/capi/gradient_machine.cpp @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gradient_machine.h" +#include "capi_private.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" + +#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v) + +enum GradientMatchineCreateMode { + CREATE_MODE_NORMAL = 0, + CREATE_MODE_TESTING = 4 +}; + +namespace paddle { + +class MyNeuralNetwork : public NeuralNetwork { + public: + MyNeuralNetwork(const std::string& name, NeuralNetwork* network) + : NeuralNetwork(name, network) {} +}; + +NeuralNetwork* newCustomNerualNetwork(const std::string& name, + NeuralNetwork* network) { + return new MyNeuralNetwork(name, network); +} +} // namespace paddle + +extern "C" { +paddle_error paddle_gradient_machine_create_for_inference( + paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) { + if (modelConfigProtobuf == nullptr) return kPD_NULLPTR; + paddle::ModelConfig config; + if (!config.ParseFromArray(modelConfigProtobuf, size) || + !config.IsInitialized()) { + return kPD_PROTOBUF_ERROR; + } + + auto ptr = new paddle::capi::CGradientMachine(); + ptr->machine.reset(paddle::GradientMachine::create( + config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); + *machine = ptr; + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_create_for_inference_with_parameters( + paddle_gradient_machine* machine, void* mergedModel, uint64_t size) { + if (mergedModel == nullptr) return kPD_NULLPTR; + std::istringstream is(std::string(static_cast<char*>(mergedModel), size)); + int64_t modelConfigSize = 0; + is.read((char*)(&modelConfigSize), sizeof(modelConfigSize)); + std::string modelConfigProtobuf; + modelConfigProtobuf.resize(modelConfigSize); + is.read(&modelConfigProtobuf[0], modelConfigSize); + paddle::TrainerConfig config; + paddle::ModelConfig modelConfig; + if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) { + if (!modelConfig.ParseFromString(modelConfigProtobuf) || + !modelConfig.IsInitialized()) { + return kPD_PROTOBUF_ERROR; + } + } else { + modelConfig = config.model_config(); + } + auto ptr = new paddle::capi::CGradientMachine(); + ptr->machine.reset(paddle::GradientMachine::create( + modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE})); + std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters(); + for (auto& para : parameters) { + para->load(is); + } + + *machine = ptr; + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) { + delete cast(machine); + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_load_parameter_from_disk( + paddle_gradient_machine machine, const char* path) { + auto m = cast(machine); + if (m == nullptr || path == nullptr || m->machine == nullptr) + return kPD_NULLPTR; + m->machine->loadParameters(path); + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine, + paddle_arguments inArgs, + paddle_arguments outArgs, + bool isTrain) { + auto m = cast(machine); + auto in = paddle::capi::cast<paddle::capi::CArguments>(inArgs); + auto out = paddle::capi::cast<paddle::capi::CArguments>(outArgs); + if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr) + return kPD_NULLPTR; + m->machine->forward( + in->args, &out->args, isTrain ? 
paddle::PASS_TRAIN : paddle::PASS_TEST); + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_create_shared_param( + paddle_gradient_machine origin, + void* modelConfigProtobuf, + int size, + paddle_gradient_machine* slave) { + auto o = cast(origin); + if (origin == nullptr || slave == nullptr || o->machine == nullptr) { + return kPD_NULLPTR; + } + paddle::ModelConfig config; + if (!config.ParseFromArray(modelConfigProtobuf, size) || + !config.IsInitialized()) { + return kPD_PROTOBUF_ERROR; + } + + std::unique_ptr ptr( + new paddle::capi::CGradientMachine()); + auto nn = paddle::NeuralNetwork::create(config); + nn->init(config, + [&o](int paramId, paddle::Parameter* param) { + auto p = o->machine->getParameters()[paramId]; + param->enableSharedType(paddle::PARAMETER_VALUE, + p->getBuf(paddle::PARAMETER_VALUE)); + }, + {paddle::PARAMETER_VALUE}, + false); + ptr->machine.reset(nn); + *slave = ptr.release(); + return kPD_NO_ERROR; +} +} + +paddle_error paddle_gradient_machine_randomize_param( + paddle_gradient_machine machine) { + auto m = cast(machine); + if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR; + m->machine->randParameters(); + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_get_layer_output( + paddle_gradient_machine machine, + const char* layerName, + paddle_arguments args) { + auto m = cast(machine); + auto out = paddle::capi::cast(args); + if (m == nullptr || layerName == nullptr || out == nullptr || + m->machine == nullptr) { + return kPD_NULLPTR; + } + + auto layerOutput = m->machine->getLayerOutput(layerName); + out->args.push_back(layerOutput); + return kPD_NO_ERROR; +} + +paddle_error paddle_gradient_machine_release_layer_output( + paddle_gradient_machine machine) { + auto m = cast(machine); + if (m == nullptr || m->machine == nullptr) { + return kPD_NULLPTR; + } + m->machine->releaseOutput(); + return kPD_NO_ERROR; +} diff --git a/paddle/capi/gradient_machine.h b/paddle/legacy/capi/gradient_machine.h similarity index 100% rename from paddle/capi/gradient_machine.h rename to paddle/legacy/capi/gradient_machine.h diff --git a/paddle/capi/main.h b/paddle/legacy/capi/main.h similarity index 100% rename from paddle/capi/main.h rename to paddle/legacy/capi/main.h diff --git a/paddle/capi/matrix.h b/paddle/legacy/capi/matrix.h similarity index 100% rename from paddle/capi/matrix.h rename to paddle/legacy/capi/matrix.h diff --git a/paddle/capi/paddle_capi.map b/paddle/legacy/capi/paddle_capi.map similarity index 100% rename from paddle/capi/paddle_capi.map rename to paddle/legacy/capi/paddle_capi.map diff --git a/paddle/capi/tests/.gitignore b/paddle/legacy/capi/tests/.gitignore similarity index 100% rename from paddle/capi/tests/.gitignore rename to paddle/legacy/capi/tests/.gitignore diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/legacy/capi/tests/CMakeLists.txt similarity index 100% rename from paddle/capi/tests/CMakeLists.txt rename to paddle/legacy/capi/tests/CMakeLists.txt diff --git a/paddle/legacy/capi/tests/test_Arguments.cpp b/paddle/legacy/capi/tests/test_Arguments.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fb379719dc0f3230c0801752720703ad185216f --- /dev/null +++ b/paddle/legacy/capi/tests/test_Arguments.cpp @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "capi.h" +#include "gtest/gtest.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +static std::vector randomBuffer(size_t bufSize) { + auto& eng = paddle::ThreadLocalRandomEngine::get(); + std::uniform_real_distribution dist(-1.0, 1.0); + std::vector retv; + retv.reserve(bufSize); + for (size_t i = 0; i < bufSize; ++i) { + retv.push_back(dist(eng)); + } + return retv; +} + +TEST(CAPIArguments, create) { + //! TODO(yuyang18): Test GPU Code. + paddle_arguments args = paddle_arguments_create_none(); + uint64_t size; + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size)); + ASSERT_EQ(0UL, size); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); +} + +TEST(CAPIArguments, value) { + paddle_arguments args = paddle_arguments_create_none(); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1)); + + paddle_matrix mat = paddle_matrix_create(128, 64, false); + for (size_t i = 0; i < 128; ++i) { + std::vector sampleBuf = randomBuffer(64); + paddle_matrix_set_row(mat, i, sampleBuf.data()); + } + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat)); + + paddle_matrix val = paddle_matrix_create_none(); + + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val)); + + for (size_t i = 0; i < 128; ++i) { + paddle_real* row1; + paddle_real* row2; + + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1)); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2)); + ASSERT_EQ(row1, row2); + } + + paddle_ivector ivec = paddle_ivector_create_none(); + ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec)); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val)); + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); +} + +TEST(CAPIArguments, ids) { + paddle_arguments args = paddle_arguments_create_none(); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1)); + + paddle_ivector ivec; + int array[3] = {1, 2, 3}; + ivec = paddle_ivector_create(array, 3, true, false); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec)); + + paddle_ivector val = paddle_ivector_create_none(); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val)); + ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec)); + ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val)); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); +} + +template +void testSequenceHelper(T1 setter, T2 getter) { + paddle_arguments args = paddle_arguments_create_none(); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1)); + + paddle_ivector ivec; + int array[3] = {1, 2, 3}; + ivec = paddle_ivector_create(array, 3, true, false); + ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec)); + + paddle_ivector val = paddle_ivector_create_none(); + ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val)); + uint64_t size; + ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size)); + + int* rawBuf; + ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf)); + for (size_t i = 0; i < size; ++i) { + ASSERT_EQ(array[i], rawBuf[i]); + } + + ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec)); + ASSERT_EQ(kPD_NO_ERROR, 
paddle_ivector_destroy(val)); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args)); +} + +TEST(CAPIArguments, Sequence) { + auto testSequence = [](uint32_t nestedLevel) { + testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos, + std::placeholders::_1, + std::placeholders::_2, + nestedLevel, + std::placeholders::_3), + std::bind(paddle_arguments_get_sequence_start_pos, + std::placeholders::_1, + std::placeholders::_2, + nestedLevel, + std::placeholders::_3)); + }; + for (uint32_t i = 0; i < 2; ++i) { // test seq and sub-seq. + testSequence(i); + } +} diff --git a/paddle/legacy/capi/tests/test_GradientMachine.cpp b/paddle/legacy/capi/tests/test_GradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d1b7cb6ca4073c0a489366e415f8f74d3c19bec --- /dev/null +++ b/paddle/legacy/capi/tests/test_GradientMachine.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "capi.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +static std::vector randomBuffer(size_t bufSize) { + auto& eng = paddle::ThreadLocalRandomEngine::get(); + std::uniform_real_distribution dist(-1.0, 1.0); + std::vector retv; + retv.reserve(bufSize); + for (size_t i = 0; i < bufSize; ++i) { + retv.push_back(dist(eng)); + } + return retv; +} + +TEST(GradientMachine, testPredict) { + //! TODO(yuyang18): Test GPU Code. 
+ paddle::TrainerConfigHelper config("./test_predict_network.py"); + std::string buffer; + ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer)); + paddle_gradient_machine machine; + + ASSERT_EQ(kPD_NO_ERROR, + paddle_gradient_machine_create_for_inference( + &machine, &buffer[0], (int)buffer.size())); + std::unique_ptr gm( + paddle::GradientMachine::create(config.getModelConfig())); + ASSERT_NE(nullptr, gm); + gm->randParameters(); + gm->saveParameters("./"); + + ASSERT_EQ(kPD_NO_ERROR, + paddle_gradient_machine_load_parameter_from_disk(machine, "./")); + + paddle_gradient_machine machineSlave; + ASSERT_EQ(kPD_NO_ERROR, + paddle_gradient_machine_create_shared_param( + machine, &buffer[0], (int)buffer.size(), &machineSlave)); + std::swap(machineSlave, machine); + paddle_arguments outArgs = paddle_arguments_create_none(); + + paddle_arguments inArgs = paddle_arguments_create_none(); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1)); + paddle_matrix mat = paddle_matrix_create(1, 100, false); + static_assert(std::is_same::value, ""); + + auto data = randomBuffer(100); + paddle_real* rowPtr; + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr)); + memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real)); + + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat)); + ASSERT_EQ(kPD_NO_ERROR, + paddle_gradient_machine_forward(machine, inArgs, outArgs, false)); + + uint64_t sz; + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz)); + ASSERT_EQ(1UL, sz); + + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat)); + std::vector paddleInArgs; + std::vector paddleOutArgs; + paddleInArgs.resize(1); + paddleInArgs[0].value = + paddle::Matrix::create(data.data(), 1, 100, false, false); + + gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST); + + auto matPaddle = paddleOutArgs[0].value; + + uint64_t height, width; + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width)); + ASSERT_EQ(matPaddle->getHeight(), height); + ASSERT_EQ(matPaddle->getWidth(), width); + + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr)); + for (size_t i = 0; i < width; ++i) { + ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5); + } + + ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat)); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs)); + ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs)); + std::swap(machineSlave, machine); + ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave)); + ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine)); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + std::vector argvs; + argvs.push_back(strdup("--use_gpu=false")); + paddle_init((int)argvs.size(), argvs.data()); + for (auto each : argvs) { + free(each); + } + return RUN_ALL_TESTS(); +} diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/legacy/capi/tests/test_Matrix.cpp similarity index 100% rename from paddle/capi/tests/test_Matrix.cpp rename to paddle/legacy/capi/tests/test_Matrix.cpp diff --git a/paddle/capi/tests/test_Vector.cpp b/paddle/legacy/capi/tests/test_Vector.cpp similarity index 100% rename from paddle/capi/tests/test_Vector.cpp rename to paddle/legacy/capi/tests/test_Vector.cpp diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/legacy/capi/tests/test_predict_network.py similarity index 100% rename from paddle/capi/tests/test_predict_network.py rename to paddle/legacy/capi/tests/test_predict_network.py diff --git 
a/paddle/capi/vector.h b/paddle/legacy/capi/vector.h similarity index 100% rename from paddle/capi/vector.h rename to paddle/legacy/capi/vector.h diff --git a/paddle/cuda/CMakeLists.txt b/paddle/legacy/cuda/CMakeLists.txt similarity index 100% rename from paddle/cuda/CMakeLists.txt rename to paddle/legacy/cuda/CMakeLists.txt diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/legacy/cuda/include/hl_activation_functions.h similarity index 100% rename from paddle/cuda/include/hl_activation_functions.h rename to paddle/legacy/cuda/include/hl_activation_functions.h diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/legacy/cuda/include/hl_aggregate.h similarity index 100% rename from paddle/cuda/include/hl_aggregate.h rename to paddle/legacy/cuda/include/hl_aggregate.h diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/legacy/cuda/include/hl_avx_functions.h similarity index 100% rename from paddle/cuda/include/hl_avx_functions.h rename to paddle/legacy/cuda/include/hl_avx_functions.h diff --git a/paddle/legacy/cuda/include/hl_base.h b/paddle/legacy/cuda/include/hl_base.h new file mode 100644 index 0000000000000000000000000000000000000000..bfe812a4387be72c3e73d6b45852e3a90b1926eb --- /dev/null +++ b/paddle/legacy/cuda/include/hl_base.h @@ -0,0 +1,250 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#ifdef PADDLE_TYPE_DOUBLE +#define HL_FLOAT_MAX 3.40282347e+38F +#define HL_FLOAT_MIN 1.17549435e-38F +using real = double; +#else +#define HL_FLOAT_MAX 1.7976931348623157e+308 +#define HL_FLOAT_MIN 2.2250738585072014e-308 +using real = float; +#endif + +/** + * The maximum input value for exp, used to avoid overflow problem. + * currently only used for tanh function. + */ +#define EXP_MAX_INPUT 40.0 + +/** + * @brief DIVUP(x, y) is similar to ceil(x / y). + * @note For CUDA, DIVUP will be used to specify + * the size of blockDim. + */ +#ifndef DIVUP +#define DIVUP(x, y) (((x) + (y)-1) / (y)) +#endif + +/** + * HPPL is an internal high performance parallel computing library + * for high-level neural network routines, which can support many + * heterogeneous compute architectures, such as GPU, FPGA, etc. + */ + +/** + * @brief HPPL CUDA Stream. + * + * @note Each thread can use HPPL_STREAM_* after calling hl_init. + * HPPL_STREAM_DEFAULT is HPPL default stream. + */ +typedef enum { + HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/ + HPPL_STREAM_1 = 1, + HPPL_STREAM_2 = 2, + HPPL_STREAM_3 = 3, + HPPL_STREAM_4 = 4, + HPPL_THREAD_STREAM_1 = 5, + HPPL_THREAD_STREAM_2 = 6, + HPPL_THREAD_STREAM_3 = 7, + HPPL_THREAD_STREAM_4 = 8, + HPPL_STREAM_END +} hl_stream_t; + +/** + * @brief HPPL activation mode. + */ +typedef enum { + HL_ACTIVATION_SIGMOID = 0, + HL_ACTIVATION_RELU = 1, + HL_ACTIVATION_TANH = 2, + HL_ACTIVATION_LINEAR = 3, + HL_ACTIVATION_END +} hl_activation_mode_t; + +/** + * @brief Transpose type. 
+ */ +typedef enum { + HPPL_OP_N = 0, /* transpose */ + HPPL_OP_T = 1, /* non transpose */ + HPPL_OP_END +} hl_trans_op_t; + +/** + * @brief Lstm value. + * + * @param gateValue input value. + * @param prevStateValue previous state value. + * @param stateValue state value. + * @param stateActiveValue state active value. + * @param outputValue output value. + */ +typedef struct { + real *gateValue; + real *prevStateValue; + real *stateValue; + real *stateActiveValue; + real *outputValue; + real *checkIg; + real *checkFg; + real *checkOg; +} hl_lstm_value; + +/** + * @brief Lstm gradient. + * + * @param gateGrad input gradient. + * @param prevStateGrad previous state gradient. + * @param stateGrad state gradient. + * @param stateActiveGrad state active gradient. + * @param outputGrad output gradient. + */ +typedef struct { + real *gateGrad; + real *prevStateGrad; + real *stateGrad; + real *stateActiveGrad; + real *outputGrad; + real *checkIgGrad; + real *checkFgGrad; + real *checkOgGrad; +} hl_lstm_grad; + +/** + * @brief Gru value. + * + * @param gateWeight gate weight (updateGate + resetGate). + * @param stateWeight frame state weight. + * @param gateValue gate value results. + * @param resetOutputValue resetOutput value. + * @param outputValue output value. + * @param prevOutValue previous output value. + * + */ +typedef struct { + real *gateWeight; + real *stateWeight; + real *gateValue; + real *resetOutputValue; + real *outputValue; + real *prevOutValue; +} hl_gru_value; + +/** + * @brief Gru gradient. + * + * @param gateWeightGrad gate weight gradient. + * @param stateWeightGrad frame state weight gradient. + * @param gateGrad gate gradient results. + * @param resetOutputGrad resetOutput gradient. + * @param outputGrad output gradient. + * @param prevOutGrad previous output gradient. + */ +typedef struct { + real *gateWeightGrad; + real *stateWeightGrad; + real *gateGrad; + real *resetOutputGrad; + real *outputGrad; + real *prevOutGrad; +} hl_gru_grad; + +/** + * @brief Sparse matrix value type. + */ +typedef enum { + HL_NO_VALUE = 0, /* matrix values only 0 or 1 */ + HL_FLOAT_VALUE = 1, + HL_VALUE_END +} hl_matrix_value_t; + +/** + * @brief HPPL matrix format. + */ +typedef enum { + HL_SPARSE_CSR = 0, + HL_SPARSE_CSC = 1, + HL_SPARSE_END +} hl_matrix_format_t; + +typedef struct _hl_matrix_s *hl_matrix_s; + +/** + * @brief HPPL sparse matrix. + * + * @param matrix sparse matrix. + * @param format matrix format. + * @param type the type of matrix values. + * @param rows matrix rows. + * @param cols matrix columns. + * @param nnz nonzero values of sparse matrix. + */ +typedef struct { + hl_matrix_s matrix; + hl_matrix_format_t format; + hl_matrix_value_t type; + int rows; + int cols; + size_t nnz; +} _hl_sparse_matrix_s, *hl_sparse_matrix_s; + +#ifdef __NVCC__ + +#include +#include "paddle/legacy/cuda/include/hl_cuda.h" +#include "paddle/legacy/utils/Logging.h" + +extern __thread bool g_sync_flag; +extern __thread cudaStream_t default_stream; +#define STREAM_DEFAULT default_stream + +/** + * @brief Check cuda kernel execution. + * @param msg error string + */ +#define CHECK_SYNC(msg) \ + if (true == g_sync_flag) { \ + hl_stream_synchronize(HPPL_STREAM_DEFAULT); \ + cudaError_t err = (cudaError_t)hl_get_device_last_error(); \ + CHECK_EQ(cudaSuccess, err) \ + << "[" << msg << "] " \ + << "CUDA error: " << hl_get_device_error_string((size_t)err); \ + } + +// __shfl has been deprecated as of CUDA 9.0. 
+#if CUDA_VERSION < 9000 +template +__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) { + return __shfl_down(val, delta); +} + +template +__forceinline__ __device__ T +__shfl_sync(unsigned, T val, int src_line, int width) { + return __shfl(val, src_line, width); +} + +#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; +#else +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) +#endif + +#endif // __NVCC__ diff --git a/paddle/cuda/include/hl_batch_norm.h b/paddle/legacy/cuda/include/hl_batch_norm.h similarity index 100% rename from paddle/cuda/include/hl_batch_norm.h rename to paddle/legacy/cuda/include/hl_batch_norm.h diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/legacy/cuda/include/hl_batch_transpose.h similarity index 100% rename from paddle/cuda/include/hl_batch_transpose.h rename to paddle/legacy/cuda/include/hl_batch_transpose.h diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/legacy/cuda/include/hl_cnn.h similarity index 100% rename from paddle/cuda/include/hl_cnn.h rename to paddle/legacy/cuda/include/hl_cnn.h diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/legacy/cuda/include/hl_cpu_gru.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_gru.cuh rename to paddle/legacy/cuda/include/hl_cpu_gru.cuh diff --git a/paddle/cuda/include/hl_cpu_lstm.cuh b/paddle/legacy/cuda/include/hl_cpu_lstm.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_lstm.cuh rename to paddle/legacy/cuda/include/hl_cpu_lstm.cuh diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_matrix_kernel.cuh rename to paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh rename to paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh diff --git a/paddle/cuda/include/hl_cpu_scalar.cuh b/paddle/legacy/cuda/include/hl_cpu_scalar.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_scalar.cuh rename to paddle/legacy/cuda/include/hl_cpu_scalar.cuh diff --git a/paddle/cuda/include/hl_cpu_simd_neon.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_simd_neon.cuh rename to paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh diff --git a/paddle/cuda/include/hl_cpu_simd_sse.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh similarity index 100% rename from paddle/cuda/include/hl_cpu_simd_sse.cuh rename to paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/legacy/cuda/include/hl_cuda.h similarity index 100% rename from paddle/cuda/include/hl_cuda.h rename to paddle/legacy/cuda/include/hl_cuda.h diff --git a/paddle/cuda/include/hl_cuda.ph b/paddle/legacy/cuda/include/hl_cuda.ph similarity index 100% rename from paddle/cuda/include/hl_cuda.ph rename to paddle/legacy/cuda/include/hl_cuda.ph diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/legacy/cuda/include/hl_cuda_cublas.h similarity index 100% rename from paddle/cuda/include/hl_cuda_cublas.h rename to paddle/legacy/cuda/include/hl_cuda_cublas.h diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/legacy/cuda/include/hl_cuda_cudnn.h similarity index 100% rename 
from paddle/cuda/include/hl_cuda_cudnn.h rename to paddle/legacy/cuda/include/hl_cuda_cudnn.h diff --git a/paddle/cuda/include/hl_cuda_cudnn.ph b/paddle/legacy/cuda/include/hl_cuda_cudnn.ph similarity index 100% rename from paddle/cuda/include/hl_cuda_cudnn.ph rename to paddle/legacy/cuda/include/hl_cuda_cudnn.ph diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/legacy/cuda/include/hl_device_functions.cuh similarity index 100% rename from paddle/cuda/include/hl_device_functions.cuh rename to paddle/legacy/cuda/include/hl_device_functions.cuh diff --git a/paddle/cuda/include/hl_functions.h b/paddle/legacy/cuda/include/hl_functions.h similarity index 100% rename from paddle/cuda/include/hl_functions.h rename to paddle/legacy/cuda/include/hl_functions.h diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/legacy/cuda/include/hl_gpu.h similarity index 100% rename from paddle/cuda/include/hl_gpu.h rename to paddle/legacy/cuda/include/hl_gpu.h diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/legacy/cuda/include/hl_gpu_functions.cuh similarity index 100% rename from paddle/cuda/include/hl_gpu_functions.cuh rename to paddle/legacy/cuda/include/hl_gpu_functions.cuh diff --git a/paddle/legacy/cuda/include/hl_gpu_gru.cuh b/paddle/legacy/cuda/include/hl_gpu_gru.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8d299572c73e879a3a1e9fb60608c4f3abd1f685 --- /dev/null +++ b/paddle/legacy/cuda/include/hl_gpu_gru.cuh @@ -0,0 +1,393 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#ifndef HL_GPU_GRU_CUH_ +#define HL_GPU_GRU_CUH_ + +#ifdef __NVCC__ + +#include "paddle/legacy/utils/Logging.h" + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, + real *gateValue, + real *resetOutputValue, + real *prevOutputValue, + int frameSize, + int batchSize, + hl_activation_mode_t active_gate) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + gateValue += batchIdx * 3 * frameSize; + resetOutputValue += batchIdx * frameSize; + } + + real rPrevOut = 0; + real rValueResetOutput; + real rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; + real rValueResetGate = gateValue[frameIdx + frameSize * 1]; + + if (prevOutputValue) { + if (isBatch) prevOutputValue += batchIdx * frameSize; + rPrevOut = prevOutputValue[frameIdx]; + } + + opResetOutput(rValueUpdateGate, + rValueResetGate, + rPrevOut, + rValueResetOutput, + hppl::gpu::forward[active_gate]); + + gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; + gateValue[frameIdx + frameSize * 1] = rValueResetGate; + resetOutputValue[frameIdx] = rValueResetOutput; +} + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, + real *gateValue, + real *prevOutputValue, + real *outputValue, + int frameSize, + int batchSize, + hl_activation_mode_t active_node) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + gateValue += batchIdx * 3 * frameSize; + outputValue += batchIdx * frameSize; + } + + real rOutput; + real rPrevOut = 0; + real rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; + real rValueFrameState = gateValue[frameIdx + frameSize * 2]; + + if (prevOutputValue) { + if (isBatch) prevOutputValue += batchIdx * frameSize; + rPrevOut = prevOutputValue[frameIdx]; + } + + opFinalOutput(rValueUpdateGate, + rValueFrameState, + rPrevOut, + rOutput, + hppl::gpu::forward[active_node]); + + gateValue[frameIdx + frameSize * 2] = rValueFrameState; + outputValue[frameIdx] = rOutput; +} + +template +void hl_gpu_gru_forward(OpResetOutput opResetOutput, + OpFinalOutput opFinalOutput, + hl_gru_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) { + dim3 threads; + dim3 grid; + if (batchSize == 1) { + int framePerBlock = frameSize <= 1024 ? 
frameSize : 1024; + int frameBlocks = (frameSize + 1024 - 1) / 1024; + threads = dim3(framePerBlock, 1); + grid = dim3(frameBlocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + } + + if (value.prevOutValue) { + hl_matrix_mul(value.prevOutValue, HPPL_OP_N, + value.gateWeight, HPPL_OP_N, + value.gateValue, + batchSize, 2*frameSize, frameSize, + /*alpha = */ 1, /*beta = */ 1, + frameSize, 2* frameSize, 3*frameSize); + } + + if (batchSize == 1) { + KeGruForwardResetOutput + <<>>(opResetOutput, + value.gateValue, value.resetOutputValue, value.prevOutValue, + frameSize, batchSize, active_gate); + } else { + KeGruForwardResetOutput + <<>>(opResetOutput, + value.gateValue, value.resetOutputValue, value.prevOutValue, + frameSize, batchSize, active_gate); + } + + if (value.prevOutValue) { + hl_matrix_mul(value.resetOutputValue, HPPL_OP_N, + value.stateWeight, HPPL_OP_N, + value.gateValue + 2*frameSize, + batchSize, frameSize, frameSize, + /*alpha = */ 1, /*beta = */ 1, + frameSize, frameSize, 3*frameSize); + } + + if (batchSize == 1) { + KeGruForwardFinalOutput + <<>>(opFinalOutput, + value.gateValue, value.prevOutValue, value.outputValue, + frameSize, batchSize, active_node); + } else { + KeGruForwardFinalOutput + <<>>(opFinalOutput, + value.gateValue, value.prevOutValue, value.outputValue, + frameSize, batchSize, active_node); + } + + CHECK_SYNC("hl_gpu_gru_forward failed"); +} + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, + real *gateValue, + real *gateGrad, + real *prevOutValue, + real *prevOutGrad, + real *outputGrad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + gateValue += batchIdx * 3 * frameSize; + gateGrad += batchIdx * 3 * frameSize; + outputGrad += batchIdx * frameSize; + } + + real rUpdateGateGrad; + real rFrameStateGrad; + real rPrevOutValue = 0; + real rPrevOutGrad = 0; + real rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; + real rFrameStateValue = gateValue[frameIdx + frameSize * 2]; + real rOutGrad = outputGrad[frameIdx]; + + if (prevOutValue && prevOutGrad) { + if (isBatch) prevOutValue += batchIdx * frameSize; + rPrevOutValue = prevOutValue[frameIdx]; + + if (isBatch) prevOutGrad += batchIdx * frameSize; + rPrevOutGrad = prevOutGrad[frameIdx]; + } + + opStateGrad(rUpdateGateValue, + rUpdateGateGrad, + rFrameStateValue, + rFrameStateGrad, + rPrevOutValue, + rPrevOutGrad, + rOutGrad, + hppl::gpu::backward[active_node]); + + gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; + gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; + if (prevOutGrad) { + prevOutGrad[frameIdx] = rPrevOutGrad; + } +} + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, + real *gateValue, + real *gateGrad, + real *prevOutValue, + real *prevOutGrad, + real *resetOutputGrad, + int frameSize, + int batchSize, + hl_activation_mode_t active_gate) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) 
return; + gateValue += batchIdx * 3 * frameSize; + gateGrad += batchIdx * 3 * frameSize; + resetOutputGrad += batchIdx * frameSize; + } + + real rResetGateGrad; + real rPrevOutValue = 0; + real rPrevOutGrad = 0; + real rResetOutputGrad = 0; + real rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; + real rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; + real rResetGateValue = gateValue[frameIdx + frameSize * 1]; + + if (prevOutValue && prevOutGrad) { + if (isBatch) prevOutValue += batchIdx * frameSize; + if (isBatch) prevOutGrad += batchIdx * frameSize; + rPrevOutValue = prevOutValue[frameIdx]; + rPrevOutGrad = prevOutGrad[frameIdx]; + rResetOutputGrad = resetOutputGrad[frameIdx]; + } + + opResetGrad(rUpdateGateValue, + rUpdateGateGrad, + rResetGateValue, + rResetGateGrad, + rPrevOutValue, + rPrevOutGrad, + rResetOutputGrad, + hppl::gpu::backward[active_gate]); + + gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; + gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; + if (prevOutGrad) { + prevOutGrad[frameIdx] = rPrevOutGrad; + } +} + +template +void hl_gpu_gru_backward(OpStateGrad opStateGrad, + OpResetGrad opResetGrad, + hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) { + dim3 threads; + dim3 grid; + if (batchSize == 1) { + int framePerBlock = frameSize <= 1024 ? frameSize : 1024; + int frameBlocks = (frameSize + 1024 - 1) / 1024; + threads = dim3(framePerBlock, 1); + grid = dim3(frameBlocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + } + + if (batchSize == 1) { + KeGruBackwardStateGrad + <<>>(opStateGrad, + value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, + grad.outputGrad, frameSize, batchSize, active_node); + } else { + KeGruBackwardStateGrad + <<>>(opStateGrad, + value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, + grad.outputGrad, frameSize, batchSize, active_node); + } + + if (value.prevOutValue && grad.prevOutGrad) { + hl_matrix_mul(grad.gateGrad + 2*frameSize, HPPL_OP_N, + value.stateWeight, HPPL_OP_T, + grad.resetOutputGrad, + batchSize, frameSize, frameSize, + /*alpha = */ 1, /*beta = */ 0, + 3*frameSize, frameSize, frameSize); + if (grad.stateWeightGrad) { + hl_matrix_mul(value.resetOutputValue, HPPL_OP_T, + grad.gateGrad + 2*frameSize, HPPL_OP_N, + grad.stateWeightGrad, + frameSize, frameSize, batchSize, + /*alpha = */ 1, /*beta = */ 1, + frameSize, 3*frameSize, frameSize); + } + } + + if (batchSize == 1) { + KeGruBackwardResetGrad + <<>>(opResetGrad, + value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, + grad.resetOutputGrad, frameSize, batchSize, active_gate); + } else { + KeGruBackwardResetGrad + <<>>(opResetGrad, + value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad, + grad.resetOutputGrad, frameSize, batchSize, active_gate); + } + + if (grad.prevOutGrad && value.prevOutValue) { + hl_matrix_mul(grad.gateGrad, HPPL_OP_N, + value.gateWeight, HPPL_OP_T, + grad.prevOutGrad, + batchSize, frameSize, 2*frameSize, + /*alpha = */ 1, /*beta = */ 1, + 3*frameSize, 2*frameSize, frameSize); + if (grad.gateWeightGrad) { + hl_matrix_mul(value.prevOutValue, HPPL_OP_T, + grad.gateGrad, HPPL_OP_N, + grad.gateWeightGrad, + frameSize, 2*frameSize, batchSize, + /*alpha = */ 1, /*beta = */ 1, + frameSize, 3*frameSize, 2*frameSize); + } + } + + CHECK_SYNC("hl_gpu_gru_backward failed"); +} + +#else + +template +void 
hl_gpu_gru_forward(OpResetOutput opResetOutput, + OpFinalOutput opFinalOutput, + hl_gru_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) {} + +template +void hl_gpu_gru_backward(OpStateGrad opStateGrad, + OpResetGrad opResetGrad, + hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate) {} + +#endif + +#endif /* HL_GPU_GRU_CUH_ */ diff --git a/paddle/legacy/cuda/include/hl_gpu_lstm.cuh b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh new file mode 100644 index 0000000000000000000000000000000000000000..aae011b838c0eca1197f55d236d759eab8ea993c --- /dev/null +++ b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh @@ -0,0 +1,300 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#ifndef HL_GPU_LSTM_CUH_ +#define HL_GPU_LSTM_CUH_ + +#ifdef __NVCC__ + +#include "paddle/legacy/utils/Logging.h" +#include "hl_device_functions.cuh" + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template +__global__ void KeLstmForward(Op op, + hl_lstm_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + value.gateValue += batchIdx * frameSize * 4; + value.outputValue += batchIdx * frameSize; + value.stateValue += batchIdx * frameSize; + value.stateActiveValue += batchIdx * frameSize; + } + + real rState; + real rPrevState = 0; + real rStateAtv; + real rOut; + real rValueIn; + real rValueIg; + real rValueFg; + real rValueOg; + real rCheckI = value.checkIg[frameIdx]; + real rCheckF = value.checkFg[frameIdx]; + real rCheckO = value.checkOg[frameIdx]; + + rValueIn = value.gateValue[frameIdx]; + rValueIg = value.gateValue[frameIdx + frameSize]; + rValueFg = value.gateValue[frameIdx + frameSize * 2]; + rValueOg = value.gateValue[frameIdx + frameSize * 3]; + + if (value.prevStateValue) { + if (isBatch) value.prevStateValue += batchIdx * frameSize; + rPrevState = value.prevStateValue[frameIdx]; + } + + op(rValueIn, + rValueIg, + rValueFg, + rValueOg, + rPrevState, + rState, + rStateAtv, + rOut, + rCheckI, + rCheckF, + rCheckO, + hppl::gpu::forward[active_node], + hppl::gpu::forward[active_gate], + hppl::gpu::forward[active_state]); + + value.gateValue[frameIdx] = rValueIn; + value.gateValue[frameIdx + frameSize] = rValueIg; + value.gateValue[frameIdx + frameSize * 2] = rValueFg; + value.gateValue[frameIdx + frameSize * 3] = rValueOg; + + value.stateValue[frameIdx] = rState; + value.stateActiveValue[frameIdx] = rStateAtv; + value.outputValue[frameIdx] = rOut; +} + +/* + * threads(framePerBlock, batchPerBlock) + * grid(frameBlocks, batchBlocks) + */ +template 
+__global__ void KeLstmBackward(Op op, + hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (frameIdx >= frameSize) return; + + int batchIdx = 0; + if (isBatch) { + batchIdx = blockIdx.y * blockDim.y + threadIdx.y; + if (batchIdx >= batchSize) return; + value.gateValue += batchIdx * frameSize * 4; + value.stateValue += batchIdx * frameSize; + value.stateActiveValue += batchIdx * frameSize; + grad.gateGrad += batchIdx * frameSize * 4; + grad.stateGrad += batchIdx * frameSize; + grad.outputGrad += batchIdx * frameSize; + } + + real rValueIn; + real rValueIg; + real rValueFg; + real rValueOg; + real rGradIn; + real rGradIg; + real rGradFg; + real rGradOg; + real rPrevState = 0; + real rPrevStateGrad; + real rState; + real rStateGrad; + real rStateAtv; + real rOutputGrad; + real rCheckI = value.checkIg[frameIdx]; + real rCheckF = value.checkFg[frameIdx]; + real rCheckO = value.checkOg[frameIdx]; + real rCheckIGrad; + real rCheckFGrad; + real rCheckOGrad; + + rValueIn = value.gateValue[frameIdx]; + rValueIg = value.gateValue[frameIdx + frameSize]; + rValueFg = value.gateValue[frameIdx + frameSize * 2]; + rValueOg = value.gateValue[frameIdx + frameSize * 3]; + rState = value.stateValue[frameIdx]; + rStateAtv = value.stateActiveValue[frameIdx]; + rOutputGrad = grad.outputGrad[frameIdx]; + rStateGrad = grad.stateGrad[frameIdx]; + + if (value.prevStateValue) { + if (isBatch) value.prevStateValue += batchIdx * frameSize; + rPrevState = value.prevStateValue[frameIdx]; + } + + op(rValueIn, + rValueIg, + rValueFg, + rValueOg, + rGradIn, + rGradIg, + rGradFg, + rGradOg, + rPrevState, + rPrevStateGrad, + rState, + rStateGrad, + rStateAtv, + rOutputGrad, + rCheckI, + rCheckF, + rCheckO, + rCheckIGrad, + rCheckFGrad, + rCheckOGrad, + hppl::gpu::backward[active_node], + hppl::gpu::backward[active_gate], + hppl::gpu::backward[active_state]); + + grad.gateGrad[frameIdx] = rGradIn; + grad.gateGrad[frameIdx + frameSize ] = rGradIg; + grad.gateGrad[frameIdx + frameSize * 2] = rGradFg; + grad.gateGrad[frameIdx + frameSize * 3] = rGradOg; + grad.stateGrad[frameIdx] = rStateGrad; + if (grad.prevStateGrad) { + if (isBatch) grad.prevStateGrad += batchIdx * frameSize; + grad.prevStateGrad[frameIdx] = rPrevStateGrad; + } + + if (isBatch) { + if (value.prevStateValue) { + if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad); + if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad); + } + if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad); + } else { + if (value.prevStateValue) { + if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; + if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad; + } + if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad; + } +} + +template +void hl_gpu_lstm_forward(Op op, + hl_lstm_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + dim3 threads; + dim3 grid; + if (batchSize == 1) { + int framePerBlock = frameSize <= 1024 ? 
frameSize : 1024; + int frameBlocks = (frameSize + 1024 - 1) / 1024; + threads = dim3(framePerBlock, 1); + grid = dim3(frameBlocks, 1); + } else { + /* framePerBlock = 32 batchPerBlock = 32 */ + threads = dim3(32, 32); + grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + } + + if (batchSize == 1) { + KeLstmForward + <<>>(op, value, + frameSize, batchSize, active_node, active_gate, active_state); + } else { + KeLstmForward + <<>>(op, value, + frameSize, batchSize, active_node, active_gate, active_state); + } + + CHECK_SYNC("hl_gpu_lstm_forward failed"); +} + +template +void hl_gpu_lstm_backward(Op op, + hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + dim3 threads; + dim3 grid; + if (batchSize == 1) { + int framePerBlock = frameSize <= 1024 ? frameSize : 1024; + int frameBlocks = (frameSize + 1024 - 1) / 1024; + threads = dim3(framePerBlock, 1); + grid = dim3(frameBlocks, 1); + } else { + /* framePerBlock = 32 batchPerBlock = 32 */ + threads = dim3(32, 32); + grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + } + + if (batchSize == 1) { + KeLstmBackward + <<>>(op, value, grad, + frameSize, batchSize, active_node, active_gate, active_state); + } else { + KeLstmBackward + <<>>(op, value, grad, + frameSize, batchSize, active_node, active_gate, active_state); + } + + CHECK_SYNC("hl_gpu_lstm_backward failed"); +} + +#else + +template +void hl_gpu_lstm_forward(Op op, + hl_lstm_value value, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) {} + +template +void hl_gpu_lstm_backward(Op op, + hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) {} + +#endif + +#endif /* HL_GPU_LSTM_CUH_ */ diff --git a/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6177d23657fba5b2800041a3dd7b5f76bf35aa1a --- /dev/null +++ b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh @@ -0,0 +1,629 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + + +#ifndef HL_GPU_MATRIX_KERNEL_CUH_ +#define HL_GPU_MATRIX_KERNEL_CUH_ + +#include +#include "paddle/legacy/utils/Logging.h" +#include "hl_base.h" + +#ifdef __NVCC__ +/* gpu apply interface */ + +template +__global__ void KeEltWiseUnaryOp(T* A_d, const int border, Op op) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + op.gpuOperator(A_d[idx]); + } +} + +template +__global__ void KeEltWiseUnaryOp(T* A_d, + int dimM, + int dimN, + int lda, + Op op) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { + op.gpuOperator(A_d[i * lda + j]); + } + } +} + +template +__global__ void KeEltWiseBinaryOp(T* A_d, T *B_d, const int border, Op op) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + op.gpuOperator(A_d[idx], B_d[idx]); + } +} + +template +__global__ void KeEltWiseBinaryOp(T *A_d, + T *B_d, + int dimM, + int dimN, + int lda, + int ldb, + Op op) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { + if (BAsRowVector == 0 && BAsColVector == 0) { + op.gpuOperator(A_d[i * lda + j], B_d[i * ldb + j]); + } else if (BAsRowVector == 1 && BAsColVector == 0) { + op.gpuOperator(A_d[i * lda + j], B_d[j]); + } else if (BAsRowVector == 0 && BAsColVector == 1) { + op.gpuOperator(A_d[i * lda + j], B_d[i * ldb]); + } else { + op.gpuOperator(A_d[i * lda + j], B_d[0]); + } + } + } +} + +template +__global__ void KeEltWiseTernaryOp(T* A_d, + T *B_d, + T *C_d, + const int border, + Op op) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + op.gpuOperator(A_d[idx], B_d[idx], C_d[idx]); + } +} + +template +__global__ void KeEltWiseTernaryOp(T* A_d, + T* B_d, + T* C_d, + int dimM, + int dimN, + int lda, + int ldb, + int ldc, + Op op) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { + if (CAsRowVector == 0 && CAsColVector == 0) { + op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc + j]); + } else if (CAsRowVector == 1 && CAsColVector == 0) { + op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[j]); + } else if (CAsRowVector == 0 && CAsColVector == 1) { + op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc]); + } else { + op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[0]); + } + } + } +} + +template +__global__ void KeEltWiseQuaternaryOp(T* A_d, + T* B_d, + T* C_d, + T* D_d, + const int border, + Op op) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + op.gpuOperator(A_d[idx], B_d[idx], C_d[idx], D_d[idx]); + } +} + +template +__global__ void KeEltWiseQuaternaryOp(T* A_d, + T* B_d, + T* C_d, + T* D_d, + int dimM, + int dimN, + int lda, + int ldb, + int ldc, + int ldd, + Op op) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) { + op.gpuOperator(A_d[i*lda + j], + B_d[i*ldb + j], C_d[i*ldc + 
j], D_d[i*ldd + j]); + } + } +} + +/** + * @brief gpu element wise unary operator. + */ +template +void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) { + CHECK_NOTNULL(A_d); + + if (dimM == 1 || dimN == lda) { + int size = dimM * dimN; + int blockSize = size <= 1024 ? size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + KeEltWiseUnaryOp<<>> + (A_d, size, op); + } else { + int blockSizeY = std::min(32, dimM); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + KeEltWiseUnaryOp<<>> + (A_d, dimM, dimN, lda, op); + } + + CHECK_SYNC("hl_gpu_apply_unary_op failed"); +} + +/** + * @brief gpu element wise binary operator. + */ +template +void hl_gpu_apply_binary_op(Op op, + T* A_d, + T* B_d, + int dimM, + int dimN, + int lda, + int ldb) { + CHECK_NOTNULL(A_d); + + if ((BAsRowVector == 0 && BAsColVector == 0) && + ((dimM == 1) || (dimN == lda && dimN == ldb))) { + int size = dimM * dimN; + int blockSize = size <= 1024 ? size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + KeEltWiseBinaryOp<<>> + (A_d, B_d, size, op); + } else { + int blockSizeY = std::min(32, dimM); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + KeEltWiseBinaryOp + <<>> + (A_d, B_d, dimM, dimN, lda, ldb, op); + } + + CHECK_SYNC("hl_gpu_apply_binary_op failed"); +} + +/** + * @brief gpu element wise ternary operator. + */ +template +void hl_gpu_apply_ternary_op(Op op, + T* A_d, + T* B_d, + T* C_d, + int dimM, + int dimN, + int lda, + int ldb, + int ldc) { + CHECK_NOTNULL(A_d); + + if ((CAsRowVector == 0 && CAsColVector == 0) && + ((dimM == 1) || (dimN == lda && dimN == ldb && dimN == ldc))) { + int size = dimM * dimN; + int blockSize = size <= 1024 ? size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + KeEltWiseTernaryOp<<>> + (A_d, B_d, C_d, size, op); + } else { + int blockSizeY = std::min(32, dimM); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + KeEltWiseTernaryOp + <<>> + (A_d, B_d, C_d, dimM, dimN, lda, ldb, ldc, op); + } + + CHECK_SYNC("hl_gpu_apply_ternary_op failed"); +} + + +/** + * @brief gpu element wise quaternary operator. + */ +template +void hl_gpu_apply_quaternary_op(Op op, + T* A_d, + T* B_d, + T* C_d, + T* D_d, + int dimM, + int dimN, + int lda, + int ldb, + int ldc, + int ldd) { + CHECK_NOTNULL(A_d); + + if ((dimM == 1) || + (dimN == lda && dimN == ldb && dimN == ldc && dimN == ldd)) { + int size = dimM * dimN; + int blockSize = size <= 1024 ? 
size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + KeEltWiseQuaternaryOp<<>> + (A_d, B_d, C_d, D_d, size, op); + } else { + int blockSizeY = std::min(32, dimM); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + KeEltWiseQuaternaryOp<<>> + (A_d, B_d, C_d, D_d, dimM, dimN, lda, ldb, ldc, ldd, op); + } + + CHECK_SYNC("hl_gpu_apply_quaternary_op failed"); +} + +#else + +template +void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {} + +template +void hl_gpu_apply_binary_op(Op op, + T* A_d, + T* B_d, + int dimM, + int dimN, + int lda, + int ldb) {} + +template +void hl_gpu_apply_ternary_op(Op op, + T* A_d, + T* B_d, + T* C_d, + int dimM, + int dimN, + int lda, + int ldb, + int ldc) {} + +template +void hl_gpu_apply_quaternary_op(Op op, + T* A_d, + T* B_d, + T* C_d, + T* D_d, + int dimM, + int dimN, + int lda, + int ldb, + int ldc, + int ldd) {} +#endif + +#ifdef __NVCC__ +/** + * @brief matrix row operator. + */ + +template +__device__ __inline__ real sumRow(Agg agg, Op op, + int idx, int blockSize, + int dimN, real *A) { + real tmp = agg.init(); + int cnt = (dimN + blockSize -1) / blockSize; + for (int i = 0; i < cnt && idx < dimN; i++) { + tmp = agg(tmp, op(A[idx])); + idx += blockSize; + } + return tmp; +} + +template +__device__ __inline__ real sumRow(Agg agg, Op op, + int idx, int blockSize, + int dimN, real *A, real *B) { + real tmp = agg.init(); + int cnt = (dimN + blockSize -1) / blockSize; + for (int i = 0; i < cnt && idx < dimN; i++) { + tmp = agg(tmp, op(A[idx], B[idx])); + idx += blockSize; + } + return tmp; +} + +template +__device__ __inline__ void aggRow(Agg agg, real *row, int size, int tid) { + for (int stride = size/2; stride > 0; stride = stride/2) { + if (tid < stride) { + row[tid] = agg(row[tid], row[tid + stride]); + } + __syncthreads(); + } +} + +template +__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv, + int dimN, + real *dst, int ld, + real *A, int lda) { + __shared__ real row_s[blockSize]; + int rowId = blockIdx.x + blockIdx.y*gridDim.x; + int tid = threadIdx.x; + + A += rowId*lda; + row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A); + __syncthreads(); + + aggRow(agg, row_s, blockSize, tid); + __syncthreads(); + + if (tid == 0) { + dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]); + } +} + +template +__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv, + int dimN, + real *dst, int ld, + real *A, int lda, + real *B, int ldb) { + __shared__ real row_s[blockSize]; + int rowId = blockIdx.x + blockIdx.y*gridDim.x; + int tid = threadIdx.x; + + A += rowId*lda; + B += rowId*ldb; + row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A, B); + __syncthreads(); + + aggRow(agg, row_s, blockSize, tid); + __syncthreads(); + + if (tid == 0) { + dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]); + } +} + +/** + * @brief matrix column operator. 
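+ *
+ * Two kernel variants follow: KeMatrixColumnOp assigns one thread per column
+ * and loops serially over the dimM rows, while KeMatrixColumnOp_S lets
+ * blockDimY threads cooperate on each column and combines their partial
+ * results through the shared-memory buffer col_s.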
+ */ +template +__device__ __inline__ real sumCol(Agg agg, Op op, + int index, int stride, + int dimM, real *A, int lda) { + real tmp = agg.init(); + for (; index < dimM;) { + tmp = agg(tmp, op(A[index*lda])); + index += stride; + } + return tmp; +} + +template +__device__ __inline__ real sumCol(Agg agg, Op op, + int index, int stride, int dimM, + real *A, int lda, real *B, int ldb) { + real tmp = agg.init(); + for (; index < dimM;) { + tmp = agg(tmp, op(A[index*lda], B[index*ldb])); + index += stride; + } + return tmp; +} + +template +__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (rowIdx < dimN) { + A += rowIdx; + real tmp = sumCol(agg, op, 0, 1, dimM, A, lda); + dst[rowIdx] = sv(dst[rowIdx], tmp); + } +} + +template +__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { + __shared__ real col_s[blockDimX*blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + + if (rowIdx < dimN) { + A += rowIdx; + real tmp = sumCol(agg, op, threadIdx.y, blockDimY, dimM, A, lda); + col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp; + } + __syncthreads(); + + if (rowIdx < dimN) { + if (threadIdx.y ==0) { + real tmp = agg.init(); + for (int i=0; i < blockDimY; i++) { + tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]); + } + dst[rowIdx] = sv(dst[rowIdx], tmp); + } + } +} + +template +__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (rowIdx < dimN) { + A += rowIdx; + B += rowIdx; + real tmp = sumCol(agg, op, 0, 1, dimM, A, lda, B, ldb); + dst[rowIdx] = sv(dst[rowIdx], tmp); + } +} + +template +__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { + __shared__ real col_s[blockDimX*blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + + if (rowIdx < dimN) { + A += rowIdx; + B += rowIdx; + real tmp = sumCol(agg, op, + threadIdx.y, blockDimY, dimM, A, lda, B, ldb); + col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp; + } + __syncthreads(); + + if (rowIdx < dimN) { + if (threadIdx.y ==0) { + real tmp = agg.init(); + for (int i=0; i < blockDimY; i++) { + tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]); + } + dst[rowIdx] = sv(dst[rowIdx], tmp); + } + } +} + +#endif + +template +void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda) { +#ifdef __NVCC__ + CHECK_NOTNULL(dst); + CHECK_NOTNULL(A); + + int blocksX = dimM; + int blocksY = 1; + dim3 threads(128, 1); + dim3 grid(blocksX, blocksY); + KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> + (agg, op, sv, dimN, dst, ld, A, lda); + + CHECK_SYNC("hl_matrix_row_op failed"); +#endif +} + +template +void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, int ld, + real *A, int lda, + real *B, int ldb) { +#ifdef __NVCC__ + CHECK_NOTNULL(dst); + CHECK_NOTNULL(A); + + int blocksX = dimM; + int blocksY = 1; + dim3 threads(128, 1); + dim3 grid(blocksX, blocksY); + KeMatrixRowOp<<< grid, threads, 0, STREAM_DEFAULT >>> + (agg, op, sv, dimN, dst, ld, A, lda, B, ldb); + + CHECK_SYNC("hl_matrix_row_op failed"); +#endif +} + +template +void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda) { 
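+  // Launch strategy (mirroring the branches below): wide matrices
+  // (dimN >= 8192) use the one-thread-per-column kernel with 128-thread
+  // blocks; narrower ones use 32x32 blocks so that 32 threads share the
+  // reduction over rows for each column.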
+#ifdef __NVCC__ + if (dimN >= 8192) { + int blocksX = (dimN + 128 -1) / 128; + int blocksY = 1; + dim3 threads(128, 1); + dim3 grid(blocksX, blocksY); + KeMatrixColumnOp + <<< grid, threads, 0, STREAM_DEFAULT >>> + (agg, op, sv, dimM, dimN, dst, A, lda); + } else { + int blocksX = (dimN + 32 -1) / 32; + int blocksY = 1; + dim3 threads(32, 32); + dim3 grid(blocksX, blocksY); + KeMatrixColumnOp_S + <<< grid, threads, 0, STREAM_DEFAULT>>> + (agg, op, sv, dimM, dimN, dst, A, lda); + } + + CHECK_SYNC("hl_matrix_column_op failed"); +#endif +} + +template +void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv, + int dimM, int dimN, + real *dst, + real *A, int lda, + real *B, int ldb) { +#ifdef __NVCC__ + if (dimN >= 8192) { + int blocksX = (dimN + 128 -1) / 128; + int blocksY = 1; + dim3 threads(128, 1); + dim3 grid(blocksX, blocksY); + KeMatrixColumnOp + <<< grid, threads, 0, STREAM_DEFAULT >>> + (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } else { + int blocksX = (dimN + 32 -1) / 32; + int blocksY = 1; + dim3 threads(32, 32); + dim3 grid(blocksX, blocksY); + KeMatrixColumnOp_S + <<< grid, threads, 0, STREAM_DEFAULT>>> + (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb); + } + + CHECK_SYNC("hl_matrix_column_op failed"); +#endif +} + +#endif /* HL_GPU_MATRIX_KERNEL_CUH_ */ diff --git a/paddle/cuda/include/hl_gru_ops.cuh b/paddle/legacy/cuda/include/hl_gru_ops.cuh similarity index 100% rename from paddle/cuda/include/hl_gru_ops.cuh rename to paddle/legacy/cuda/include/hl_gru_ops.cuh diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/legacy/cuda/include/hl_lstm.h similarity index 100% rename from paddle/cuda/include/hl_lstm.h rename to paddle/legacy/cuda/include/hl_lstm.h diff --git a/paddle/cuda/include/hl_lstm_ops.cuh b/paddle/legacy/cuda/include/hl_lstm_ops.cuh similarity index 100% rename from paddle/cuda/include/hl_lstm_ops.cuh rename to paddle/legacy/cuda/include/hl_lstm_ops.cuh diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/legacy/cuda/include/hl_matrix.h similarity index 100% rename from paddle/cuda/include/hl_matrix.h rename to paddle/legacy/cuda/include/hl_matrix.h diff --git a/paddle/cuda/include/hl_matrix_apply.cuh b/paddle/legacy/cuda/include/hl_matrix_apply.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_apply.cuh rename to paddle/legacy/cuda/include/hl_matrix_apply.cuh diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/legacy/cuda/include/hl_matrix_base.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_base.cuh rename to paddle/legacy/cuda/include/hl_matrix_base.cuh diff --git a/paddle/cuda/include/hl_matrix_base_detail.cuh b/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_base_detail.cuh rename to paddle/legacy/cuda/include/hl_matrix_base_detail.cuh diff --git a/paddle/cuda/include/hl_matrix_ops.cuh b/paddle/legacy/cuda/include/hl_matrix_ops.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_ops.cuh rename to paddle/legacy/cuda/include/hl_matrix_ops.cuh diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/legacy/cuda/include/hl_matrix_type.cuh similarity index 100% rename from paddle/cuda/include/hl_matrix_type.cuh rename to paddle/legacy/cuda/include/hl_matrix_type.cuh diff --git a/paddle/cuda/include/hl_perturbation_util.cuh b/paddle/legacy/cuda/include/hl_perturbation_util.cuh similarity index 100% rename from paddle/cuda/include/hl_perturbation_util.cuh rename to paddle/legacy/cuda/include/hl_perturbation_util.cuh diff --git 
a/paddle/cuda/include/hl_recurrent_apply.cuh b/paddle/legacy/cuda/include/hl_recurrent_apply.cuh similarity index 100% rename from paddle/cuda/include/hl_recurrent_apply.cuh rename to paddle/legacy/cuda/include/hl_recurrent_apply.cuh diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/legacy/cuda/include/hl_sequence.h similarity index 100% rename from paddle/cuda/include/hl_sequence.h rename to paddle/legacy/cuda/include/hl_sequence.h diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/legacy/cuda/include/hl_sparse.h similarity index 100% rename from paddle/cuda/include/hl_sparse.h rename to paddle/legacy/cuda/include/hl_sparse.h diff --git a/paddle/cuda/include/hl_sparse.ph b/paddle/legacy/cuda/include/hl_sparse.ph similarity index 100% rename from paddle/cuda/include/hl_sparse.ph rename to paddle/legacy/cuda/include/hl_sparse.ph diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/legacy/cuda/include/hl_table_apply.h similarity index 100% rename from paddle/cuda/include/hl_table_apply.h rename to paddle/legacy/cuda/include/hl_table_apply.h diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/legacy/cuda/include/hl_tensor_ops.h similarity index 100% rename from paddle/cuda/include/hl_tensor_ops.h rename to paddle/legacy/cuda/include/hl_tensor_ops.h diff --git a/paddle/cuda/include/hl_thread.ph b/paddle/legacy/cuda/include/hl_thread.ph similarity index 100% rename from paddle/cuda/include/hl_thread.ph rename to paddle/legacy/cuda/include/hl_thread.ph diff --git a/paddle/cuda/include/hl_time.h b/paddle/legacy/cuda/include/hl_time.h similarity index 100% rename from paddle/cuda/include/hl_time.h rename to paddle/legacy/cuda/include/hl_time.h diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/legacy/cuda/include/hl_top_k.h similarity index 100% rename from paddle/cuda/include/hl_top_k.h rename to paddle/legacy/cuda/include/hl_top_k.h diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h similarity index 100% rename from paddle/cuda/include/hl_warpctc_wrap.h rename to paddle/legacy/cuda/include/hl_warpctc_wrap.h diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_aggregate_stub.h rename to paddle/legacy/cuda/include/stub/hl_aggregate_stub.h diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cnn_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cnn_stub.h rename to paddle/legacy/cuda/include/stub/hl_cnn_stub.h diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cuda_cublas_stub.h rename to paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cuda_cudnn_stub.h rename to paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_cuda_stub.h rename to paddle/legacy/cuda/include/stub/hl_cuda_stub.h diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/legacy/cuda/include/stub/hl_lstm_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_lstm_stub.h rename to 
paddle/legacy/cuda/include/stub/hl_lstm_stub.h diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/legacy/cuda/include/stub/hl_matrix_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_matrix_stub.h rename to paddle/legacy/cuda/include/stub/hl_matrix_stub.h diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/legacy/cuda/include/stub/hl_sequence_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_sequence_stub.h rename to paddle/legacy/cuda/include/stub/hl_sequence_stub.h diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/legacy/cuda/include/stub/hl_sparse_stub.h similarity index 100% rename from paddle/cuda/include/stub/hl_sparse_stub.h rename to paddle/legacy/cuda/include/stub/hl_sparse_stub.h diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/legacy/cuda/src/avx_mathfun.h similarity index 100% rename from paddle/cuda/src/avx_mathfun.h rename to paddle/legacy/cuda/src/avx_mathfun.h diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/legacy/cuda/src/hl_avx_functions.cc similarity index 100% rename from paddle/cuda/src/hl_avx_functions.cc rename to paddle/legacy/cuda/src/hl_avx_functions.cc diff --git a/paddle/cuda/src/hl_batch_norm.cu b/paddle/legacy/cuda/src/hl_batch_norm.cu similarity index 100% rename from paddle/cuda/src/hl_batch_norm.cu rename to paddle/legacy/cuda/src/hl_batch_norm.cu diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/legacy/cuda/src/hl_batch_transpose.cu similarity index 100% rename from paddle/cuda/src/hl_batch_transpose.cu rename to paddle/legacy/cuda/src/hl_batch_transpose.cu diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/legacy/cuda/src/hl_cpu_functions.cc similarity index 100% rename from paddle/cuda/src/hl_cpu_functions.cc rename to paddle/legacy/cuda/src/hl_cpu_functions.cc diff --git a/paddle/legacy/cuda/src/hl_cuda_aggregate.cu b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu new file mode 100644 index 0000000000000000000000000000000000000000..9831c5ecc340135c27b49d24715c63f8a8dfa8e9 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu @@ -0,0 +1,293 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_aggregate.h" +#include "hl_base.h" +#include "hl_cuda.h" +#include "hl_cuda.ph" +#include "hl_matrix_base.cuh" +#include "hl_thread.ph" +#include "paddle/legacy/utils/Logging.h" + +/** + * @brief matrix row operator. 
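+ *
+ * One thread block reduces one row of E: each of the blockSize threads
+ * accumulates a strided slice of the row into sum_s, a shared-memory tree
+ * reduction leaves the aggregate in sum_s[0], and thread 0 writes it to
+ * Sum[rowId].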
+ */ +template +__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) { + __shared__ real sum_s[blockSize]; + int cnt = (dimN + blockSize - 1) / blockSize; + int rowId = blockIdx.x + blockIdx.y * gridDim.x; + int index = rowId * dimN; + int tid = threadIdx.x; + int lmt = tid; + + real tmp = agg.init(); + for (int ii = 0; ii < cnt && lmt < dimN; ii++) { + tmp = agg(tmp, E[index + lmt]); + lmt += blockSize; + } + sum_s[tid] = tmp; + __syncthreads(); + + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { + if (tid < stride) { + sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]); + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + Sum[rowId] = sum_s[0]; + } +} + +template +void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { + int blocksX = dimM; + int blocksY = 1; + dim3 threads(128, 1); + dim3 grid(blocksX, blocksY); + + KeMatrixRowOp<<>>( + agg, A_d, C_d, dimN); +} + +void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN); + CHECK_SYNC("hl_matrix_row_sum failed"); +} + +void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN); + CHECK_SYNC("hl_matrix_row_max failed"); +} + +void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN); + CHECK_SYNC("hl_matrix_row_min failed"); +} + +/** + * @brief matrix column operator. + */ +template +__global__ void KeMatrixColumnOp( + Agg agg, real *E, real *Sum, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + real tmp = agg.init(); + if (rowIdx < dimN) { + for (int index = 0; index < dimM; index++) { + tmp = agg(tmp, E[dimN * index + rowIdx]); + } + Sum[rowIdx] = tmp; + } +} + +template +__global__ void KeMatrixColumnOp_S( + Agg agg, real *E, real *Sum, int dimM, int dimN) { + __shared__ real _sum[blockDimX * blockDimY]; + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int index = threadIdx.y; + + real tmp = agg.init(); + if (rowIdx < dimN) { + for (; index < dimM;) { + tmp = agg(tmp, E[dimN * index + rowIdx]); + index += blockDimY; + } + } + _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp; + __syncthreads(); + + if (rowIdx < dimN) { + if (threadIdx.y == 0) { + real tmp = agg.init(); + for (int i = 0; i < blockDimY; i++) { + tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]); + } + Sum[rowIdx] = tmp; + } + } +} + +template +void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) { + if (dimN >= 8192) { + int blocksX = (dimN + 128 - 1) / 128; + int blocksY = 1; + dim3 threads(128, 1); + dim3 grid(blocksX, blocksY); + KeMatrixColumnOp<<>>( + agg, A_d, C_d, dimM, dimN); + } else { + int blocksX = (dimN + 32 - 1) / 32; + int blocksY = 1; + dim3 threads(32, 32); + dim3 grid(blocksX, blocksY); + KeMatrixColumnOp_S<<>>( + agg, A_d, C_d, dimM, dimN); + } + + return; +} + +void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN); + + CHECK_SYNC("hl_matrix_column_sum failed"); +} + +void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN); + + 
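+  // hl_matrix_column_op only launches the kernels and returns without
+  // synchronizing; error checking is left to the CHECK_SYNC calls in these
+  // wrappers. Hypothetical usage: hl_matrix_column_max(A_d, C_d, 1000, 512)
+  // writes the per-column maxima of the 1000 x 512 row-major matrix A_d
+  // into C_d[0..511].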
CHECK_SYNC("hl_matrix_column_max failed"); +} + +void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN); + + CHECK_SYNC("hl_matrix_column_min failed"); +} + +template +__global__ void KeVectorSum(real *E, real *Sum, int dimM) { + __shared__ double sum_s[blockSize]; + int tid = threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; + + sum_s[tid] = 0.0f; + while (index < dimM) { + sum_s[tid] += E[index]; + index += blockDim.x * gridDim.y; + } + __syncthreads(); + + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { + if (tid < stride) { + sum_s[tid] += sum_s[tid + stride]; + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + Sum[blockIdx.y] = sum_s[0]; + } +} + +void hl_vector_sum(real *A_d, real *C_h, int dimM) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_h); + + int blockSize = 128; + int gridSize = 128; + int blocksX = 1; + int blocksY = gridSize; + dim3 threads(blockSize, 1); + dim3 grid(blocksX, blocksY); + + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + hl_event_t hl_event = &hl_event_st; + while (!hl_cuda_event_is_ready(hl_event)) { + } + + KeVectorSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); + + hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); + hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); + + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + cudaError_t err = (cudaError_t)hl_get_device_last_error(); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); +} + +template +__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) { + __shared__ double sum_s[blockSize]; + int tid = threadIdx.x; + int index = blockIdx.y * blockDim.x + threadIdx.x; + + sum_s[tid] = 0.0f; + while (index < dimM) { + sum_s[tid] += abs(E[index]); + index += blockDim.x * gridDim.y; + } + __syncthreads(); + + for (int stride = blockSize / 2; stride > 0; stride = stride / 2) { + if (tid < stride) { + sum_s[tid] += sum_s[tid + stride]; + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + Sum[blockIdx.y] = sum_s[0]; + } +} + +void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_h); + + int blockSize = 128; + int gridSize = 128; + int blocksX = 1; + int blocksY = gridSize; + dim3 threads(blockSize, 1); + dim3 grid(blocksX, blocksY); + + struct _hl_event_st hl_event_st = {.cu_event = t_resource.event}; + hl_event_t hl_event = &hl_event_st; + while (!hl_cuda_event_is_ready(hl_event)) { + } + + KeVectorAbsSum<128><<>>( + A_d, t_resource.gpu_mem, dimM); + KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>( + t_resource.gpu_mem, t_resource.cpu_mem, 128); + + hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT); + hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event); + + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + cudaError_t err = (cudaError_t)hl_get_device_last_error(); + CHECK_EQ(cudaSuccess, err) << "CUDA error: " + << hl_get_device_error_string((size_t)err); +} diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/legacy/cuda/src/hl_cuda_cnn.cu similarity index 100% rename from paddle/cuda/src/hl_cuda_cnn.cu rename to paddle/legacy/cuda/src/hl_cuda_cnn.cu diff --git a/paddle/legacy/cuda/src/hl_cuda_cublas.cc b/paddle/legacy/cuda/src/hl_cuda_cublas.cc new file mode 100644 
index 0000000000000000000000000000000000000000..283b8b6e9c8e7b843a8d28b940c6ef53b77ef655 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_cublas.cc @@ -0,0 +1,400 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_cuda_cublas.h" +#include +#include "hl_cuda.h" +#include "hl_thread.ph" +#include "paddle/legacy/utils/DynamicLoader.h" +#include "paddle/legacy/utils/Logging.h" + +namespace dynload { + +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... 
args) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) + +// include all needed cublas functions in HPPL +// clang-format off +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSgemv) \ + __macro(cublasDgemv) \ + __macro(cublasSgemm) \ + __macro(cublasDgemm) \ + __macro(cublasSgeam) \ + __macro(cublasDgeam) \ + +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode) +DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) +CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) + +#undef DYNAMIC_LOAD_CUBLAS_WRAP +#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP +#undef CUBLAS_BLAS_ROUTINE_EACH + +} /* namespace dynload */ + +// clang-format on +#ifndef PADDLE_TYPE_DOUBLE +#define CUBLAS_GEAM dynload::cublasSgeam +#define CUBLAS_GEMV dynload::cublasSgemv +#define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GETRF dynload::cublasSgetrfBatched +#define CUBLAS_GETRI dynload::cublasSgetriBatched +#else +#define CUBLAS_GEAM dynload::cublasDgeam +#define CUBLAS_GEMV dynload::cublasDgemv +#define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GETRF dynload::cublasDgetrfBatched +#define CUBLAS_GETRI dynload::cublasDgetriBatched +#endif + +const char *hl_cublas_get_error_string(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_NOT_INITIALIZED: + return "[cublas status]: not initialized"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "[cublas status]: allocate failed"; + case CUBLAS_STATUS_INVALID_VALUE: + return "[cublas status]: invalid value"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "[cublas status]: arch mismatch"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "[cublas status]: mapping error"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "[cublas status]: execution failed"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "[cublas status]: internal error"; + case CUBLAS_STATUS_SUCCESS: + return "[cublas status]: success"; + default: + return "[cublas status]: unknown error"; + } +} + +/** + * Check build-in cublas function using glog and it also + * support << operator for more details error info. 
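+ *
+ * Typical use (as in hl_cublas_init below):
+ *   CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
+ *       << "[cublas init] Cublas set stream failed!";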
+ */ +cublasStatus_t g_cublasStat; +#define CHECK_CUBLAS(cublas_func) \ + g_cublasStat = cublas_func; \ + CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \ + << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " " + +void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) { + CHECK_CUBLAS(dynload::cublasCreate(cublas_handle)) + << "[cublas init] Cublas create handle faild!"; + + CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream)) + << "[cublas init] Cublas set stream faild!"; +} + +void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) { + real alpha = 1.0; + real beta = 0.0; + + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + dimM, + dimN, + &alpha, + A_d, + lda, + &beta, + nullptr, + dimM, + C_d, + ldc)); + CHECK_SYNC("hl_matrix_transpose failed"); +} + +void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) { + hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM); +} + +void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { + /* Solve Ax = I */ + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + /* Step 1: Compute the LU decomposition of matrix A */ + real **inout_h = &A_d; + real **inout_d = (real **)hl_malloc_device(sizeof(real *)); + hl_memcpy(inout_d, inout_h, sizeof(real *)); + + int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int)); + int *info_d = (int *)t_resource.gpu_mem; + + /* Note: cublasSgetrfBatched is used to calculate a number of + small-sized matrices. There may be a better way to reconstruct + the API for better performance. + */ + CHECK_CUBLAS( + CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1)); + + int info_h; + hl_memcpy(&info_h, info_d, sizeof(int)); + if (info_h != 0) { + LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; + } + + /* Step 2: Compute the inverse of the matrix given its LU decomposition */ + real **out_h = &C_d; + real **out_d = (real **)hl_malloc_device(sizeof(real *)); + hl_memcpy(out_d, out_h, sizeof(real *)); + + CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, + dimN, + (const real **)inout_d, + lda, + pivot_d, + out_d, + ldc, + info_d, + 1)); + + hl_memcpy(&info_h, info_d, sizeof(int)); + if (info_h != 0) { + LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n"; + } + + hl_free_mem_device(inout_d); + hl_free_mem_device(pivot_d); + hl_free_mem_device(out_d); + + CHECK_SYNC("hl_matrix_inverse failed"); +} + +void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + + if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) { + int m = (transa == HPPL_OP_N) ? dimM : dimK; + int n = (transa == HPPL_OP_N) ? dimK : dimM; + hl_matrix_mul_vector( + A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc); + return; + } + + if (dimM == 1 && dimN != 1 && dimK != 1 && transa == HPPL_OP_N) { + int m = (transb == HPPL_OP_N) ? dimK : dimN; + int n = (transb == HPPL_OP_N) ? dimN : dimK; + hl_trans_op_t trans = (transb == HPPL_OP_N) ? 
HPPL_OP_T : HPPL_OP_N; + hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1); + return; + } + + cublasStatus_t stat; + if ((HPPL_OP_N == transa) && (HPPL_OP_N == transb)) { + stat = CUBLAS_GEMM(t_resource.handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); + } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) { + stat = CUBLAS_GEMM(t_resource.handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); + } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) { + stat = CUBLAS_GEMM(t_resource.handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); + } else { + LOG(FATAL) << "parameter transa error!"; + } + CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat); + CHECK_SYNC("hl_matrix_mul failed"); +} + +void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { + int lda = (HPPL_OP_N == transa) ? dimK : dimM; + int ldb = (HPPL_OP_N == transb) ? dimN : dimK; + int ldc = dimN; + + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + alpha, + beta, + lda, + ldb, + ldc); +} + +void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta, + int lda, + int incb, + int incc) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + + cublasStatus_t stat; + if (HPPL_OP_N == trans) { + stat = CUBLAS_GEMV(t_resource.handle, + CUBLAS_OP_T, + dimN, + dimM, + &alpha, + A_d, + lda, + B_d, + incb, + &beta, + C_d, + incc); + } else if (HPPL_OP_T == trans) { + stat = CUBLAS_GEMV(t_resource.handle, + CUBLAS_OP_N, + dimN, + dimM, + &alpha, + A_d, + lda, + B_d, + incb, + &beta, + C_d, + incc); + } else { + LOG(FATAL) << "parameter transa error!"; + } + + CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat); + CHECK_SYNC("hl_matrix_mul_vector"); +} + +void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta) { + hl_matrix_mul_vector( + A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1); +} diff --git a/paddle/legacy/cuda/src/hl_cuda_cudnn.cc b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0ac5aaac284cd939fc46be6a7320242312674ab --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc @@ -0,0 +1,1117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "hl_cuda_cudnn.h" +#include +#include +#include "hl_cuda_cudnn.ph" +#include "hl_thread.ph" +#include "paddle/legacy/utils/DynamicLoader.h" +#include "paddle/legacy/utils/Logging.h" + +DEFINE_int32(cudnn_conv_workspace_limit_in_mb, + 4096, + "Specify cuDNN max workspace limit, in units MB, " + "4096MB=4GB by default."); + +namespace dynload { + +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cudbnn routine + * via operator overloading: operator () + * + * note: default dynamic linked libs + **/ + +#ifdef PADDLE_USE_DSO + +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ + +#else + +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ + +#endif + +/** + * include all needed cudnn functions in HPPL + * different cudnn version has different interfaces + **/ +// clang-format off +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor) \ + __macro(cudnnSetTensor4dDescriptorEx) \ + __macro(cudnnGetConvolutionNdForwardOutputDim) \ + __macro(cudnnGetConvolutionForwardAlgorithm) \ + __macro(cudnnCreateTensorDescriptor) \ + __macro(cudnnDestroyTensorDescriptor) \ + __macro(cudnnCreateFilterDescriptor) \ + __macro(cudnnSetFilter4dDescriptor) \ + __macro(cudnnSetPooling2dDescriptor) \ + __macro(cudnnDestroyFilterDescriptor) \ + __macro(cudnnCreateConvolutionDescriptor) \ + __macro(cudnnCreatePoolingDescriptor) \ + __macro(cudnnDestroyPoolingDescriptor) \ + __macro(cudnnSetConvolution2dDescriptor) \ + __macro(cudnnDestroyConvolutionDescriptor) \ + __macro(cudnnCreate) \ + __macro(cudnnDestroy) \ + __macro(cudnnSetStream) \ + __macro(cudnnActivationForward) \ + __macro(cudnnConvolutionForward) \ + __macro(cudnnConvolutionBackwardBias) \ + __macro(cudnnGetConvolutionForwardWorkspaceSize) \ + __macro(cudnnTransformTensor) \ + __macro(cudnnPoolingForward) \ + __macro(cudnnPoolingBackward) \ + __macro(cudnnSoftmaxBackward) \ + __macro(cudnnSoftmaxForward) \ + __macro(cudnnGetVersion) \ + __macro(cudnnGetErrorString) +CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor) \ + __macro(cudnnConvolutionBackwardData) \ + __macro(cudnnConvolutionBackwardFilter) +CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#if CUDNN_VERSION >= 3000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm) \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm) \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize) +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +#endif + + +// APIs available after R4: +#if CUDNN_VERSION >= 4007 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining) \ + 
__macro(cudnnBatchNormalizationForwardInference) \ + __macro(cudnnBatchNormalizationBackward) +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +#endif + +// APIs in R5 +#if CUDNN_VERSION >= 5000 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor) \ + __macro(cudnnSetActivationDescriptor) \ + __macro(cudnnGetActivationDescriptor) \ + __macro(cudnnDestroyActivationDescriptor) +CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) +#undef CUDNN_DNN_ROUTINE_EACH_R5 +#endif + +#undef CUDNN_DNN_ROUTINE_EACH +// clang-format on +} /* namespace dynload */ + +/** + * Check build-in cudnn function using glog and it **does not** + * support << operator for more details error info. + */ +#define CHECK_CUDNN(cudnnFunc) \ + do { \ + cudnnStatus_t cudnnStat = cudnnFunc; \ + CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \ + << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \ + } while (0) + +bool g_is_libcudnn_init = false; +int g_cudnn_lib_version = 0; + +void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) { + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); +} + +void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) { + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + size_t cudnn_dso_major = cudnn_dso_ver / 1000; + size_t cudnn_cuh_major = CUDNN_VERSION / 1000; + + // Compare cudnn header version with that of cudnn.so. + CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) || + (cudnn_cuh_major == cudnn_dso_major)) + << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v" + << cudnn_cuh_major << " unmatched!\n" + << "PaddlePaddle Requirement: " + << "(header v[2-3] with libcudnn v[2-3]) Or " + << "(header v4 with libcudnn v4) Or " + << "(header v5 with libcudnn v5) Or" + << "(header v6 with libcudnn v6)."; + + CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050)) + << "cudnn v5 requires cuda version >= 7.5"; + + CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000)) + << "cudnn v6 requires cuda version >= 8.0"; + + CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle)); + CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream)); + + g_is_libcudnn_init = true; + g_cudnn_lib_version = cudnn_dso_ver; +} + +int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; } + +void hl_conv_workspace(hl_tensor_descriptor input, + hl_tensor_descriptor output, + hl_filter_descriptor filter, + hl_convolution_descriptor conv, + int* convFwdAlgo, + size_t* fwdLimitBytes, + int* convBwdDataAlgo, + size_t* bwdDataLimitBytes, + int* convBwdFilterAlgo, + size_t* bwdFilterLimitBytes, + bool useDilation) { +#if CUDNN_VERSION >= 4000 + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + + // Specify workspace limit directly + size_t memoryLimitBytes = + (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; + + // For dilation + int algo = 0; + + // cudnn convolution forward configuration + cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + // cudnn convolution backward data configuration + cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t 
bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnConvolutionDescriptor_t bwd_data_conv_desc = + GET_CONVOLUTION_DESCRIPTOR(conv); + // cudnn convolution backward filter configuration + cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnConvolutionDescriptor_t bwd_filter_conv_desc = + GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter); + + if (useDilation) { + convFwdAlgo = &algo; + convBwdDataAlgo = &algo; + convBwdFilterAlgo = &algo; + } else { + CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( + t_resource.cudnn_handle, + fwd_src_desc, + fwd_filter_desc, + fwd_conv_desc, + fwd_dest_desc, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convFwdAlgo))); + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdDataAlgo))); + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdFilterAlgo))); + } + + CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( + t_resource.cudnn_handle, + fwd_src_desc, + fwd_filter_desc, + fwd_conv_desc, + fwd_dest_desc, + static_cast(*convFwdAlgo), + fwdLimitBytes)); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + static_cast(*convBwdDataAlgo), + bwdDataLimitBytes)); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + static_cast(*convBwdFilterAlgo), + bwdFilterLimitBytes)); + +#endif +} + +void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, + int batch_size, + int feature_maps, + int height, + int width) { + CHECK_NOTNULL(image_desc); + + cudnn_tensor_descriptor hl_desc = + (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); + CHECK_NOTNULL(hl_desc); + +#ifndef PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); + + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc, + CUDNN_TENSOR_NCHW, + data_type, + batch_size, + feature_maps, + height, + width)); + + hl_desc->format = CUDNN_TENSOR_NCHW; + hl_desc->data_type = data_type; + hl_desc->batch_size = batch_size; + hl_desc->feature_maps = feature_maps; + hl_desc->height = height; + hl_desc->width = width; + + *image_desc = (hl_tensor_descriptor)hl_desc; +} + +void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { + CHECK_NOTNULL(image_desc); + + cudnn_tensor_descriptor hl_desc = + (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); + CHECK_NOTNULL(hl_desc); + +#ifndef PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + 
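+  // This overload only creates the cuDNN descriptor handle and records the
+  // element type; the shape and strides are filled in later through
+  // hl_tensor_reshape() before the descriptor is used.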
CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); + + hl_desc->data_type = data_type; + + *image_desc = (hl_tensor_descriptor)hl_desc; +} + +void hl_tensor_reshape(hl_tensor_descriptor image_desc, + int batch_size, + int feature_maps, + int height, + int width) { + const int stride_w = 1; + const int stride_h = width * stride_w; + const int stride_c = height * stride_h; + const int stride_n = feature_maps * stride_c; + return hl_tensor_reshape(image_desc, + batch_size, + feature_maps, + height, + width, + stride_n, + stride_c, + stride_h, + stride_w); +} + +void hl_tensor_reshape(hl_tensor_descriptor image_desc, + int batch_size, + int feature_maps, + int height, + int width, + int nStride, + int cStride, + int hStride, + int wStride) { + CHECK_NOTNULL(image_desc); + + cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; + CHECK_NOTNULL(hl_desc->desc); + + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc, + hl_desc->data_type, + batch_size, + feature_maps, + height, + width, + nStride, + cStride, + hStride, + wStride)); + + hl_desc->batch_size = batch_size; + hl_desc->feature_maps = feature_maps; + hl_desc->height = height; + hl_desc->width = width; +} + +void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) { + CHECK_NOTNULL(image_desc); + + cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; + CHECK_NOTNULL(hl_desc->desc); + + CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc)); + + hl_desc->desc = NULL; + + free(image_desc); +} + +void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, + hl_pooling_mode_t mode, + int height, + int width, + int height_padding, + int width_padding, + int stride_height, + int stride_width) { + cudnnPoolingMode_t cudnn_mode; + switch (mode) { + case HL_POOLING_MAX: + cudnn_mode = CUDNN_POOLING_MAX; + break; + case HL_POOLING_AVERAGE: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + case HL_POOLING_AVERAGE_INCLUDE_PADDING: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + default: + LOG(FATAL) << "parameter mode error"; + } + + CHECK_NOTNULL(pooling_desc); + + cudnn_pooling_descriptor hl_pooling_desc = + (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor)); + CHECK_NOTNULL(hl_pooling_desc); + + CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc)); + + CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc, + cudnn_mode, +#if CUDNN_VERSION >= 5000 + CUDNN_PROPAGATE_NAN, +#endif + height, + width, + height_padding, + width_padding, + stride_height, + stride_width)); + + hl_pooling_desc->mode = cudnn_mode; + hl_pooling_desc->window_height = height; + hl_pooling_desc->window_width = width; + hl_pooling_desc->stride_height = stride_height; + hl_pooling_desc->stride_width = stride_width; + + *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; +} + +void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) { + CHECK_NOTNULL(pooling_desc); + + cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; + + CHECK_NOTNULL(hl_pooling->desc); + CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); + + hl_pooling->desc = NULL; + + free(pooling_desc); +} + +void hl_pooling_forward(hl_tensor_descriptor input, + real* input_image, + hl_tensor_descriptor output, + real* output_image, + hl_pooling_descriptor pooling) { + cudnnPoolingDescriptor_t pooling_desc; + cudnnTensorDescriptor_t input_desc; + cudnnTensorDescriptor_t 
output_desc; + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(pooling); + CHECK_NOTNULL(input_image); + CHECK_NOTNULL(output_image); + + real alpha = 1.0f; + real beta = 1.0f; + input_desc = ((cudnn_tensor_descriptor)input)->desc; + output_desc = ((cudnn_tensor_descriptor)output)->desc; + pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; + CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle, + pooling_desc, + &alpha, + input_desc, + input_image, + &beta, + output_desc, + output_image)); + CHECK_SYNC("hl_pooling_forward failed"); +} + +void hl_pooling_backward(hl_tensor_descriptor input, + real* input_image, + real* input_image_grad, + hl_tensor_descriptor output, + real* output_image, + real* output_image_grad, + hl_pooling_descriptor pooling) { + cudnnPoolingDescriptor_t pooling_desc; + cudnnTensorDescriptor_t input_desc; + cudnnTensorDescriptor_t output_desc; + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(pooling); + CHECK_NOTNULL(input_image); + CHECK_NOTNULL(input_image_grad); + CHECK_NOTNULL(output_image); + CHECK_NOTNULL(output_image_grad); + + real alpha = 1.0f; + real beta = 1.0f; + input_desc = ((cudnn_tensor_descriptor)input)->desc; + output_desc = ((cudnn_tensor_descriptor)output)->desc; + pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; + CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle, + pooling_desc, + &alpha, + output_desc, + output_image, + output_desc, + output_image_grad, + input_desc, + input_image, + &beta, + input_desc, + input_image_grad)); + CHECK_SYNC("hl_pooling_backward failed"); +} + +void hl_create_filter_descriptor(hl_filter_descriptor* filter, + int input_feature_maps, + int output_feature_maps, + int height, + int width) { + CHECK_NOTNULL(filter); + + cudnn_filter_descriptor hl_filter = + (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor)); + CHECK_NOTNULL(hl_filter); + + CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); + +#ifndef PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc, + data_type, +#if CUDNN_VERSION >= 5000 + CUDNN_TENSOR_NCHW, +#endif + output_feature_maps, + input_feature_maps, + height, + width)); + + hl_filter->data_type = data_type; + hl_filter->output_feature_maps = output_feature_maps; + hl_filter->input_feature_maps = input_feature_maps; + hl_filter->filter_height = height; + hl_filter->filter_width = width; + + *filter = (hl_filter_descriptor)hl_filter; +} + +void hl_destroy_filter_descriptor(hl_filter_descriptor filter) { + CHECK_NOTNULL(filter); + + cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; + CHECK_NOTNULL(hl_filter->desc); + + CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc)); + + hl_filter->desc = NULL; + + free(filter); +} + +void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, + hl_tensor_descriptor image, + hl_filter_descriptor filter, + int padding_height, + int padding_width, + int stride_height, + int stride_width, + int dilation_h, + int dilation_w) { + CHECK_NOTNULL(conv); + + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc( + sizeof(_cudnn_convolution_descriptor)); + + CHECK_NOTNULL(hl_conv); + CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); + + cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; + +#if CUDNN_VERSION >= 6000 +#ifndef 
PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, + padding_height, + padding_width, + stride_height, + stride_width, + dilation_h, + dilation_w, + mode, + data_type)); +#else + if (dilation_h > 1 || dilation_w > 1) { + LOG(FATAL) + << "Current cuDNN version does't support for dilation convolution. " + << "The dilation convolution requires cuDNN >= v6.0."; + } + + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, + padding_height, + padding_width, + stride_height, + stride_width, + dilation_h, + dilation_w, + mode)); +#endif + + hl_conv->input_image = image; + hl_conv->filter = filter; + hl_conv->padding_height = padding_height; + hl_conv->padding_width = padding_width; + hl_conv->stride_height = stride_height; + hl_conv->stride_width = stride_width; + hl_conv->upscalex = 1; + hl_conv->upscaley = 1; + hl_conv->mode = mode; + + *conv = (hl_convolution_descriptor)hl_conv; +} + +void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, + hl_tensor_descriptor image, + hl_filter_descriptor filter, + int padding_height, + int padding_width, + int stride_height, + int stride_width, + int dilation_h, + int dilation_w) { + CHECK_NOTNULL(conv); + CHECK_NOTNULL(image); + CHECK_NOTNULL(filter); + + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; + +#if CUDNN_VERSION >= 6000 +#ifndef PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc, + padding_height, + padding_width, + stride_height, + stride_width, + dilation_h, + dilation_w, + mode, + data_type)); +#else + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc, + padding_height, + padding_width, + stride_height, + stride_width, + dilation_h, + dilation_w, + mode)); +#endif + + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; + hl_conv->input_image = image; + hl_conv->filter = filter; + hl_conv->padding_height = padding_height; + hl_conv->padding_width = padding_width; + hl_conv->stride_height = stride_height; + hl_conv->stride_width = stride_width; + hl_conv->upscalex = 1; + hl_conv->upscaley = 1; + hl_conv->mode = mode; +} + +void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) { + CHECK_NOTNULL(conv); + + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; + CHECK_NOTNULL(hl_conv->desc); + + CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc)); + hl_conv->desc = NULL; + + free(conv); +} + +void hl_convolution_forward(hl_tensor_descriptor input, + real* input_data, + hl_tensor_descriptor output, + real* output_data, + hl_filter_descriptor filter, + real* filter_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convFwdAlgo) { + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + CHECK_NOTNULL(input_data); + CHECK_NOTNULL(output_data); + CHECK_NOTNULL(filter_data); + cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + real alpha = 1.0f; + real 
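// Illustrative sketch (general convolution arithmetic, not from the patch):
// given the padding/stride/dilation parameters passed to
// cudnnSetConvolution2dDescriptor above, the output spatial extent follows
// the usual formula, shown here as a hypothetical helper.
inline int conv_out_size(int in, int kernel, int pad, int stride, int dilation) {
  const int effective_kernel = dilation * (kernel - 1) + 1;  // dilated footprint
  return (in + 2 * pad - effective_kernel) / stride + 1;
}
// e.g. in = 32, kernel = 3, pad = 1, stride = 1, dilation = 1  ->  32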
beta = 1.0f; + CHECK_CUDNN(dynload::cudnnConvolutionForward( + t_resource.cudnn_handle, + &alpha, + src_desc, + input_data, + filter_desc, + filter_data, + conv_desc, + static_cast(convFwdAlgo), + gpuWorkSpace, + sizeInBytes, + &beta, + dest_desc, + output_data)); + CHECK_SYNC("hl_convolution_forward failed"); +} + +void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, + real* bias_data, + hl_tensor_descriptor output, + real* output_data) { + CHECK_NOTNULL(bias); + CHECK_NOTNULL(output); + CHECK_NOTNULL(bias_data); + CHECK_NOTNULL(output_data); + + cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); + real alpha = 1.0f; + real beta = 1.0f; + + CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle, +#if CUDNN_VERSION < 4000 + CUDNN_ADD_SAME_C, +#endif + &alpha, + bias_desc, + bias_data, + &beta, + output_desc, + output_data)); + CHECK_SYNC("hl_convolution_forward_add_bias failed"); +} + +void hl_convolution_backward_bias(hl_tensor_descriptor bias, + real* bias_grad_data, + hl_tensor_descriptor output, + real* output_grad_data) { + CHECK_NOTNULL(bias); + CHECK_NOTNULL(output); + CHECK_NOTNULL(bias_grad_data); + CHECK_NOTNULL(output_grad_data); + + real alpha = 1.0f; + real beta = 1.0f; + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); + CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle, + &alpha, + diff_desc, + output_grad_data, + &beta, + bias_desc, + bias_grad_data)); + CHECK_SYNC("hl_convolution_backward_bias failed"); +} + +void hl_convolution_backward_filter(hl_tensor_descriptor input, + real* input_data, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_grad_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdFilterAlgo) { + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + CHECK_NOTNULL(input_data); + CHECK_NOTNULL(output_grad_data); + CHECK_NOTNULL(filter_grad_data); + + real alpha = 1.0f; + real beta = 1.0f; + cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter); + + CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter( + t_resource.cudnn_handle, + &alpha, + src_desc, + input_data, + diff_desc, + output_grad_data, + conv_desc, +#if CUDNN_VERSION >= 4000 + static_cast(convBwdFilterAlgo), + gpuWorkSpace, + sizeInBytes, +#endif + &beta, + grad_desc, + filter_grad_data)); + CHECK_SYNC("hl_convolution_backward_filter failed"); +} + +void hl_convolution_backward_data(hl_tensor_descriptor input, + real* input_data_grad, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdDataAlgo) { + real alpha = 1.0f; + real beta = 1.0f; + cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnConvolutionBackwardData( + 
t_resource.cudnn_handle, + &alpha, + filter_desc, + filter_data, + diff_desc, + output_grad_data, + conv_desc, +#if CUDNN_VERSION >= 4000 + static_cast(convBwdDataAlgo), + gpuWorkSpace, + sizeInBytes, +#endif + &beta, + grad_desc, + input_data_grad)); + CHECK_SYNC("hl_convolution_backward_data failed"); +} + +void hl_softmax_forward(real* input, real* output, int height, int width) { +#ifndef PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, + CUDNN_TENSOR_NCHW, + data_type, + height, + width, + 1, + 1)); + + real alpha = 1.0f; + real beta = 0.0f; + CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + t_resource.cudnn_desc, + input, + &beta, + t_resource.cudnn_desc, + output)); + CHECK_SYNC("hl_softmax_forward failed"); +} + +void hl_softmax_backward(real* output_value, + real* output_grad, + int height, + int width) { +#ifndef PADDLE_TYPE_DOUBLE + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; +#else + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; +#endif + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, + CUDNN_TENSOR_NCHW, + data_type, + height, + width, + 1, + 1)); + + real alpha = 1.0f; + real beta = 0.0f; + CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + t_resource.cudnn_desc, + output_value, + t_resource.cudnn_desc, + output_grad, + &beta, + t_resource.cudnn_desc, + output_grad)); + CHECK_SYNC("hl_softmax_backward failed"); +} + +void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, + real* input, + hl_tensor_descriptor outputDesc, + real* output, + hl_tensor_descriptor bnParamDesc, + real* scale, + real* bias, + double factor, + real* runningMean, + real* runningInvVar, + double epsilon, + real* savedMean, + real* savedVar) { +#if CUDNN_VERSION >= 4007 + if ((NULL != runningMean && NULL == runningInvVar) || + (NULL == runningMean && NULL != runningInvVar)) { + LOG(FATAL) << "runningMean and runningInvVar can be NULL " + << "but only at the same time."; + } + if ((NULL != savedMean && NULL == savedVar) || + (NULL == savedMean && NULL != savedVar)) { + LOG(FATAL) << "savedMean and savedVar can be NULL " + << "but only at the same time."; + } + + cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); + cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); + cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc); + real alpha = 1.0f; + real beta = 1.0f; + cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + CHECK_CUDNN( + dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + xDesc, + input, + yDesc, + output, + bnDesc, + scale, + bias, + factor, + runningMean, + runningInvVar, + epsilon, + savedMean, + savedVar)); + + CHECK_SYNC("hl_batch_norm_forward_training failed"); +#else + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. 
" + << "But cudnn lib version is " << g_cudnn_lib_version; +#endif +} + +void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, + real* input, + hl_tensor_descriptor outputDesc, + real* output, + hl_tensor_descriptor bnParamDesc, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedInvVar, + double epsilon) { +#if CUDNN_VERSION >= 4007 + cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); + cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); + cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc); + real alpha = 1.0f; + real beta = 1.0f; + cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + + CHECK_CUDNN( + dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + xDesc, + input, + yDesc, + output, + bnDesc, + scale, + bias, + estimatedMean, + estimatedInvVar, + epsilon)); + + CHECK_SYNC("hl_batch_norm_forward_inference failed"); +#else + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " + << "But cudnn lib version is " << g_cudnn_lib_version; +#endif +} + +void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, + real* input, + hl_tensor_descriptor outGradDesc, + real* outGrad, + hl_tensor_descriptor inGradDesc, + real* inGrad, + hl_tensor_descriptor dBnParamDesc, + real* scale, + real* scaleGrad, + real* biasGrad, + double epsilon, + real* savedMean, + real* savedInvVar) { +#if CUDNN_VERSION >= 4007 + if ((NULL != savedMean && NULL == savedInvVar) || + (NULL == savedMean && NULL != savedInvVar)) { + LOG(FATAL) << "savedMean and savedVar can be NULL " + << "but only at the same time."; + } + + cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); + cudnnTensorDescriptor_t dyDesc = GET_TENSOR_DESCRIPTOR(outGradDesc); + cudnnTensorDescriptor_t dxDesc = GET_TENSOR_DESCRIPTOR(inGradDesc); + cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(dBnParamDesc); + real alpha = 1.0f; + real beta = 1.0f; + cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; + CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + &alpha, + &beta, + xDesc, + input, + dyDesc, + outGrad, + dxDesc, + inGrad, + bnDesc, + scale, + scaleGrad, + biasGrad, + epsilon, + savedMean, + savedInvVar)); + + CHECK_SYNC("hl_batch_norm_backward failed"); +#else + LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. " + << "But cudnn lib version is " << g_cudnn_lib_version; +#endif +} diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc new file mode 100644 index 0000000000000000000000000000000000000000..501e3b0f3be02b9364f9182b2484d542f0f39889 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_device.cc @@ -0,0 +1,677 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// clang-format off +// Because clang-format 4.X and clang-format 3.8+ format +// following lines in different. So disable clang-format. 
+#include "hl_cuda.h" +#include +#include +#include +#include +#include +#include "hl_cuda.ph" +#include "hl_thread.ph" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/DynamicLoader.h" +// clang-format on + +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load curand routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ +#else +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ +#endif + +/* include all needed curand functions in HPPL */ +// clang-format off +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator) \ + __macro(curandSetStream) \ + __macro(curandSetPseudoRandomGeneratorSeed)\ + __macro(curandGenerateUniform) \ + __macro(curandGenerateUniformDouble) +// clang-format on + +CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) + +#undef CURAND_RAND_ROUTINE_EACH +#undef DYNAMIC_LOAD_CURAND_WRAP + +} /* namespace dynload */ + +/** + * @brief global resource. + */ +int g_system_device_num = 0; /* system device number */ +int device_num = 0; /* use device number */ +hl_device_prop *g_device; /* device info table */ +__thread thread_device_resources *t_device; /* device resources table */ +int g_cuda_lib_version = 0; + +/* number of global stream */ +#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1) +/* number of thread stream */ +#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1) +/* sizeof of device memory */ +#define HPPL_GPU_MEMORY_SIZE (256 * 4) + +/** + * Check build-in cuda function using glog and it **does not** + * support << operator for more details error info. + */ +#define CHECK_CUDA(cudaFunc) \ + do { \ + cudaError_t cudaStat = cudaFunc; \ + CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ + << cudaGetErrorString(cudaStat); \ + } while (0) + +/** + * @brief thread resource. + */ +__thread _hl_thread_resource t_resource = {{0}, /* stream */ + 0, /* handle */ + 0, /* gen */ + 0, /* cudnn_handle */ + 0, /* cudnn_desc */ + NULL, /* gen_mutex */ + NULL, /* gpu_mem */ + NULL, /* cpu_mem */ + 0, /* event */ + -1, /* device */ + 0, /* major */ + false}; /* is_init */ + +__thread cudaStream_t default_stream = 0; +__thread bool g_sync_flag = true; +bool hl_start_flag = false; + +inline pid_t gettid() { +#if defined(__APPLE__) || defined(__OSX__) + // syscall is deprecated: first deprecated in macOS 10.12. 
+ // syscall is unsupported; + // syscall pid_t tid = syscall(SYS_thread_selfid); + uint64_t tid; + pthread_threadid_np(NULL, &tid); +#else +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE((int)tid, -1); + return tid; +} + +void hl_init(int device) { + CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed."; + + /* thread has been initialized */ + if (true == t_resource.is_init) { + hl_set_device(device); + return; + } + + /* create thread devcie resources */ + char *tmp; + thread_device_resources device_res; + tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) + + device_num * sizeof(_thread_device_resources)); + CHECK_NOTNULL(tmp); + t_device = (thread_device_resources *)tmp; + device_res = (thread_device_resources)( + (char *)tmp + g_system_device_num * sizeof(thread_device_resources *)); + memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *)); + + char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM * + sizeof(cudaStream_t)); + CHECK_NOTNULL(tmp_stream); + + int num = 0; + for (int dev = 0; dev < g_system_device_num; dev++) { + if (!g_device[dev]) { + continue; + } + + t_device[dev] = &device_res[num]; + t_device[dev]->stream = + (cudaStream_t *)(tmp_stream + + num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t)); + + hl_create_thread_resources(dev, t_device[dev]); + num++; + } + + hl_cudnn_desc_init(&t_resource.cudnn_desc); + + /* thread initialization is complete */ + t_resource.is_init = true; + /* set device */ + t_resource.device = -1; + hl_set_device(device); +} + +void hl_fini() { + if (false == t_resource.is_init) { + return; + } + + /* hppl stream fini */ + t_resource.device = -1; + for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) { + t_resource.stream[i] = 0; + } + + char *tmp = (char *)t_device; + char *tmp_stream = NULL; + for (int dev = 0; dev < g_system_device_num; dev++) { + if (!t_device[dev]) { + continue; + } + if (!tmp_stream) { + tmp_stream = (char *)t_device[dev]->stream; + } + for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { + CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j])); + } + + /* free device memory */ + hl_free_mem_device(t_device[dev]->gpu_mem); + hl_free_mem_host(t_device[dev]->cpu_mem); + CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event)); + } + + free(tmp); + free(tmp_stream); + t_resource.is_init = false; +} + +int hl_get_device_count() { return device_num; } + +void hl_set_device(int device) { + if (device == t_resource.device) { + return; + } + + CHECK(device >= 0 && device < g_system_device_num && g_device[device]) + << "Device: " << device << " is not specified in startup."; + + CHECK_CUDA(cudaSetDevice(device)); + + /* switch thread stream */ + for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) { + t_resource.stream[i] = g_device[device]->device_resources->stream[i]; + } + + if (true == t_resource.is_init) { + for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) { + t_resource.stream[i] = + t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM]; + } + t_resource.gpu_mem = t_device[device]->gpu_mem; + t_resource.cpu_mem = t_device[device]->cpu_mem; + t_resource.event = t_device[device]->mem_event; + } + + t_resource.handle = g_device[device]->device_resources->handle; + t_resource.gen = g_device[device]->device_resources->gen; + t_resource.cudnn_handle = g_device[device]->device_resources->cudnn_handle; + t_resource.gen_mutex = g_device[device]->device_resources->gen_mutex; 
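// Illustrative usage sketch (hypothetical caller, not part of the patch):
// hl_start() (or hl_specify_devices_start) must already have run once in the
// process; every worker thread then binds itself to a device before touching
// the per-thread streams, and releases its resources when done. Header name
// assumed to be the one this file already includes.
#include "hl_cuda.h"
void worker_thread_body(int device_id) {
  hl_init(device_id);  // bind this thread to `device_id`, create its streams
  // ... launch kernels / call other hl_* routines on this thread ...
  hl_fini();           // release the thread-local streams and staging buffers
}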
+ t_resource.device = device; + t_resource.major = g_device[device]->major; + default_stream = t_resource.stream[0]; +} + +int hl_get_device() { + int device; + CHECK_CUDA(cudaGetDevice(&device)); + return device; +} + +void *hl_malloc_device(size_t size) { + void *dest_d; + + CHECK(size) << __func__ << ": the size for device memory is 0, please check."; + CHECK_CUDA(cudaMalloc((void **)&dest_d, size)); + + return dest_d; +} + +void hl_free_mem_device(void *dest_d) { + CHECK_NOTNULL(dest_d); + + cudaError_t err = cudaFree(dest_d); + CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) + << hl_get_device_error_string(); +} + +void *hl_malloc_host(size_t size) { + void *dest_h; + + CHECK(size) << __func__ << ": the size for device memory is 0, please check."; + CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); + + return dest_h; +} + +void hl_free_mem_host(void *dest_h) { + CHECK_NOTNULL(dest_h); + + cudaError_t err = cudaFreeHost(dest_h); + CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) + << hl_get_device_error_string(); +} + +void hl_memcpy(void *dst, void *src, size_t size) { + if (0 == size) { + return; + } + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); +} + +void hl_memset_device(void *dest_d, int value, size_t size) { + CHECK_CUDA(cudaMemset(dest_d, value, size)); +} + +void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { + if (0 == size) { + return; + } + CHECK_NOTNULL(src_h); + CHECK_NOTNULL(dest_d); + CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); +} + +void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { + if (0 == size) { + return; + } + CHECK_NOTNULL(dest_h); + CHECK_NOTNULL(src_d); + CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); +} + +void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { + if (0 == size) { + return; + } + CHECK_NOTNULL(dest_d); + CHECK_NOTNULL(src_d); + CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); +} + +void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { + cudaStream_t cu_stream; + + if (0 == size) { + return; + } + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_LT(stream, HPPL_STREAM_END); + cu_stream = t_resource.stream[stream]; + + CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); +} + +void hl_start() { + hl_specify_devices_start(NULL, 0); + /* set default device */ + hl_set_device(0); +} + +bool hl_device_can_access_peer(int device, int peerDevice) { + int canAccessPeer; + CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); + + if (canAccessPeer == 1) { + return true; + } else { + return false; + } +} + +void hl_device_enable_peer_access(int peerDevice) { + cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0); + if (cudaErrorPeerAccessAlreadyEnabled == err) { + cudaGetLastError(); + } else { + CHECK_CUDA(err); + } +} + +void hl_create_global_resources(hl_device_prop device_prop) { + struct cudaDeviceProp cu_prop; + int device = device_prop->device; + global_device_resources device_res = device_prop->device_resources; + + CHECK_CUDA(cudaSetDevice(device)); + /* device properties */ + CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device)); + + device_prop->major = cu_prop.major; + device_prop->minor = cu_prop.minor; + strncpy(device_prop->device_name, cu_prop.name, 256); + device_prop->device_mem = cu_prop.totalGlobalMem; + + /* create device stream */ + for 
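// Illustrative usage sketch for the memory helpers above (hypothetical
// caller; the `real` typedef and HPPL_STREAM_1 are assumed from the hl
// headers). hl_malloc_host returns pinned memory, which is what makes
// hl_memcpy_async safe to pair with a stream.
#include "hl_cuda.h"
void copy_roundtrip_example(size_t n) {
  real* h_buf = (real*)hl_malloc_host(n * sizeof(real));    // pinned host buffer
  real* d_buf = (real*)hl_malloc_device(n * sizeof(real));  // device buffer
  hl_memcpy_host2device(d_buf, h_buf, n * sizeof(real));    // blocking H2D copy
  hl_memcpy_async(h_buf, d_buf, n * sizeof(real), HPPL_STREAM_1);  // async D2H
  hl_stream_synchronize(HPPL_STREAM_1);                     // wait for the async copy
  hl_free_mem_device(d_buf);
  hl_free_mem_host(h_buf);
}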
(int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) { + CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); + } + + /* cublas init */ + hl_cublas_init(&device_res->handle, device_res->stream[0]); + + /* create curand gen */ + CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen, + CURAND_RNG_PSEUDO_DEFAULT), + CURAND_STATUS_SUCCESS) + << "[Start failed] Curand init failed."; + + CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]), + CURAND_STATUS_SUCCESS) + << "[Start failed] Curand set stream failed!"; + + /* create cudnn handle */ + hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]); + + int seed = gettid(); + CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen, + seed + device), + CURAND_STATUS_SUCCESS); + + device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); + pthread_mutex_init(device_res->gen_mutex, NULL); + + CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version)); +} + +int hl_get_cuda_version() { return g_cuda_lib_version; } + +void hl_create_thread_resources(int device, + thread_device_resources device_res) { + CHECK_CUDA(cudaSetDevice(device)); + + /* create thread stream */ + for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { + CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); + } + + /* allocation device memory */ + device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE); + + /* allocation host memory */ + device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); + + CHECK_CUDA(cudaEventCreate(&device_res->mem_event)); +} + +void hl_specify_devices_start(int *device, int number) { + if (hl_start_flag) return; + + /* 1. get the number of devices */ + CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num)); + CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device"; + if (device == NULL) { + number = g_system_device_num; + } + + /* 2. check device & create device property table */ + CHECK_LE(number, g_system_device_num) + << "[Start failed] System does not have enough device. " + << "Device number: " << g_system_device_num << "Input number: " << number; + + char *tmp; + hl_device_prop device_prop; + tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) + + number * sizeof(_hl_device_prop)); + CHECK(tmp) << "[Start failed] System memory is not enough."; + + g_device = (hl_device_prop *)tmp; + device_prop = (hl_device_prop)( + (char *)tmp + g_system_device_num * sizeof(hl_device_prop *)); + memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *)); + int num = 0; + for (int i = 0; i < number; i++) { + int dev; + if (device == NULL) { + dev = i; + } else { + dev = device[i]; + } + + CHECK_LT(dev, g_system_device_num) + << "[Start failed] The specified device number is " + << "out of range. Max device number: " << g_system_device_num - 1 + << " Specified devcie number: " << dev; + + if (g_device[dev]) { + /* Warning */ + LOG(WARNING) << "[Warning] Repeat specify device: " << dev; + continue; + } + + g_device[dev] = &device_prop[num]; + g_device[dev]->device = dev; + num++; + } + device_num = num; + + /* 3. 
create global device resources */ + char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources)); + CHECK_NOTNULL(tmp_res); + + char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM * + sizeof(cudaStream_t)); + CHECK_NOTNULL(tmp_stream); + + num = 0; + for (int i = 0; i < g_system_device_num; i++) { + if (!g_device[i]) { + continue; + } + + g_device[i]->device_resources = (global_device_resources)( + tmp_res + num * sizeof(_global_device_resources)); + g_device[i]->device_resources->stream = + (cudaStream_t *)(tmp_stream + + num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t)); + + hl_create_global_resources(g_device[i]); + num++; + } + + /* hl_start() is ok */ + hl_start_flag = true; + /* set default device */ + if (device == NULL) { + hl_set_device(0); + } else { + hl_set_device(device[0]); + } +} + +void hl_rand(real *dest_d, size_t num) { + pthread_mutex_lock(t_resource.gen_mutex); + CHECK_EQ( +#ifndef PADDLE_TYPE_DOUBLE + dynload::curandGenerateUniform(t_resource.gen, dest_d, num), +#else + dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), +#endif + CURAND_STATUS_SUCCESS); + pthread_mutex_unlock(t_resource.gen_mutex); + CHECK_SYNC("hl_rand failed"); +} + +void hl_srand(unsigned int seed) { + pthread_mutex_lock(t_resource.gen_mutex); + CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed), + CURAND_STATUS_SUCCESS); + pthread_mutex_unlock(t_resource.gen_mutex); +} + +void hl_set_sync_flag(bool flag) { g_sync_flag = flag; } + +bool hl_get_sync_flag() { return g_sync_flag; } + +void hl_stream_synchronize(hl_stream_t stream) { + cudaStream_t cu_stream; + + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; + + cu_stream = t_resource.stream[stream]; + CHECK_CUDA(cudaStreamSynchronize(cu_stream)); +} + +void hl_create_event(hl_event_t *event) { + CHECK_NOTNULL(event); + + struct _hl_event_st *st_event = + (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); + + CHECK_CUDA(cudaEventCreate(&st_event->cu_event)); + + *event = st_event; +} + +float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { + float time; + CHECK_NOTNULL(start); + CHECK_NOTNULL(end); + + CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); + return time; +} + +void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { + cudaStream_t cu_stream; + + CHECK_NOTNULL(event); + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; + + cu_stream = t_resource.stream[stream]; + CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream)); +} + +void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { + cudaStream_t cu_stream; + + CHECK_NOTNULL(event); + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; + + cu_stream = t_resource.stream[stream]; + CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); +} + +void hl_destroy_event(hl_event_t event) { + CHECK_NOTNULL(event); + CHECK_CUDA(cudaEventDestroy(event->cu_event)); + + free(event); + event = NULL; +} + +void hl_event_synchronize(hl_event_t event) { + CHECK_NOTNULL(event); + CHECK_CUDA(cudaEventSynchronize(event->cu_event)); +} + +void hl_get_device_name(char *name, int len, int device) { + CHECK_NOTNULL(name); + CHECK(device >= 0 && device < g_system_device_num && g_device[device]) + << "Device(" << device << ") is not specified in startup."; + + strncpy(name, g_device[device]->device_name, len); +} + +void hl_get_device_memory(size_t *mem_size, 
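// Illustrative usage sketch for the event helpers above (hypothetical
// caller, not part of the patch): record an event before and after the work
// queued on a stream, then read back the elapsed GPU time in milliseconds.
#include "hl_cuda.h"
float time_stream_work_ms(hl_stream_t stream) {
  hl_event_t start, stop;
  hl_create_event(&start);
  hl_create_event(&stop);
  hl_stream_record_event(stream, start);
  // ... enqueue kernels / copies on `stream` here ...
  hl_stream_record_event(stream, stop);
  hl_event_synchronize(stop);                     // wait until `stop` has completed
  float ms = hl_event_elapsed_time(start, stop);  // milliseconds between the events
  hl_destroy_event(start);
  hl_destroy_event(stop);
  return ms;
}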
int device) { + CHECK_NOTNULL(mem_size); + CHECK(device >= 0 && device < g_system_device_num && g_device[device]) + << "Device(" << device << ") is not specified in startup."; + + *mem_size = g_device[device]->device_mem; +} + +void hl_get_device_compute_capability(int *major, int *minor, int device) { + CHECK_NOTNULL(major); + CHECK_NOTNULL(minor); + CHECK(device >= 0 && device < g_system_device_num && g_device[device]) + << "Device(" << device << ") is not specified in startup."; + + *major = g_device[device]->major; + *minor = g_device[device]->minor; +} + +int hl_get_device_last_error() { return (int)cudaGetLastError(); } + +const char *hl_get_device_error_string() { + cudaError_t err = cudaGetLastError(); + return cudaGetErrorString(err); +} + +const char *hl_get_device_error_string(size_t err) { + return cudaGetErrorString((cudaError_t)err); +} + +void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); } +void hl_set_device_flags_block() { + CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); +} + +bool hl_cuda_event_is_ready(hl_event_t event) { + cudaError_t err = cudaEventQuery(event->cu_event); + CHECK(cudaSuccess == err || cudaErrorNotReady == err); + + if (cudaErrorNotReady == err) { + return false; + } + return true; +} + +void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); } + +void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); } diff --git a/paddle/legacy/cuda/src/hl_cuda_lstm.cu b/paddle/legacy/cuda/src/hl_cuda_lstm.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ac564fd2548cc782bee2380350f4ab888670ca3 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_lstm.cu @@ -0,0 +1,876 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "hl_activation_functions.h" +#include "hl_base.h" +#include "hl_cuda_cublas.h" +#include "hl_device_functions.cuh" +#include "paddle/legacy/utils/Logging.h" + +typedef hppl::Active::forward t_forward; +typedef hppl::Active::backward t_backward; + +bool hl_lstm_sequence_parallel(int frameSize) { + if (frameSize == 32 || frameSize == 64) { + return true; + } else { + return false; + } +} + +class frameValue { + public: + real *value_; + __device__ frameValue(real *value) : value_(value) {} + template + __device__ inline void init(int start, int length, int idx) { + if (reversed == 0) { + value_ += start * frameSize + idx; + } else { + value_ += (start + length - 1) * frameSize + idx; + } + } + __device__ inline real *getPtr() const { return value_; } + __device__ inline real getValue() { return *value_; } + __device__ inline void setValue(real value) { *value_ = value; } + template + __device__ inline void nextFrame() { + if (reversed == 0) { + value_ += frameSize; + } else { + value_ -= frameSize; + } + } +}; + +__device__ __forceinline__ void ptx_sync(const int id, const int barriers) { + asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory"); +} + +__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) { + asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory"); +} + +template +__device__ __forceinline__ real forward_sequence(real value, + real *shValue, + real *state, + real *preOutput, + real *output, + real check, + int index, + t_forward activeNode, + t_forward activeGate, + t_forward activeState) { + real out; + real prevOut; + real state_r; + const int idx = index % frameSize; + const int idy = index / frameSize; + // assert(index < valueSize); + + if (idy == 0) { + value = activeNode(value); + shValue[index] = value; + } + if (idy == 1 || idy == 2) { + state_r = state[idx]; + value += state_r * check; + value = activeGate(value); + shValue[index] = value; + } + ptx_sync(1, valueSize); + if (idy == 3) { + state_r = state[idx]; + state_r = state_r * shValue[idx + frameSize * 2]; + state_r += shValue[idx] * shValue[idx + frameSize]; + state[idx] = state_r; + ptx_arrive(2, frameSize * 2); + value += state_r * check; + value = activeGate(value); + shValue[index] = value; + ptx_sync(3, frameSize * 2); + prevOut = preOutput[idx]; + out = prevOut * value; + output[idx] = out; + } + if (idy == 0) { + ptx_sync(2, frameSize * 2); + prevOut = state[idx]; + prevOut = activeState(prevOut); + preOutput[idx] = prevOut; + ptx_arrive(3, frameSize * 2); + } + return value; +} + +#define OUTPUT_BARRIER_ID 10 +#define OUTPUT_BARRIER_ID2 11 +template +__global__ void KeLstmForward(real *gateValue, + real *state, + real *output, + real *preOutput, + real *checkIg, + real *checkFg, + real *checkOg, + real *weight, + const int *starts, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + __shared__ real shValue[valueSize]; + __shared__ real shState[frameSize]; + __shared__ real shPrevOutput[frameSize]; + __shared__ real shOutput[frameSize]; + + const int index = threadIdx.x; + int start = starts[blockIdx.x]; + int length = starts[blockIdx.x + 1] - start; + + /* init */ + real check; + real value; + frameValue frameGate(gateValue); + frameValue frameState(state); + frameValue frameOutput(output); + frameValue framePreOutput(preOutput); + if (index < valueSize) { + const int idx = index % frameSize; + const int idy = index / frameSize; + frameGate.init(start, length, index); + 
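// Illustrative sketch of the named-barrier primitives used above (hedged:
// semantics as documented for PTX bar.sync / bar.arrive). `bar.sync id, n`
// blocks the calling thread until n arrivals have been counted on barrier
// `id`; `bar.arrive id, n` counts the caller toward that barrier without
// blocking, which is how the producer warps above signal consumers while
// continuing their own work. Hypothetical kernel, blockDim.x == 128:
__global__ void named_barrier_example(float* data) {
  const int tid = threadIdx.x;
  if (tid < 64) {
    data[tid] = tid;                                  // produce
    asm volatile("bar.arrive 1, 128;" ::: "memory");  // signal, do not wait
  } else {
    asm volatile("bar.sync 1, 128;" ::: "memory");    // wait for the producers
    data[tid] = data[tid - 64] * 2.0f;                // consume
  }
}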
value = frameGate.getValue(); + if (idy == 0) { + shState[idx] = 0.0; + } else if (idy == 1) { + check = checkIg[idx]; + } else if (idy == 2) { + check = checkFg[idx]; + } else if (idy == 3) { + check = checkOg[idx]; + } + + if (idy == 3) { + frameState.init(start, length, idx); + frameOutput.init(start, length, idx); + framePreOutput.init(start, length, idx); + } + + ptx_sync(1, valueSize); + } + + for (int i = 0; i < length; ++i) { + if (index < valueSize) { + if (valueSize == 128) { + if (i != 0) { + ptx_sync(OUTPUT_BARRIER_ID2, blockSize); + value += shValue[index]; + } + } + value = forward_sequence( + value, + shValue, + shState, + shPrevOutput, + shOutput, + check, + index, + hppl::gpu::forward[active_node], + hppl::gpu::forward[active_gate], + hppl::gpu::forward[active_state]); + const int idx = index % frameSize; + const int idy = index / frameSize; + if (valueSize == 128) { + if (idy == 3) { + ptx_arrive(OUTPUT_BARRIER_ID, frameSize + 128); + } + } + if (valueSize == 256) { + ptx_sync(OUTPUT_BARRIER_ID, valueSize); + } + frameGate.setValue(value); + if (idy == 3) { + frameState.setValue(shState[idx]); + frameOutput.setValue(shOutput[idx]); + framePreOutput.setValue(shPrevOutput[idx]); + frameState.nextFrame(); + frameOutput.nextFrame(); + framePreOutput.nextFrame(); + } + if (i != length - 1) { + frameGate.nextFrame(); + value = frameGate.getValue(); + } + } + if (i != length - 1) { + if (valueSize == 128) { + if (valueSize <= index) { + real B_r[frameSize]; + const int computeIdx = index - valueSize; + if (i == 0) { +#pragma unroll + for (int n = 0; n < frameSize; n++) { + B_r[n] = weight[n * valueSize + computeIdx]; + } + } + ptx_sync(OUTPUT_BARRIER_ID, frameSize + 128); + real A_r[frameSize]; + for (int n = 0; n < frameSize; n++) { + A_r[n] = shOutput[n]; + } + real sum = 0.0f; + for (int n = 0; n < frameSize; n++) { + sum += A_r[n] * B_r[n]; + } + shValue[computeIdx] = sum; + ptx_arrive(OUTPUT_BARRIER_ID2, blockSize); + } + } + if (valueSize == 256) { + real B_r[frameSize]; + if (i == 0) { +#pragma unroll + for (int n = 0; n < frameSize; n++) { + B_r[n] = weight[n * valueSize + index]; + } + } + real sum = 0.0f; + for (int n = 0; n < frameSize; n++) { + sum += shOutput[n] * B_r[n]; + } + value += sum; + } + } + } +} + +void hl_lstm_parallel_forward(real *gateValue, + real *stateValue, + real *preOutputValue, + real *outputValue, + real *checkIg, + real *checkFg, + real *checkOg, + real *weight, + const int *sequence, + int frameSize, + int numSequences, + bool reversed, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + CHECK(frameSize == 32 || frameSize == 64); + dim3 grid(numSequences, 1); + if (!reversed) { + if (frameSize == 32) { + KeLstmForward<128, 32, 0, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 64) { + KeLstmForward<256, 64, 0, 256, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); + } + } else { + if (frameSize == 32) { + KeLstmForward<128, 32, 1, 128, 256><<>>( + gateValue, + stateValue, + outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 64) { + KeLstmForward<256, 64, 1, 256, 256><<>>( + gateValue, + stateValue, + 
outputValue, + preOutputValue, + checkIg, + checkFg, + checkOg, + weight, + sequence, + active_node, + active_gate, + active_state); + } + } + CHECK_SYNC("hl_lstm_parallel_forward failed"); +} + +__device__ __forceinline__ void transpose_32x32(real a[], const int idx) { + const int warp_size = 32; + int addr = idx % warp_size; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, addr < warp_size); +#pragma unroll + for (int k = 1; k < 32; k++) { + // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32); + addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32); + a[k] = __shfl_sync(mask, a[k], addr, 32); + } + +#pragma unroll + for (int tid = 0; tid < 31; tid++) { + real tmp = (idx > tid) ? a[0] : a[1]; +#pragma unroll + for (int k = 31; k > 0; k--) { + a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32]; + } + a[1] = tmp; + } + + addr = (32 - idx) % 32; + CREATE_SHFL_MASK(mask, idx % 32 < warp_size); +#pragma unroll + for (int k = 0; k < 32; k++) { + a[k] = __shfl_sync(mask, a[k], addr, 32); + addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32); + } +} + +template +__device__ void backward_sequence(real rGateValue, + real rOutputGrad, + real rPreOutputValue, + real &rGateGrad, + real &rStateGrad, + real *shStateGrad, + real *shStateValue, + real *shGateValue, + real rCheck, + real &rGateValuePrev, + int index, + t_backward activeNode, + t_backward activeGate, + t_backward activeState) { + const int frameIdx = index % frameSize; + const int frameIdy = index / frameSize; + if (frameIdy == 3) { + real rPrevOutputGrad; + rPrevOutputGrad = rOutputGrad * rGateValue; + rStateGrad = activeState(rPrevOutputGrad, rPreOutputValue); + rGateGrad = rOutputGrad * rPreOutputValue; + rGateGrad = activeGate(rGateGrad, rGateValue); + rStateGrad += rGateGrad * rCheck; + shStateGrad[index] = rStateGrad; + ptx_arrive(3, valueSize); + } else if (frameIdy == 1) { + shGateValue[frameIdx + frameSize] = rGateValue; + rStateGrad = rGateGrad * rCheck; + shStateGrad[index] = rStateGrad; + ptx_sync(3, valueSize); + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; + rGateGrad = rStateGrad * shGateValue[frameIdx]; + rGateGrad = activeGate(rGateGrad, rGateValue); + } else if (frameIdy == 2) { + rStateGrad = rStateGrad * rGateValuePrev; + rStateGrad += rGateGrad * rCheck; + shStateGrad[index] = rStateGrad; + ptx_sync(3, valueSize); + rStateGrad += shStateGrad[frameIdx + frameSize]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; + rGateValuePrev = rGateValue; + rGateGrad = rStateGrad * shStateValue[frameIdx]; + rGateGrad = activeGate(rGateGrad, rGateValue); + } else if (frameIdy == 0) { + shGateValue[frameIdx] = rGateValue; + ptx_sync(3, valueSize); + rStateGrad = shStateGrad[frameIdx + frameSize]; + rStateGrad += shStateGrad[frameIdx + frameSize * 2]; + rStateGrad += shStateGrad[frameIdx + frameSize * 3]; + rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize]; + rGateGrad = activeNode(rGateGrad, rGateValue); + } +} + +template +__device__ void load_weight(real rWeight[], real *weight, const int index) { + if (valueSize == 128) { + weight += index; +#pragma unroll + for (int n = 0; n < frameSize; n++) { + rWeight[n] = weight[n * valueSize]; + } + transpose_32x32(rWeight, index % 32); + } + if (valueSize == 256) { + int id = (index / 32) % 2; + weight += index - id * 32 + id * 32 * valueSize; +#pragma unroll + for (int n = 0; n < 32; n++) { + rWeight[n] = weight[n * valueSize]; + rWeight[n + 32] = weight[n * valueSize + 32]; + } + transpose_32x32(rWeight, 
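// Illustrative sketch of the warp-shuffle primitive that transpose_32x32
// builds on (hedged; this shows plain CUDA __shfl_sync, not the full
// register transpose). Each lane reads a register value from another lane
// of the same warp without going through shared memory.
__global__ void shuffle_rotate_example(float* out) {
  const unsigned mask = 0xffffffffu;  // all 32 lanes participate
  const int lane = threadIdx.x % 32;
  float v = static_cast<float>(lane);
  // Rotate by one lane: lane i receives the value held by lane (i + 1) % 32.
  float rotated = __shfl_sync(mask, v, (lane + 1) % 32, 32);
  out[threadIdx.x] = rotated;         // lane 0 gets 1.0f, lane 31 gets 0.0f
}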
index % 32); + transpose_32x32(&rWeight[32], index % 32); + } +} + +template +__global__ void KeLstmBackward(real *gateValue, + real *gateGrad, + real *stateValue, + real *stateGrad, /* do not need save */ + real *preOutputValue, + real *preOutputGrad, /* do not need save */ + real *checkIg, + real *checkIgGrad, + real *checkFg, + real *checkFgGrad, + real *checkOg, + real *checkOgGrad, + real *outputGrad, + real *weightValue, + const int *starts, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + __shared__ real shGateValue[valueSize]; + __shared__ real shStateGrad[valueSize]; + __shared__ real shStateValue[frameSize]; + __shared__ real shGateGrad[4][frameSize]; + __shared__ real shOutputGrad[4][frameSize]; + const int index = threadIdx.x; + int start = starts[blockIdx.x]; + int length = starts[blockIdx.x + 1] - start; + + const int frameIdx = index % frameSize; + const int frameIdy = index / frameSize; + real rCheck; + real rCheckGrad; + real rGateGrad; + real rStateGrad; + real rGateValuePrev; + real rPreOutputValue; + real rOutputGrad; + real rGateValue; + real rStateValue; + + frameValue frameGateValue(gateValue); + frameValue frameGateGrad(gateGrad); + frameValue framePreOutputValue(preOutputValue); + frameValue frameStateValue(stateValue); + frameValue frameOutputGrad(outputGrad); + if (frameIdy == 0) { + } else if (frameIdy == 1) { + rCheck = checkIg[frameIdx]; + } else if (frameIdy == 2) { + rCheck = checkFg[frameIdx]; + rGateValuePrev = 0.0; + rStateGrad = 0.0; + } else if (frameIdy == 3) { + rCheck = checkOg[frameIdx]; + framePreOutputValue.init(start, length, frameIdx); + frameOutputGrad.init(start, length, frameIdx); + rOutputGrad = frameOutputGrad.getValue(); + rPreOutputValue = framePreOutputValue.getValue(); + frameStateValue.init(start, length, frameIdx); + rStateValue = frameStateValue.getValue(); + } + + frameGateValue.init(start, length, index); + frameGateGrad.init(start, length, index); + rGateValue = frameGateValue.getValue(); + rGateGrad = 0.0; + rCheckGrad = 0.0; + + real B_r[frameSize]; + load_weight(B_r, weightValue, index); + + for (int i = 0; i < length; ++i) { + if (frameIdy == 3) { + if (i != length - 1) { + frameStateValue.nextFrame(); + shStateValue[frameIdx] = frameStateValue.getValue(); + } else { + shStateValue[frameIdx] = 0.0; + } + } + backward_sequence(rGateValue, + rOutputGrad, + rPreOutputValue, + rGateGrad, + rStateGrad, + shStateGrad, + shStateValue, + shGateValue, + rCheck, + rGateValuePrev, + index, + hppl::gpu::backward[active_node], + hppl::gpu::backward[active_gate], + hppl::gpu::backward[active_state]); + if (frameIdy == 3) { + rCheckGrad += rGateGrad * rStateValue; + rStateValue = shStateValue[frameIdx]; + } + + frameGateGrad.setValue(rGateGrad); + frameGateGrad.nextFrame(); + + if (i != length - 1) { + if (frameIdy == 3) { + framePreOutputValue.nextFrame(); + rPreOutputValue = framePreOutputValue.getValue(); + frameOutputGrad.nextFrame(); + rOutputGrad = frameOutputGrad.getValue(); + } else if (frameIdy == 2) { + rCheckGrad += rGateGrad * shStateValue[frameIdx]; + } else if (frameIdy == 1) { + rCheckGrad += rGateGrad * shStateValue[frameIdx]; + } + + frameGateValue.nextFrame(); + rGateValue = frameGateValue.getValue(); + shGateGrad[frameIdy][frameIdx] = rGateGrad; + if (valueSize == 128) { + real sum = 0.0f; +#pragma unroll + for (int n = 0; n < frameSize; n++) { + sum += shGateGrad[frameIdy][n] * B_r[n]; + } + if (frameIdy == 3) { + rOutputGrad += sum; + } else { + 
shOutputGrad[frameIdy][frameIdx] = sum; + } + } + if (valueSize == 256) { + ptx_sync(5, valueSize); + real A_r[frameSize]; + for (int n = 0; n < frameSize; n++) { + A_r[n] = shGateGrad[frameIdy][n]; + } + real sum = 0.0f; + for (int n = 0; n < frameSize; n++) { + sum += A_r[n] * B_r[n]; + } + if (frameIdy == 3) { + rOutputGrad += sum; + } else { + shOutputGrad[frameIdy][frameIdx] = sum; + } + } + + if (frameIdy == 3) { + ptx_sync(6, valueSize); +#pragma unroll + for (int i = 0; i < 3; i++) { + rOutputGrad += shOutputGrad[i][frameIdx]; + } + } else { + ptx_arrive(6, valueSize); + } + } + } + + /* TODO: Temporary save & merger in another kernel */ + if (frameIdy == 1) { + if (checkIgGrad) + paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad); + } else if (frameIdy == 2) { + if (checkFgGrad) + paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad); + } else if (frameIdy == 3) { + if (checkOgGrad) + paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad); + } +} + +void hl_lstm_parallel_backward_data(real *gateValue, + real *gateGrad, + real *stateValue, + real *stateGrad, + real *preOutputValue, + real *preOutputGrad, + real *outputGrad, + real *checkIg, + real *checkIgGrad, + real *checkFg, + real *checkFgGrad, + real *checkOg, + real *checkOgGrad, + real *weight, + const int *sequence, + int frameSize, + int numSequences, + bool reversed, + hl_activation_mode_t active_node, + hl_activation_mode_t active_gate, + hl_activation_mode_t active_state) { + CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 || + frameSize == 256); + dim3 grid(numSequences, 1); + if (!reversed) { + if (frameSize == 32) { + KeLstmBackward<128, 32, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 64) { + KeLstmBackward<256, 64, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 128) { + KeLstmBackward<512, 128, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 256) { + KeLstmBackward<1024, 256, 0><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } + } else { + if (frameSize == 32) { + KeLstmBackward<128, 32, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 64) { + KeLstmBackward<256, 64, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 128) { + 
KeLstmBackward<512, 128, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } else if (frameSize == 256) { + KeLstmBackward<1024, 256, 1><<>>( + gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + outputGrad, + weight, + sequence, + active_node, + active_gate, + active_state); + } + } + CHECK_SYNC("hl_lstm_parallel_backward_data"); +} + +template +__global__ void KeSetGradZero(real *gateGrad, + const int *starts, + int valueSize, + int numSequences, + bool reversed) { + // const int tid = threadIdx.x; + + const int frameIdx = blockIdx.x * B_X + threadIdx.x; + const int numSeqId = blockIdx.y * B_Y + threadIdx.y; + + if (numSeqId >= numSequences || frameIdx >= valueSize) return; + + if (!reversed) { + int seqId = starts[numSeqId]; + gateGrad[seqId * valueSize + frameIdx] = 0.0; + } else { + int seqId = starts[numSeqId + 1] - 1; + gateGrad[seqId * valueSize + frameIdx] = 0.0; + } +} + +void hl_lstm_parallel_backward_weight(real *weightGrad, + real *outputValue, + real *gateGrad, + const int *sequence, + int frameSize, + int batchSize, + int numSequences, + bool reversed) { + int valueSize = 4 * frameSize; + dim3 threads(32, 32); + dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32); + KeSetGradZero<32, 32><<>>( + gateGrad, sequence, valueSize, numSequences, reversed); + + if (!reversed) { + hl_matrix_mul(outputValue, + HPPL_OP_T, + gateGrad + valueSize, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); + } else { + hl_matrix_mul(outputValue + frameSize, + HPPL_OP_T, + gateGrad, + HPPL_OP_N, + weightGrad, + frameSize, + valueSize, + batchSize - 1, + 1.0, + 1.0); + } + CHECK_SYNC("hl_lstm_parallel_backward_weight"); +} diff --git a/paddle/legacy/cuda/src/hl_cuda_matrix.cu b/paddle/legacy/cuda/src/hl_cuda_matrix.cu new file mode 100644 index 0000000000000000000000000000000000000000..6fe460026bbd404e15b43bd221551094a7abeda2 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_matrix.cu @@ -0,0 +1,806 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
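// Illustrative note for hl_lstm_parallel_backward_weight above (hedged
// reading of the call): the recurrent-weight gradient is
//
//   dW += sum_t  prevOutput(t)^T * gateGrad(t)
//
// Because consecutive frames are stored contiguously, the sum over all time
// steps collapses into a single GEMM between the outputs of frames 0..T-2
// and the gate gradients of frames 1..T-1 (shifted by one frame, hence the
// `gateGrad + valueSize` / `outputValue + frameSize` offsets and the
// `batchSize - 1` row count). KeSetGradZero first zeroes the gate gradients
// of the first (or, in the reversed case, last) frame of every sequence so
// that nothing leaks across sequence boundaries in that GEMM.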
*/ + +#include "hl_base.h" +#include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" +#include "hl_matrix.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" +#include "hl_sequence.h" +#include "hl_sparse.ph" +#include "paddle/legacy/utils/Logging.h" + +DEFINE_MATRIX_UNARY_OP(Zero, a = 0); +DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b); +void hl_matrix_add(real* A_d, + real* B_d, + real* C_d, + int dimM, + int dimN, + real alpha, + real beta) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + + hl_gpu_apply_ternary_op, 0, 0>( + ternary::_add(alpha, beta), + A_d, + B_d, + C_d, + dimM, + dimN, + dimN, + dimN, + dimN); + CHECK_SYNC("hl_matrix_add failed"); +} + +#ifdef PADDLE_TYPE_DOUBLE +#define THRESHOLD 128 +#else +#define THRESHOLD 64 +#endif +__device__ __forceinline__ void findMax(real* I, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN, + real* max) { + dfMax_s[base] = -1.0e20; + while (curIdx < dimN) { + if (dfMax_s[base] < I[nextIdx]) { + dfMax_s[base] = I[nextIdx]; + } + nextIdx += blockSize; + curIdx += blockSize; + } + __syncthreads(); + + for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (base < stride) { + nextIdx = base + stride; + if (dfMax_s[base] < dfMax_s[nextIdx]) { + dfMax_s[base] = dfMax_s[nextIdx]; + } + } + } + + if (0 == base) { + max[0] = dfMax_s[0]; + } + __syncthreads(); +} + +__device__ __forceinline__ void subMaxAndExp(real* I, + real* O, + int curIdx, + int nextIdx, + int blockSize, + int dimN, + real max) { + real val; + while (curIdx < dimN) { + val = I[nextIdx] - max; + if (val < -THRESHOLD) { + val = -THRESHOLD; + } + I[nextIdx] = val; +#ifndef PADDLE_TYPE_DOUBLE + O[nextIdx] = __expf(val); +#else + O[nextIdx] = exp(val); +#endif + nextIdx += blockSize; + curIdx += blockSize; + } + __syncthreads(); +} + +__device__ __forceinline__ void valueSum(real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { + dfMax_s[base] = 0; + while (curIdx < dimN) { + dfMax_s[base] += O[nextIdx]; + nextIdx += blockSize; + curIdx += blockSize; + } + __syncthreads(); + + for (int stride = blockSize >> 1; stride > 0; stride >>= 1) { + __syncthreads(); + if (base < stride) { + nextIdx = base + stride; + dfMax_s[base] += dfMax_s[nextIdx]; + } + } + __syncthreads(); +} + +__device__ __forceinline__ void divSum( + real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) { + while (curIdx < dimN) { + O[nextIdx] /= sum; + nextIdx += blockSize; + curIdx += blockSize; + } +} + +__device__ __forceinline__ void softmax(real* I, + real* O, + real* dfMax_s, + int blockSize, + int base, + int curIdx, + int nextIdx, + int dimN) { + __shared__ real max; + + // find the max number + findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max); + + // sub max Value and do Exp operation + subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max); + + // add dimN values into blockDim.x buffer + // sum is in dfMax_s[0] + valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); + + // divided by sum + divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN); +} + +template +__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) { + int base = threadIdx.x; + __shared__ real dfMax_s[blockSize]; + int nextIdx = blockIdx.x * dimN + base; + int curIdx = base; + + softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); +} + +void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int 
dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + dim3 block(512, 1); + dim3 grid(dimM, 1); + KeMatrixSoftMax<512><<>>(C_d, A_d, dimN); + CHECK_SYNC("hl_matrix_softmax failed"); +} + +template +__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) { + int base = threadIdx.x; + int bid = blockIdx.x; + __shared__ real dfMax_s[blockSize]; + + int start = index[bid]; + int dimN = index[bid + 1] - start; + + int nextIdx = start + base; + int curIdx = base; + + softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN); +} + +void hl_sequence_softmax_forward(real* A_d, + real* C_d, + const int* index, + int numSequence) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + dim3 block(512, 1); + dim3 grid(numSequence, 1); + KeSequenceSoftMax<512><<>>(C_d, A_d, index); + CHECK_SYNC("hl_sequence_softmax_forward failed"); +} + +__global__ void KeMatrixDerivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; + int index; + + if (rowIdx < dimM && colIdx < dimN) { + index = rowIdx * dimN + colIdx; + grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]); + } +} + +void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) { + CHECK_NOTNULL(grad_d); + CHECK_NOTNULL(output_d); + CHECK_NOTNULL(sftmaxSum_d); + + int blocksX = (dimM + 0) / 1; + int blocksY = (dimN + 1024 - 1) / 1024; + dim3 threads(1, 1024); + dim3 grid(blocksX, blocksY); + + KeMatrixDerivative<<>>( + grad_d, output_d, sftmaxSum_d, dimM, dimN); + CHECK_SYNC("hl_matrix_softmax_derivative failed"); +} + +__global__ void KeMatrixMultiBinaryCrossEntropy( + real* output, real* entropy, int* row, int* col, int dimM, int dimN) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < dimM) { + for (int i = 0; i < dimN; i++) { + entropy[index] -= log(1 - output[index * dimN + i]); + } + int* row_col = col + row[index]; + int col_num = row[index + 1] - row[index]; + for (int i = 0; i < col_num; i++) { + real o = output[index * dimN + row_col[i]]; + entropy[index] -= log(o / (1 - o)); + } + } +} + +void hl_matrix_multi_binary_cross_entropy(real* output, + real* entropy, + hl_sparse_matrix_s csr_mat, + int dimM, + int dimN) { + CHECK_NOTNULL(output); + CHECK_NOTNULL(entropy); + CHECK_NOTNULL(csr_mat); + CHECK_EQ(csr_mat->format, HL_SPARSE_CSR); + int n_threads = 1024; + int blocks = (dimM + n_threads - 1) / n_threads; + dim3 threads(n_threads); + dim3 grid(blocks); + hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); + KeMatrixMultiBinaryCrossEntropy<<>>( + output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); + CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); +} + +__global__ void KeMatrixMultiBinaryCrossEntropyBp( + real* output, real* grad, int* row, int* col, int dimM, int dimN) { + int row_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (row_idx < dimM) { + for (int i = 0; i < dimN; i++) { + int index = row_idx * dimN + i; + grad[index] += 1.0 / (1 - output[index]); + } + int col_num = row[row_idx + 1] - row[row_idx]; + int* row_col = col + row[row_idx]; + for (int i = 0; i < col_num; i++) { + int index = row_idx * dimN + row_col[i]; + grad[index] -= 1.0 / (output[index] * (1 - output[index])); + } + } +} + +void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) { + CHECK_NOTNULL(output); + CHECK_NOTNULL(grad); + 
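+  // csr_mat stores the multi-binary labels: row i of the CSR matrix lists the column indices of the positive classes for sample i.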
CHECK_NOTNULL(csr_mat); + CHECK_EQ(csr_mat->format, HL_SPARSE_CSR); + int n_threads = 1024; + int blocks = (dimM + n_threads - 1) / n_threads; + dim3 threads(n_threads); + dim3 grid(blocks); + hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); + KeMatrixMultiBinaryCrossEntropyBp<<>>( + output, grad, mat->csr_row, mat->csr_col, dimM, dimN); + CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); +} + +__global__ void KeMatrixCrossEntropy( + real* O, real* E, int* label, int dimM, int dimN) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int newBase; + if (index < dimM) { + newBase = label[index]; + newBase = newBase % dimN; + E[index] = -log(O[index * dimN + newBase]); + } +} + +void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + int blocks = (dimM + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + KeMatrixCrossEntropy<<>>( + A_d, C_d, label_d, dimM, dimN); + CHECK_SYNC("hl_matrix_cross_entropy failed"); +} + +__global__ void KeMatrixCrossEntropyBp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { + int rowIdx = blockIdx.x * blockDim.x + threadIdx.x; + int colIdx = blockIdx.y * blockDim.y + threadIdx.y; + int index; + if (rowIdx < dimM && colIdx < dimN) { + index = rowIdx * dimN + colIdx; + if (label_d[rowIdx] == colIdx) { + grad_d[index] -= 1.0f / output_d[index]; + } + } +} + +void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) { + CHECK_NOTNULL(grad_d); + CHECK_NOTNULL(output_d); + CHECK_NOTNULL(label_d); + + int blocksX = (dimM + 0) / 1; + int blocksY = (dimN + 1024 - 1) / 1024; + dim3 threads(1, 1024); + dim3 grid(blocksX, blocksY); + KeMatrixCrossEntropyBp<<>>( + grad_d, output_d, label_d, dimM, dimN); + CHECK_SYNC("hl_matrix_cross_entropy_bp failed"); +} + +void hl_matrix_zero_mem(real* data, int num) { + hl_gpu_apply_unary_op(unary::Zero(), data, 1, num, num); +} + +__global__ void KeParamReluForward(real* output, + real* input, + real* w, + int width, + int height, + int partial_sum) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + if (tx < width && ty < height) { + int index = ty * width + tx; + output[index] = + input[index] > 0 ? 
input[index] : input[index] * w[tx / partial_sum]; + } +} + +void hl_param_relu_forward(real* output, + real* input, + real* w, + int width, + int height, + int partial_sum) { + CHECK_NOTNULL(output); + CHECK_NOTNULL(input); + CHECK_NOTNULL(w); + dim3 threads(16, 16); + int blockX = (width + 16 - 1) / 16; + int blockY = (height + 16 - 1) / 16; + dim3 grid(blockX, blockY); + KeParamReluForward<<>>( + output, input, w, width, height, partial_sum); + CHECK_SYNC("hl_param_relu_forward failed"); +} + +template +__global__ void KeParamReluBackWardW(real* grad_w, + real* grad_o, + real* input, + int width, + int height, + int partial_sum) { + const int tid = threadIdx.x; + __shared__ real temp[blockSize]; + grad_o += partial_sum * blockIdx.x; + input += partial_sum * blockIdx.x; + real tmp = 0.0; + for (int index = tid; index < partial_sum * height; index += blockSize) { + int row = index / partial_sum; + int offset = row * width + (index - row * partial_sum); + if (input[offset] < 0) { + tmp += grad_o[offset] * input[offset]; + } + } + temp[tid] = tmp; + __syncthreads(); + for (int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + temp[tid] += temp[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + grad_w[blockIdx.x] += temp[0]; + } +} + +void hl_param_relu_backward_w(real* grad_w, + real* grad_o, + real* input, + int width, + int height, + int partial_sum) { + CHECK_NOTNULL(grad_w); + CHECK_NOTNULL(grad_o); + CHECK_NOTNULL(input); + const int blockSize = 1024; + int grid_num = width / partial_sum; + dim3 threads(blockSize, 1); + dim3 grid(grid_num, 1); + KeParamReluBackWardW<<>>( + grad_w, grad_o, input, width, height, partial_sum); + CHECK_SYNC("hl_param_relu_backward_w failed"); +} + +__global__ void KeParamReluBackwardDiff(real* grad_o, + real* input, + real* w, + real* diff, + int width, + int height, + int partial_sum) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int ty = blockIdx.y * blockDim.y + threadIdx.y; + if (tx < width && ty < height) { + int index = ty * width + tx; + diff[index] += grad_o[index] * (input[index] > 0 ? 
1 : w[tx / partial_sum]); + } +} + +void hl_param_relu_backward_diff(real* grad_o, + real* data, + real* w, + real* diff, + int width, + int height, + int partial_sum) { + CHECK_NOTNULL(grad_o); + CHECK_NOTNULL(data); + CHECK_NOTNULL(w); + CHECK_NOTNULL(diff); + dim3 threads(16, 16); + int blockX = (width + 16 - 1) / 16; + int blockY = (height + 16 - 1) / 16; + dim3 grid(blockX, blockY); + KeParamReluBackwardDiff<<>>( + grad_o, data, w, diff, width, height, partial_sum); + CHECK_SYNC("hl_param_relu_backward_diff failed"); +} + +__global__ void KeMatrixAddSharedBias( + real* A, real* B, const int channel, const int M, const int N, real scale) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int dim = N / channel; + if (index < M * N) { + int i = index % N; + i = i / dim; + A[index] += scale * B[i]; + } +} + +void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int blocks = 512; + const int grids = DIVUP(dimM * dimN, blocks); + KeMatrixAddSharedBias<<>>( + A_d, B_d, channel, dimM, dimN, scale); + CHECK_SYNC("hl_matrix_add_shared_bias failed"); +} + +template +__global__ void KeMatrixCollectSharedBias(real* B, + real* A, + const int channel, + const int M, + const int N, + const int dim, + const int limit, + real scale) { + if (dim < limit) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < channel) { + real sum = 0.0; + for (int i = 0; i < M; ++i) { + for (int j = 0; j < dim; ++j) { + sum += A[i * N + index * dim + j]; + } + } + B[index] += scale * sum; + } + } else { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + __shared__ real smem[blockSize]; + real sum = 0.0; + for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) { + int n = j * blockSize + tid; + int m = n / dim; + int w = n % dim; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + __syncthreads(); + simpleReduce(smem, tid, blockSize); + sum += smem[0]; + } + if (tid == 0) { + B[bid] += scale * sum; + } + } +} + +void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int dim = dimN / channel; + const int blocks = 256; + const int limit = 64; + int grids = (dimM * dim) < limit ? 
DIVUP(channel, blocks) : channel; + + KeMatrixCollectSharedBias<<>>( + B_d, A_d, channel, dimM, dimN, dim, limit, scale); + CHECK_SYNC("hl_matrix_collect_shared_bias failed"); +} + +__global__ void keMatrixRotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < dimM * dimN) { + int i = idx / dimN; + int j = idx % dimN; + if (clockWise) { + matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j]; + } else { + matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)]; + } + } +} + +void hl_matrix_rotate( + real* mat, real* matRot, int dimM, int dimN, bool clockWise) { + CHECK_NOTNULL(mat); + CHECK_NOTNULL(matRot); + const int threads = 512; + const int blocks = DIVUP(dimM * dimN, threads); + keMatrixRotate<<>>( + mat, matRot, dimM, dimN, clockWise); + CHECK_SYNC("hl_matrix_rotate failed"); +} + +__global__ void keMatrixVol2Col(int num_kernels, + const real* dataSrc, + real* dataDst, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + int depth_col, + int height_col, + int width_col) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int d_out = (index / width_col / height_col) % depth_col; + int channel_in = index / width_col / height_col / depth_col; + int channel_out = channel_in * filterD * filterH * filterW; + int w_in = w_out * strideW - paddingW; + int h_in = h_out * strideH - paddingH; + int d_in = d_out * strideD - paddingD; + + dataDst += + ((channel_out * depth_col + d_out) * height_col + h_out) * width_col + + w_out; + dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in; + for (int k = 0; k < filterD; ++k) { + for (int i = 0; i < filterH; ++i) { + for (int j = 0; j < filterW; ++j) { + int d = d_in + k; + int h = h_in + i; + int w = w_in + j; + *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && + w < width) + ? 
dataSrc[(k * height + i) * width + j] + : 0; + dataDst += depth_col * height_col * width_col; + } + } + } + } +} + +void hl_matrix_vol2Col(const real* dataSrc, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real* dataDst) { + int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1; + int height_col = (height + 2 * paddingH - filterH) / strideH + 1; + int width_col = (width + 2 * paddingW - filterW) / strideW + 1; + int num_kernels = channels * depth_col * height_col * width_col; + + const int threads = 512; + const int blocks = DIVUP(num_kernels, threads); + + keMatrixVol2Col<<>>(num_kernels, + dataSrc, + dataDst, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + depth_col, + height_col, + width_col); + CHECK_SYNC("hl_matrix_vol2Col failed"); +} + +__global__ void keMatrixCol2Vol(int num_kernels, + real* dataDst, + const real* dataSrc, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + int depth_col, + int height_col, + int width_col, + real alpha, + real beta) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; + index += blockDim.x * gridDim.x) { + real srcVal = 0; + real dstVal = dataDst[index]; + int w = index % width + paddingW; + int h = (index / width) % height + paddingH; + int d = (index / width / height) % depth + paddingD; + int c = index / width / height / depth; + // compute the start and end of the output + int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1; + int w_col_end = min(w / strideW + 1, width_col); + int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1; + int h_col_end = min(h / strideH + 1, height_col); + int d_col_start = (d < filterD) ? 
0 : (d - filterD) / strideD + 1; + int d_col_end = min(d / strideD + 1, depth_col); + + int offset = (c * filterD * filterW * filterH + d * filterW * filterH + + h * filterW + w) * + depth_col * height_col * width_col; + + int coeff_d_col = + (1 - strideD * filterW * filterH * depth_col) * height_col * width_col; + int coeff_h_col = + (1 - strideH * filterW * depth_col * height_col) * width_col; + int coeff_w_col = (1 - strideW * depth_col * height_col * width_col); + + for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { + for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { + for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { + srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col + + w_col * coeff_w_col]; + } + } + } + dataDst[index] = alpha * srcVal + beta * dstVal; + } +} + +void hl_matrix_col2Vol(real* dataDst, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + const real* dataSrc, + real alpha, + real beta) { + int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1; + int height_col = (height + 2 * paddingH - filterH) / strideH + 1; + int width_col = (width + 2 * paddingW - filterW) / strideW + 1; + int num_kernels = channels * depth * height * width; + + const int threads = 512; + const int blocks = DIVUP(num_kernels, threads); + + keMatrixCol2Vol<<>>(num_kernels, + dataDst, + dataSrc, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + depth_col, + height_col, + width_col, + alpha, + beta); + + CHECK_SYNC("hl_matrix_col2Vol failed"); +} + +__global__ void keVectorCast2Int(int* out, real* vec, int size) { + for (int i = threadIdx.x; i < (size); i += blockDim.x) { + out[i] = int(vec[i]); + } +} + +void hl_vector_cast2int(int* out, real* vec, int size) { + keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size); + CHECK_SYNC("hl_vector_cast2int failed"); +} diff --git a/paddle/legacy/cuda/src/hl_cuda_sequence.cu b/paddle/legacy/cuda/src/hl_cuda_sequence.cu new file mode 100644 index 0000000000000000000000000000000000000000..1d772b5ce27615673d85231ec8fd3ab1d0aed523 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_sequence.cu @@ -0,0 +1,408 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "hl_base.h" +#include "hl_device_functions.cuh" +#include "paddle/legacy/utils/Logging.h" + +__global__ void KeMaxSequenceForward(real* input, + const int* sequence, + real* output, + int* index, + int numSequences, + int dim) { + int dimIdx = threadIdx.x; + int sequenceId = blockIdx.x; + if (sequenceId >= numSequences) return; + int start = sequence[sequenceId]; + int end = sequence[sequenceId + 1]; + + for (int i = dimIdx; i < dim; i += blockDim.x) { + real tmp = -HL_FLOAT_MAX; + int tmpId = -1; + for (int insId = start; insId < end; insId++) { + if (tmp < input[insId * dim + i]) { + tmp = input[insId * dim + i]; + tmpId = insId; + } + } + output[sequenceId * dim + i] = tmp; + index[sequenceId * dim + i] = tmpId; + } +} + +void hl_max_sequence_forward(real* input, + const int* sequence, + real* output, + int* index, + int numSequences, + int dim) { + CHECK_NOTNULL(input); + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(output); + CHECK_NOTNULL(index); + + dim3 threads(256, 1); + dim3 grid(numSequences, 1); + KeMaxSequenceForward<<>>( + input, sequence, output, index, numSequences, dim); + CHECK_SYNC("hl_max_sequence_forward failed"); +} + +__global__ void KeMaxSequenceBackward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int colIdx = idx % dim; + if (idx < numSequences * dim) { + int insId = index[idx]; + inputGrad[insId * dim + colIdx] += outputGrad[idx]; + } +} + +void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) { + CHECK_NOTNULL(outputGrad); + CHECK_NOTNULL(index); + CHECK_NOTNULL(inputGrad); + + unsigned int blocks = (numSequences * dim + 128 - 1) / 128; + dim3 threads(128, 1); + dim3 grid(blocks, 1); + KeMaxSequenceBackward<<>>( + outputGrad, index, inputGrad, numSequences, dim); + CHECK_SYNC("hl_max_sequence_backward failed"); +} + +template +__global__ void KeMatrixAddRows(real* output, + real* table, + int* ids, + int numSamples, + int tableSize, + int dim) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int sampleId = blockIdx.x + idy * gridDimX; + + while (sampleId < numSamples) { + int tableId = ids[sampleId]; + if ((0 <= tableId) && (tableId < tableSize)) { + real* outputData = output + sampleId * dim; + real* tableData = table + tableId * dim; + for (int i = idx; i < dim; i += blockDimX) { + if (AddRow == 0) { + outputData[i] += tableData[i]; + } else { + paddle::paddleAtomicAdd(&tableData[i], outputData[i]); + } + } + } + sampleId += blockDimY * gridDimX; + } +} + +template +__global__ void KeSequence2Batch(real* batch, + real* sequence, + const int* batchIndex, + int seqWidth, + int batchCount) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int id = blockIdx.x + idy * gridDimX; + while (id < batchCount) { + int seqId = batchIndex[id]; + real* batchData = batch + id * seqWidth; + real* seqData = sequence + seqId * seqWidth; + for (int i = idx; i < seqWidth; i += blockDimX) { + if (seq2batch) { + if (isAdd) { + batchData[i] += seqData[i]; + } else { + batchData[i] = seqData[i]; + } + } else { + if (isAdd) { + seqData[i] += batchData[i]; + } else { + seqData[i] = batchData[i]; + } + } + } + id += blockDimY * gridDimX; + } +} + +void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, + int seqWidth, + int batchCount, + bool seq2batch) { + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(batch); + CHECK_NOTNULL(batchIndex); + + dim3 threads(128, 8); + dim3 grid(8, 1); + if (seq2batch) { + 
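+    // Template arguments appear to be <blockDimX, blockDimY, gridDimX, seq2batch, isAdd>; the copy path instantiates with isAdd = 0, the add path below with isAdd = 1.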
KeSequence2Batch<128, 8, 8, 1, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); + } else { + KeSequence2Batch<128, 8, 8, 0, 0><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); + } + CHECK_SYNC("hl_sequence2batch_copy failed"); +} + +void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, + int seqWidth, + int batchCount, + bool seq2batch) { + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(batch); + CHECK_NOTNULL(batchIndex); + + dim3 threads(128, 8); + dim3 grid(8, 1); + if (seq2batch) { + KeSequence2Batch<128, 8, 8, 1, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); + } else { + KeSequence2Batch<128, 8, 8, 0, 1><<>>( + batch, sequence, batchIndex, seqWidth, batchCount); + } + CHECK_SYNC("hl_sequence2batch_add failed"); +} + +template +__global__ void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { + int batchIdx = blockIdx.y; + int sequenceStart = sequenceStartPositions[batchIdx]; + int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; + + int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y; + int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; + int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; + + if (sequenceIdx < sequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; + } + } else { + /* batch -> sequence */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; + } + } + } else if (sequenceIdx < maxSequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = 0; + } + } + } +} + +void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) { + CHECK_NOTNULL(batch); + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(sequenceStartPositions); + + if (!normByTimes && numSequences == 1) { + size_t elementCount = maxSequenceLength * sequenceWidth; + if (seq2batch) { + /* sequence -> batch */ + hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount); + } else { + /* batch -> sequence */ + hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount); + } + return; + } + + const int CUDA_BLOCK_SIZE = 512; + + /* At least use 32 threads to copy sequenceWidth elements, + and at least 8 elements for each thread. */ + int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5; + blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? 
blockDimX : CUDA_BLOCK_SIZE; + + int blockDimY = CUDA_BLOCK_SIZE / blockDimX; + dim3 threads(blockDimX, blockDimY); + + int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY; + int gridDimY = numSequences; + dim3 grid(gridDimX, gridDimY); + + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + KeSequence2BatchPadding<1, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); + } else { + KeSequence2BatchPadding<0, 1><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); + } + } else { + /* batch -> sequence */ + if (normByTimes) { + KeSequence2BatchPadding<1, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); + } else { + KeSequence2BatchPadding<0, 0><<>>( + batch, + sequence, + sequenceStartPositions, + sequenceWidth, + maxSequenceLength, + numSequences); + } + } + + CHECK_SYNC("hl_sequence2batch_copy_padding failed"); +} + +__device__ inline float my_rsqrt(float x) { return rsqrtf(x); } + +__device__ inline double my_rsqrt(double x) { return rsqrt(x); } + +__global__ void KeSequenceAvgForward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int row = gid / width; + int col = gid % width; + + if (gid < height * width) { + int start = starts[row]; + int end = starts[row + 1]; + int seqLength = end - start; + if (seqLength == 0) return; + real sum = 0.0; + for (int i = start; i < end; i++) { + sum += src[i * width + col]; + } + sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength + : sum * my_rsqrt((real)seqLength)); + dst[gid] += sum; + } +} + +void hl_sequence_avg_forward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_NOTNULL(starts); + + int block = 512; + int grid = DIVUP(width * height, 512); + + CHECK(mode == 0 || mode == 1 || mode == 2) + << "mode error in hl_sequence_avg_forward!"; + + KeSequenceAvgForward<<>>( + dst, src, starts, height, width, mode); + CHECK_SYNC("hl_sequence_avg_forward failed"); +} + +__global__ void KeSequenceAvgBackward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int row = gid / width; + int col = gid % width; + + if (gid < height * width) { + int start = starts[row]; + int end = starts[row + 1]; + int seqLength = end - start; + if (seqLength == 0) return; + real grad = src[gid]; + grad = mode == 1 ? grad : (mode == 0 ? 
grad / seqLength + : grad * my_rsqrt((real)seqLength)); + for (int i = start; i < end; i++) { + dst[i * width + col] += grad; + } + } +} + +void hl_sequence_avg_backward(real* dst, + real* src, + const int* starts, + int height, + int width, + const int mode) { + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_NOTNULL(starts); + + int block = 512; + int grid = DIVUP(width * height, 512); + + CHECK(mode == 0 || mode == 1 || mode == 2) + << "mode error in hl_sequence_avg_backward!"; + + KeSequenceAvgBackward<<>>( + dst, src, starts, height, width, mode); + CHECK_SYNC("hl_sequence_avg_backward failed"); +} diff --git a/paddle/legacy/cuda/src/hl_cuda_sparse.cu b/paddle/legacy/cuda/src/hl_cuda_sparse.cu new file mode 100644 index 0000000000000000000000000000000000000000..8065a6f9f6f2ac4cacf9a63b7b80dd00391824a0 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_cuda_sparse.cu @@ -0,0 +1,1262 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_cuda.h" +#include "hl_cuda_sparse.cuh" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" +#include "hl_sparse.h" +#include "hl_sparse.ph" +#include "paddle/legacy/utils/Logging.h" + +DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p); +DEFINE_MATRIX_UNARY_OP(Zero, a = 0); + +void hl_matrix_csr2dense(hl_sparse_matrix_s A_d, + real *C_d, + int dimM, + int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN); + CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!"; + + if (A_d->nnz == 0) { + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); + return; + } + + /* nnz != 0 */ + hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); + CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row && + A_d2->csr_col) + << "parameter transa error!"; + + int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; + int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; + dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X); + dim3 grid(blocksX, blocksY); + + if (A_d->type == HL_NO_VALUE) { + KeSMatrixCsr2Dense<0><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); + } else if (A_d->type == HL_FLOAT_VALUE) { + KeSMatrixCsr2Dense<1><<>>( + A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN); + } else { + } + CHECK_SYNC("hl_matrix_csr2dense failed"); +} + +void hl_matrix_csc2dense(hl_sparse_matrix_s A_d, + real *C_d, + int dimM, + int dimN) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN); + CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!"; + + if (A_d->nnz == 0) { + hl_gpu_apply_unary_op(unary::Zero(), C_d, dimM, dimN, dimN); + return; + } + + /* nnz != 0 */ + hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); + CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row && + A_d2->csc_col) + << "parameter transa error!"; + + int blocksX = 
(dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; + int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X; + dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X); + dim3 grid(blocksX, blocksY); + + if (A_d->type == HL_NO_VALUE) { + KeSMatrixCsc2Dense<0><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); + } else if (A_d->type == HL_FLOAT_VALUE) { + KeSMatrixCsc2Dense<1><<>>( + A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN); + } else { + } + CHECK_SYNC("hl_matrix_csc2dense failed"); +} + +void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, + hl_matrix_format_t format, + hl_matrix_value_t value_type, + int dimM, + int dimN, + int nnz) { + CHECK_NOTNULL(A_d); + CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) + << "sparse matrix format error!"; + CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE) + << "sparse matrix value type error!"; + /* avoid malloc 0 bytes */ + int nnz_s = (nnz == 0 ? 1 : nnz); + + if (format == HL_SPARSE_CSR) { + CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; + + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); + CHECK_NOTNULL(tmp); + + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); + csr->sparsity = -1.0; + + if (value_type == HL_NO_VALUE) { + csr->csr_val = NULL; + csr->nnz_s = nnz_s; + csr->row_s = dimM + 1; + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csr; + } else if (value_type == HL_FLOAT_VALUE) { + csr->nnz_s = nnz_s; + csr->row_s = dimM + 1; + csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int)); + csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csr; + } + } else if (format == HL_SPARSE_CSC) { + CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; + + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); + CHECK_NOTNULL(tmp); + + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); + csc->sparsity = -1.0f; + + if (value_type == HL_NO_VALUE) { + csc->csc_val = NULL; + csc->nnz_s = nnz_s; + csc->col_s = dimN + 1; + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); + + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csc; + } else if (value_type == HL_FLOAT_VALUE) { + csc->nnz_s = nnz_s; + csc->col_s = dimN + 1; + csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real)); + csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int)); + csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int)); + + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csc; + } + } + + (*A_d)->format = format; + (*A_d)->type = value_type; + (*A_d)->rows = dimM; + (*A_d)->cols = dimN; + (*A_d)->nnz = nnz; +} + +void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) { + CHECK_NOTNULL(A_d); + CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC) + << "sparse matrix format error!"; + + if (A_d->matrix == NULL) { + free(A_d); + return; + } + + if (A_d->format == HL_SPARSE_CSR) { + hl_csr_matrix csr = (hl_csr_matrix)A_d->matrix; + if (csr->csr_val != NULL) { + hl_free_mem_device(csr->csr_val); + csr->csr_val = NULL; + } + + if 
(csr->csr_row != NULL) { + hl_free_mem_device(csr->csr_row); + csr->csr_row = NULL; + } + + if (csr->csr_col != NULL) { + hl_free_mem_device(csr->csr_col); + csr->csr_col = NULL; + } + + A_d->matrix = NULL; + free(A_d); + } else if (A_d->format == HL_SPARSE_CSC) { + hl_csc_matrix csc = (hl_csc_matrix)A_d->matrix; + if (csc->csc_val != NULL) { + hl_free_mem_device(csc->csc_val); + csc->csc_val = NULL; + } + + if (csc->csc_row != NULL) { + hl_free_mem_device(csc->csc_row); + csc->csc_row = NULL; + } + + if (csc->csc_col != NULL) { + hl_free_mem_device(csc->csc_col); + csc->csc_col = NULL; + } + + A_d->matrix = NULL; + free(A_d); + } +} + +void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, + void *dest_d, + size_t size, + hl_matrix_format_t format, + hl_matrix_value_t value_type, + int dimM, + int dimN, + int nnz) { + CHECK_NOTNULL(A_d); + CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) + << "sparse matrix format error!"; + + if (format == HL_SPARSE_CSR) { + CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; + + size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int); + if (value_type != HL_NO_VALUE) { + size_ += nnz * sizeof(real); + } + CHECK_LE(size_, size) << "dest_d size(" << size + << ") too small, should bigger than(" << size_ + << ")!"; + + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); + CHECK_NOTNULL(tmp); + + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); + + if (value_type == HL_NO_VALUE) { + csr->csr_val = NULL; + csr->csr_row = (int *)dest_d; + csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int)); + } else { + csr->csr_val = (real *)dest_d; + csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real)); + csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimM + 1) * sizeof(int)); + } + csr->nnz_s = nnz; + csr->row_s = dimM + 1; + csr->sparsity = -1.0; + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csr; + } else if (format == HL_SPARSE_CSC) { + CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; + + size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int); + if (value_type != HL_NO_VALUE) { + size_ += nnz * sizeof(real); + } + CHECK_LE(size_, size) << "dest_d size(" << size + << ") too small, should bigger than(" << size_ + << ")!"; + + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); + CHECK_NOTNULL(tmp); + + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); + if (value_type == HL_NO_VALUE) { + csc->csc_val = NULL; + csc->csc_col = (int *)dest_d; + csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int)); + } else { + csc->csc_val = (real *)dest_d; + csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real)); + csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) + + (dimN + 1) * sizeof(int)); + } + csc->nnz_s = nnz; + csc->col_s = dimN + 1; + csc->sparsity = -1.0f; + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csc; + } + + (*A_d)->format = format; + (*A_d)->type = value_type; + (*A_d)->rows = dimM; + (*A_d)->cols = dimN; + (*A_d)->nnz = nnz; +} + +void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, + real *value_d, + int *rows_d, + int *cols_d, + hl_matrix_format_t format, + hl_matrix_value_t value_type, + int dimM, + int dimN, + int nnz) { + CHECK_NOTNULL(A_d); + CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!"; + + CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC) + << "sparse matrix format error!"; + + if (format == 
HL_SPARSE_CSR) { + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix)); + CHECK_NOTNULL(tmp); + + hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); + csr->csr_row = rows_d; + csr->csr_col = cols_d; + csr->csr_val = value_d; + csr->nnz_s = nnz; + csr->row_s = dimM + 1; + csr->sparsity = -1.0; + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csr; + } else if (format == HL_SPARSE_CSC) { + char *tmp = + (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix)); + CHECK_NOTNULL(tmp); + + hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s)); + csc->csc_row = rows_d; + csc->csc_col = cols_d; + csc->csc_val = value_d; + csc->nnz_s = nnz; + csc->col_s = dimN + 1; + csc->sparsity = -1.0f; + *A_d = (hl_sparse_matrix_s)tmp; + (*A_d)->matrix = (hl_matrix_s)csc; + } + + (*A_d)->format = format; + (*A_d)->type = value_type; + (*A_d)->rows = dimM; + (*A_d)->cols = dimN; + (*A_d)->nnz = nnz; +} + +void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) { + CHECK_NOTNULL(A_d); + free(A_d); +} + +void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix, + real *csr_val, + int *csr_row, + int *csr_col, + hl_stream_t stream) { + CHECK_NOTNULL(csr_matrix); + CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) + << "csr_matrix is not csr format!"; + CHECK_NOTNULL(csr_matrix->matrix); + + hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); + CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz + << " is big than alloc size " + << csr->nnz_s; + + CHECK_LE((csr_matrix->rows + 1), csr->row_s) + << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size " + << csr->row_s; + + CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; + + if (csr_matrix->type == HL_NO_VALUE) { + if (csr_row == NULL && csr_col == NULL) { + return; + } else if (csr_row != NULL && csr_col != NULL) { + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); + + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); + } else { + LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; + } + } else if (csr_matrix->type == HL_FLOAT_VALUE) { + if (csr_val == NULL && csr_row == NULL && csr_col == NULL) { + return; + } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) { + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); + } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) { + hl_memcpy_async( + csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream); + hl_memcpy_async( + csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream); + } else { + LOG(FATAL) << "parameter csr_row or csr_col is null pointer!"; + } + } + + csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) / + ((float)csr_matrix->cols); +} + +void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix, + real *csc_val, + int *csc_row, + int *csc_col, + hl_stream_t stream) { + CHECK_NOTNULL(csc_matrix); + CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) + << "csc_matrix is not csc format error!"; + + hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); + CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz + << " is big than alloc size " + << csc->nnz_s; + + CHECK_LE((csc_matrix->cols + 1), csc->col_s) + << "copy size " << (csc_matrix->cols 
+ 1) << " is big than alloc size " + << csc->col_s; + + CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE) + << "sparse matrix value type error!"; + + if (csc_matrix->type == HL_NO_VALUE) { + if (csc_row == NULL && csc_col == NULL) { + return; + } else if (csc_row != NULL && csc_col != NULL) { + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); + } else { + LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; + } + } else if (csc_matrix->type == HL_FLOAT_VALUE) { + if (csc_val == NULL && csc_row == NULL && csc_col == NULL) { + return; + } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) { + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); + } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) { + hl_memcpy_async( + csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream); + hl_memcpy_async( + csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream); + hl_memcpy_async( + csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream); + } else { + LOG(FATAL) << "parameter csc_row or csc_col is null pointer!"; + } + } + + csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) / + ((float)csc_matrix->cols); +} + +void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst, + hl_sparse_matrix_s src, + hl_stream_t stream) { + CHECK(dst && src && dst->matrix && src->matrix) + << "parameter dst or src is null pointer!"; + CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!"; + CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE) + << "src sparse matrix is no value, dst sparse matrix has value!"; + + if (dst->format == HL_SPARSE_CSR) { + dst->rows = src->rows; + dst->cols = src->cols; + dst->nnz = src->nnz; + hl_csr_matrix csr = (hl_csr_matrix)src->matrix; + hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream); + } else if (dst->format == HL_SPARSE_CSC) { + dst->rows = src->rows; + dst->cols = src->cols; + dst->nnz = src->nnz; + hl_csc_matrix csc = (hl_csc_matrix)src->matrix; + hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream); + } else { + LOG(FATAL) << "sparse matrix format error!"; + } +} + +/** + * Calculate beta * C, if beta is zero, C does not have to be a valid input. 
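+ * beta == 0 writes zeros into C, beta == 1 leaves C unchanged, and any other beta scales C in place.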
+ */ +static void _beta_mul_c(real *c, int dimM, int dimN, real beta) { + if (beta == 0.0) { + hl_gpu_apply_unary_op(unary::Zero(), c, dimM, dimN, dimN); + } else { + if (beta != 1.0) { + hl_gpu_apply_unary_op(unary::mul_scalar(beta), c, dimM, dimN, dimN); + } + } + + return; +} + +void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { + CHECK_EQ(transb, HPPL_OP_N); + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + CHECK(dimM > 0 && dimN > 0 && dimK > 0); + CHECK_EQ(A_d->format, HL_SPARSE_CSR) << "matrix format error!"; + + if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || + (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { + LOG(FATAL) << "parameter error!"; + } + + if (A_d->nnz == 0) { + _beta_mul_c(C_d, dimM, dimN, beta); + return; + } + + /* nnz != 0 */ + hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); + if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { + LOG(FATAL) << "parameter error!"; + } + + if (HPPL_OP_N == transa) { + int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; + int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y; + dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y); + dim3 grid(blocksX, blocksY); + + /* sparsity pattern */ + // A_d->sparsity; + if (A_d->type == HL_NO_VALUE) { + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else if (HPPL_OP_T == transa) { + _beta_mul_c(C_d, dimM, dimN, beta); + + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; + dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); + dim3 grid(blocksX, blocksY); + if (A_d->type == HL_NO_VALUE) { + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csr_val, + A_d2->csr_col, + A_d2->csr_row, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else { + LOG(FATAL) << "parameter transa error!"; + } + + CHECK_SYNC("hl_matrix_csr_mul_dense failed"); +} + +void hl_matrix_dense_mul_csc(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { + CHECK_EQ(transa, HPPL_OP_N); + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + + if (dimM <= 0 || dimN <= 0 || dimK <= 0 || + ((transb == HPPL_OP_N) && (B_d->rows != dimK || B_d->cols != dimN)) || + ((transb == HPPL_OP_T) && (B_d->rows != dimN || B_d->cols != dimK))) { + LOG(FATAL) << "parameter dims error!"; + } + + CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!"; + + if (B_d->nnz == 0) { + _beta_mul_c(C_d, dimM, dimN, beta); + return; + } + + /* nnz != 0 */ + hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix); + if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) || + B_d2->csc_row == NULL || B_d2->csc_col == NULL) { + LOG(FATAL) << "parameter B is null!"; + } + + if (transb == HPPL_OP_N) { + int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST 
- 1) / CU_CSCMM_BLOCK_M_BEST; + int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST; + dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); + dim3 grid(blocksX, blocksY); + + if (B_d->type == HL_NO_VALUE) { + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_row, + B_d2->csc_col, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else if (transb == HPPL_OP_T) { + _beta_mul_c(C_d, dimM, dimN, beta); + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; + dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); + dim3 grid(blocksX, blocksY); + if (B_d->type == HL_NO_VALUE) { + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csc_val, + B_d2->csc_col, + B_d2->csc_row, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else { + LOG(FATAL) << "parameter transb error!"; + } + + CHECK_SYNC("hl_matrix_dense_mul_csc failed"); +} + +void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, + hl_sparse_matrix_s B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { + CHECK_EQ(transa, HPPL_OP_N); + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + + if (dimM <= 0 || dimN <= 0 || dimK <= 0 || + (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) || + (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) { + LOG(FATAL) << "parameter dims error!"; + } + + CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!"; + + if (B_d->nnz == 0) { + _beta_mul_c(C_d, dimM, dimN, beta); + return; + } + + /* nnz != 0 */ + hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); + if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || + B_d2->csr_row == NULL || B_d2->csr_col == NULL) { + LOG(FATAL) << "parameter transa error!"; + } + + if (transb == HPPL_OP_N) { + _beta_mul_c(C_d, dimM, dimN, beta); + int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X; + int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M; + dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y); + dim3 grid(blocksX, blocksY); + if (B_d->type == HL_NO_VALUE) { + KeSMatrixDenseMulCsr<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixDenseMulCsr<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_row, + B_d2->csr_col, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else if (transb == HPPL_OP_T) { + int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST; + int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST; + dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST); + dim3 grid(blocksX, blocksY); + if (B_d->type == HL_NO_VALUE) { + KeSMatrixDenseMulCsc<0><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixDenseMulCsc<1><<>>( + C_d, + A_d, + B_d2->csr_val, + B_d2->csr_col, + B_d2->csr_row, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else { + LOG(FATAL) << "parameter transb error!"; + } + + CHECK_SYNC("hl_matrix_dense_mul_csr failed"); +} + +void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, 
+ real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { + CHECK_EQ(transb, HPPL_OP_N); + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!"; + CHECK_EQ(A_d->format, HL_SPARSE_CSC) << "matrix format error!"; + + if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) || + (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) { + LOG(FATAL) << "parameter error!"; + } + + if (A_d->nnz == 0) { + _beta_mul_c(C_d, dimM, dimN, beta); + return; + } + + /* nnz != 0 */ + hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix); + if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) || + A_d2->csc_row == NULL || A_d2->csc_col == NULL) { + LOG(FATAL) << "parameter error!"; + } + + if (HPPL_OP_N == transa) { + _beta_mul_c(C_d, dimM, dimN, beta); + + int blocksX = + (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N; + int blocksY = + (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K; + dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y); + dim3 grid(blocksX, blocksY); + if (A_d->type == HL_NO_VALUE) { + KeSMatrixCscMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixCscMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else if (HPPL_OP_T == transa) { + int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N; + int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y; + dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y); + dim3 grid(blocksX, blocksY); + + /* sparsity pattern */ + // A_d->sparsity; + if (A_d->type == HL_NO_VALUE) { + KeSMatrixCsrMulDense<0><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } else { + KeSMatrixCsrMulDense<1><<>>( + C_d, + A_d2->csc_val, + A_d2->csc_row, + A_d2->csc_col, + B_d, + dimM, + dimN, + dimK, + alpha, + beta); + } + } else { + LOG(FATAL) << "parameter transa error!"; + } + + CHECK_SYNC("hl_matrix_csc_mul_dense failed"); +} + +void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + hl_sparse_matrix_s C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + CHECK_NOTNULL(C_d); + CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!"; + CHECK_NE(C_d->type, HL_NO_VALUE) << "C value type error!"; + + if (C_d->nnz == 0) return; + + if (C_d->format == HL_SPARSE_CSC) { + hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix); + if (C_d2->csc_val == NULL || C_d2->csc_row == NULL || + C_d2->csc_col == NULL) { + LOG(FATAL) << "parameter error!"; + } + + if (beta != 1.0) { + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz); + } + + int blocksX = dimN; + int blocksY = 1; + dim3 threads(CU_CSCMM_DMD2CSC_THREAD_X, 1); + dim3 grid(blocksX, blocksY); + bool transA = transa == HPPL_OP_T ? 1 : 0; + bool transB = transb == HPPL_OP_T ? 
1 : 0; + KeSMatrixDenseMulDense2CSC<<>>( + C_d2->csc_val, + C_d2->csc_row, + C_d2->csc_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } else { + hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix); + if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) || + C_d2->csr_row == NULL || C_d2->csr_col == NULL) { + LOG(FATAL) << "parameter error!"; + } + + if (beta != 1.0) { + hl_gpu_apply_unary_op( + unary::mul_scalar(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz); + } + + bool transA = transa == HPPL_OP_T ? 1 : 0; + bool transB = transb == HPPL_OP_T ? 1 : 0; + if (!transB) { + int blocksX = dimM; + int blocksY = 1; + dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1); + dim3 grid(blocksX, blocksY); + + KeSMatrixDenseMulDense2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } else { + CHECK(!transA) << "Not supported A is trans and B is not trans!"; + + dim3 block(CU_BLOCK_SIZE, 1); + int avgNnzPerRow = C_d->nnz / dimM; + avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1; + int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE); + dim3 grid(gridx, dimM); + KeSMatrixDenseMulDenseTrans2CSR<<>>( + C_d2->csr_val, + C_d2->csr_row, + C_d2->csr_col, + A_d, + B_d, + transA, + transB, + dimM, + dimN, + dimK, + alpha, + beta); + CHECK_SYNC("hl_sparse_matrix_mul failed"); + } + } +} + +void hl_memcpy_from_csc_matrix(real *csc_val, + size_t val_size, + int *csc_row, + size_t row_size, + int *csc_col, + size_t col_size, + hl_sparse_matrix_s csc_matrix, + hl_stream_t stream) { + CHECK_NOTNULL(csc_matrix); + CHECK_NOTNULL(csc_row); + CHECK_NOTNULL(csc_col); + + CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC) + << "csc_matrix is not csc format error!"; + + if (csc_matrix->nnz > row_size || + csc_matrix->cols + 1 > static_cast(col_size)) { + LOG(FATAL) << "size not match!"; + } + + hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix); + hl_memcpy_async((void *)csc_row, + (void *)csc->csc_row, + (csc_matrix->nnz) * sizeof(int), + stream); + hl_memcpy_async((void *)csc_col, + (void *)csc->csc_col, + (csc_matrix->cols + 1) * sizeof(int), + stream); + if (csc_matrix->type == HL_FLOAT_VALUE) { + if (csc_val != NULL) { + CHECK_LE(csc_matrix->nnz, val_size) << "size not match!"; + hl_memcpy_async((void *)csc_val, + (void *)csc->csc_val, + (csc_matrix->nnz) * sizeof(real), + stream); + } else { + LOG(FATAL) << "parameter csr_val is null pointer!"; + } + } +} + +void hl_memcpy_from_csr_matrix(real *csr_val, + size_t val_size, + int *csr_row, + size_t row_size, + int *csr_col, + size_t col_size, + hl_sparse_matrix_s csr_matrix, + hl_stream_t stream) { + CHECK_NOTNULL(csr_matrix); + CHECK_NOTNULL(csr_row); + CHECK_NOTNULL(csr_col); + CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR) + << "csr_matrix is not csr format error!"; + + if (csr_matrix->nnz > col_size || + csr_matrix->rows + 1 > static_cast(row_size)) { + LOG(FATAL) << "size not match!"; + } + + hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix); + hl_memcpy_async((void *)csr_row, + (void *)csr->csr_row, + (csr_matrix->rows + 1) * sizeof(int), + stream); + hl_memcpy_async((void *)csr_col, + (void *)csr->csr_col, + (csr_matrix->nnz) * sizeof(int), + stream); + if (csr_matrix->type == HL_FLOAT_VALUE) { + if (csr_val != NULL) { + CHECK_LE(csr_matrix->nnz, val_size) << "size not match!"; + hl_memcpy_async((void *)csr_val, + (void *)csr->csr_val, + (csr_matrix->nnz) * 
sizeof(real), + stream); + } else { + LOG(FATAL) << "parameter csr_val is null pointer!"; + } + } +} + +void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { + if (B_d->format == HL_SPARSE_CSR) { + hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale); + } else { + LOG(FATAL) << "Not support CSC format error!"; + } +} + +void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + + if (dimM <= 0 || dimN <= 0 || (B_d->rows != dimM || B_d->cols != dimN)) { + LOG(FATAL) << "parameter dims error!"; + } + + hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix); + if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) || + B_d2->csr_row == NULL || B_d2->csr_col == NULL) { + LOG(FATAL) << "parameter B is null!"; + } + + if (B_d->nnz == 0) return; + + int nnz = B_d->nnz; + int block = 512; + int grid = DIVUP(nnz, 512); + KeSMatrixCsrColumnSum<<>>( + A_d, B_d2->csr_val, B_d2->csr_col, nnz); + + CHECK_SYNC("hl_matrix_csr_column_sum failed"); +} + +void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { + if (A_d->format == HL_SPARSE_CSR) { + hl_matrix_csr_add_bias(A_d, B_d, scale); + } else { + LOG(FATAL) << "Not support CSC format error!"; + } +} + +void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + + hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); + if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { + LOG(FATAL) << "parameter A_d is null!"; + } + + if (A_d->nnz == 0) return; + + int nnz = A_d->nnz; + int block = 512; + int grid = DIVUP(nnz, 512); + KeSMatrixCsrAddBias<<>>( + A_d2->csr_val, A_d2->csr_col, B_d, scale, nnz); + + CHECK_SYNC("hl_sparse_matrix_add_bias failed"); +} + +void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { + if (A_d->format == HL_SPARSE_CSR) { + hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta); + } else { + LOG(FATAL) << "Not support CSC format error!"; + } +} + +void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, + real *B_d, + int dimM, + int dimN, + real alpha, + real beta) { + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(B_d); + + if (dimM <= 0 || dimN <= 0 || A_d->rows != dimM || A_d->cols != dimN) { + LOG(FATAL) << "parameter dim error!"; + } + + hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix); + if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) || + A_d2->csr_row == NULL || A_d2->csr_col == NULL) { + LOG(FATAL) << "parameter A_d is null!"; + } + + if (A_d->nnz == 0) return; + + int gridX = DIVUP((A_d->nnz / dimM), 512); + gridX = gridX > 0 ? 
gridX : 1; + dim3 block(512, 1); + dim3 grid(gridX, dimM); + KeSMatrixCsrAddDense<<>>(A_d2->csr_val, + A_d2->csr_row, + A_d2->csr_col, + B_d, + alpha, + beta, + dimM, + dimN); + + CHECK_SYNC("hl_sparse_matrix_add_dense failed"); +} + +int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { + __sparse_get_return__(sMat, row); +} + +int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { + __sparse_get_return__(sMat, col); +} + +real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { + __sparse_get_return__(sMat, val); +} diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/legacy/cuda/src/hl_cuda_sparse.cuh similarity index 100% rename from paddle/cuda/src/hl_cuda_sparse.cuh rename to paddle/legacy/cuda/src/hl_cuda_sparse.cuh diff --git a/paddle/cuda/src/hl_math.cc b/paddle/legacy/cuda/src/hl_math.cc similarity index 100% rename from paddle/cuda/src/hl_math.cc rename to paddle/legacy/cuda/src/hl_math.cc diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/legacy/cuda/src/hl_perturbation_util.cu similarity index 100% rename from paddle/cuda/src/hl_perturbation_util.cu rename to paddle/legacy/cuda/src/hl_perturbation_util.cu diff --git a/paddle/legacy/cuda/src/hl_table_apply.cu b/paddle/legacy/cuda/src/hl_table_apply.cu new file mode 100644 index 0000000000000000000000000000000000000000..7411ae35d382833253e3ceabe36b3a1938138028 --- /dev/null +++ b/paddle/legacy/cuda/src/hl_table_apply.cu @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "hl_base.h" +#include "hl_cuda.h" +#include "hl_device_functions.cuh" +#include "paddle/legacy/utils/Logging.h" + +template +__global__ void KeMatrixAddRows(real* output, + int ldo, + real* table, + int ldt, + int* ids, + int numSamples, + int tableSize, + int dim) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * gridDimX; + + while (idy < numSamples) { + int tableId = ids[idy]; + if ((0 <= tableId) && (tableId < tableSize)) { + real* out = output + idy * ldo; + real* tab = table + tableId * ldt; + for (int i = idx; i < dim; i += blockDimX) { + if (AddRow) { + paddle::paddleAtomicAdd(&tab[i], out[i]); + } else { + out[i] += tab[i]; + } + } + } + idy += blockDimY * gridDimX; + } +} + +void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, + int* ids, + int numSamples, + int tableSize, + int dim) { + CHECK_NOTNULL(output); + CHECK_NOTNULL(table); + CHECK_NOTNULL(ids); + + dim3 threads(128, 8); + dim3 grid(8, 1); + KeMatrixAddRows<128, 8, 8, 0><<>>( + output, ldo, table, ldt, ids, numSamples, tableSize, dim); + + CHECK_SYNC("hl_matrix_select_rows failed"); +} + +void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, + int* ids, + int numSamples, + int tableSize, + int dim) { + CHECK_NOTNULL(input); + CHECK_NOTNULL(table); + CHECK_NOTNULL(ids); + + dim3 threads(128, 8); + dim3 grid(8, 1); + KeMatrixAddRows<128, 8, 8, 1><<>>( + input, ldi, table, ldt, ids, numSamples, tableSize, dim); + + CHECK_SYNC("hl_matrix_add_to_rows failed"); +} + +template +__global__ void KeVectorSelect( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { + int idx = threadIdx.x + blockDimX * blockIdx.x; + while (idx < sizei) { + int index = ids[idx]; + // check(index < sizes); + dst[idx] = src[index]; + idx += blockDimX * gridDimX; + } +} + +template +void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) { + CHECK_NOTNULL(dst); + CHECK_NOTNULL(src); + CHECK_NOTNULL(ids); + CHECK_EQ(sized, sizei); + + dim3 threads(512, 1); + dim3 grid(8, 1); + KeVectorSelect<<>>( + dst, sized, src, sizes, ids, sizei); + + CHECK_SYNC("hl_vector_select_from failed"); +} + +template void hl_vector_select_from(real* dst, + int sized, + const real* src, + int sizes, + const int* ids, + int sizei); +template void hl_vector_select_from( + int* dst, int sized, const int* src, int sizes, const int* ids, int sizei); diff --git a/paddle/cuda/src/hl_time.cc b/paddle/legacy/cuda/src/hl_time.cc similarity index 100% rename from paddle/cuda/src/hl_time.cc rename to paddle/legacy/cuda/src/hl_time.cc diff --git a/paddle/legacy/cuda/src/hl_top_k.cu b/paddle/legacy/cuda/src/hl_top_k.cu new file mode 100644 index 0000000000000000000000000000000000000000..041ac419f5addfa49148270b8a8b421eb8ada78c --- /dev/null +++ b/paddle/legacy/cuda/src/hl_top_k.cu @@ -0,0 +1,481 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/cuda/include/hl_base.h" +#include "paddle/legacy/cuda/include/hl_sparse.ph" +#include "paddle/legacy/cuda/include/hl_top_k.h" +#include "paddle/legacy/utils/Logging.h" + +// using namespace hppl; + +struct Pair { + __device__ __forceinline__ Pair() {} + + __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {} + + __device__ __forceinline__ void set(real value, int id) { + v_ = value; + id_ = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v_ = in.v_; + id_ = in.id_; + } + + __device__ __forceinline__ bool operator<(const real value) const { + return (v_ < value); + } + + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_)); + } + + real v_; + int id_; +}; + +__device__ __forceinline__ void addTo(Pair topK[], + const Pair& p, + int beamSize) { + for (int k = beamSize - 2; k >= 0; k--) { + if (topK[k] < p) { + topK[k + 1] = topK[k]; + } else { + topK[k + 1] = p; + return; + } + } + topK[0] = p; +} + +template +__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) { + for (int k = beamSize - 2; k >= 0; k--) { + if (topK[k] < p) { + topK[k + 1] = topK[k]; + } else { + topK[k + 1] = p; + return; + } + } + topK[0] = p; +} + +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < src[idx]) { + Pair tmp(src[idx], idx); + addTo(topK, tmp, beamSize); + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + addTo(topK, tmp, beamSize); + } + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void getTopK( + Pair topK[], real* val, int* col, int idx, int dim, int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + addTo(topK, tmp, beamSize); + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void getTopK(Pair topK[], + real* val, + int* col, + int idx, + int dim, + const Pair& max, + int beamSize) { + while (idx < dim) { + if (topK[beamSize - 1] < val[idx]) { + Pair tmp(val[idx], col[idx]); + if (tmp < max) { + addTo(topK, tmp, beamSize); + } + } + idx += blockSize; + } +} + +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* src, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { + if (beam > 0) { + int length = beam < beamSize ? beam : beamSize; + if (firstStep) { + firstStep = false; + getTopK(topK, src, tid, dim, length); + } else { + for (int k = 0; k < maxLength; k++) { + if (k < maxLength - beam) { + topK[k] = topK[k + beam]; + } else { + topK[k].set(-HL_FLOAT_MAX, -1); + } + } + if (!isEmpty) { + getTopK(topK + maxLength - beam, src, tid, dim, max, length); + } + } + + max = topK[maxLength - 1]; + if (max.id_ == -1) isEmpty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void threadGetTopK(Pair topK[], + int& beam, + int beamSize, + real* val, + int* col, + bool& firstStep, + bool& isEmpty, + Pair& max, + int dim, + const int tid) { + if (beam > 0) { + int length = beam < beamSize ? 
beam : beamSize; + if (firstStep) { + firstStep = false; + getTopK(topK, val, col, tid, dim, length); + } else { + for (int k = 0; k < maxLength; k++) { + if (k < maxLength - beam) { + topK[k] = topK[k + beam]; + } else { + topK[k].set(-HL_FLOAT_MAX, -1); + } + } + if (!isEmpty) { + getTopK( + topK + maxLength - beam, val, col, tid, dim, max, length); + } + } + + max = topK[maxLength - 1]; + if (max.id_ == -1) isEmpty = true; + beam = 0; + } +} + +template +__device__ __forceinline__ void blockReduce(Pair* shTopK, + int* maxId, + Pair topK[], + real** topVal, + int** topIds, + int& beam, + int& beamSize, + const int tid, + const int warp) { + while (true) { + __syncthreads(); + if (tid < blockSize / 2) { + if (shTopK[tid] < shTopK[tid + blockSize / 2]) { + maxId[tid] = tid + blockSize / 2; + } else { + maxId[tid] = tid; + } + } + __syncthreads(); + for (int stride = blockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) { + maxId[tid] = maxId[tid + stride]; + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = shTopK[maxId[0]].v_; + **topIds = shTopK[maxId[0]].id_; + (*topVal)++; + (*topIds)++; + } + if (tid == maxId[0]) beam++; + if (--beamSize == 0) break; + __syncthreads(); + + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + if (tid == maxId[0]) { + if (beam < maxLength) { + shTopK[tid] = topK[beam]; + } + } + if (maxId[0] / 32 == warp) { + if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top maxLength value; + * 2. merge to shTopK, block reduce and get max value; + * 3. go to the second setp, until one thread's topK value is null; + * 4. go to the first setp, until get the topK value. + */ +template +__global__ void KeMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int beamSize) { + __shared__ Pair shTopK[blockSize]; + __shared__ int maxId[blockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + src += blockIdx.x * lds; + topVal += blockIdx.x * ldv; + topIds += blockIdx.x * beamSize; + + Pair topK[maxLength]; // NOLINT + int beam = maxLength; + Pair max; + bool isEmpty = false; + bool firstStep = true; + + for (int k = 0; k < maxLength; k++) { + topK[k].set(-HL_FLOAT_MAX, -1); + } + while (beamSize) { + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + + shTopK[tid] = topK[0]; + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + } +} + +template +__global__ void KeSMatrixTopK(real* topVal, + int ldv, + int* topIds, + real* val, + int* row, + int* col, + int beamSize) { + __shared__ Pair shTopK[blockSize]; + __shared__ int maxId[blockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + topVal += blockIdx.x * ldv; + topIds += blockIdx.x * beamSize; + + Pair topK[maxLength]; // NOLINT + int beam = maxLength; + Pair max; + bool isEmpty = false; + bool firstStep = true; + + int start = row[blockIdx.x]; + int end = row[blockIdx.x + 1]; + int dim = end - start; + val += start; + col += start; + + if (beamSize > dim) { + // if the number of values to sort are less than the output size, + // use -1 to indicate the end of valid sorted values. 
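  // Only thread 0 writes the -1 sentinel; every thread then clamps beamSize
  // to dim so the beam search below never reads past this row's non-zeros.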
+ if (tid == 0) { + topIds[dim] = -1; + } + + beamSize = dim; + } + + for (int k = 0; k < maxLength; k++) { + topK[k].set(-HL_FLOAT_MAX, -1); + } + while (beamSize) { + threadGetTopK( + topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid); + + shTopK[tid] = topK[0]; + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + } +} + +void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int beamSize, + int numSamples) { + CHECK_NOTNULL(topVal); + CHECK_NOTNULL(topIds); + CHECK_NOTNULL(src); + + if (beamSize > dim) beamSize = dim; + + dim3 threads(256, 1); + dim3 grid(numSamples, 1); + KeMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, beamSize); + + CHECK_SYNC("hl_matrix_top_k failed"); +} + +void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, + hl_sparse_matrix_s src, + int beamSize, + int numSamples) { + CHECK_NOTNULL(topVal); + CHECK_NOTNULL(topIds); + CHECK_NOTNULL(src); + CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!"; + + hl_csr_matrix csr = (hl_csr_matrix)src->matrix; + if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) { + LOG(FATAL) << "parameter src is null!"; + } + + dim3 threads(256, 1); + dim3 grid(numSamples, 1); + KeSMatrixTopK<5, 256><<>>( + topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize); + + CHECK_SYNC("hl_sparse_matrix_top_k failed"); +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top maxLength value; + * 2. merge to shTopK, block reduce and get max value; + * 3. go to the second setp, until one thread's topK value is null; + * 4. go to the first setp, until get the topK value. + */ +template +__global__ void KeMatrixTopKClassificationError(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int beamSize, + int* label, + real* recResult) { + __shared__ Pair shTopK[blockSize]; + __shared__ int maxId[blockSize / 2]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + src += blockIdx.x * lds; + topVal += blockIdx.x * ldv; + topIds += blockIdx.x * beamSize; + + Pair topK[maxLength]; // NOLINT + int beam = maxLength; + Pair max; + bool isEmpty = false; + bool firstStep = true; + int topkSize = beamSize; + + for (int k = 0; k < maxLength; k++) { + topK[k].set(-HL_FLOAT_MAX, -1); + } + + while (beamSize) { + threadGetTopK( + topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid); + + shTopK[tid] = topK[0]; + blockReduce( + shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp); + } + + __syncthreads(); + if (tid == 0) { + for (int i = 0; i < topkSize; i++) { + if (*--topIds == label[blockIdx.x]) { + recResult[blockIdx.x] = 0; + break; + } + recResult[blockIdx.x] = 1.0f; + } + } +} + +void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) { + CHECK_NOTNULL(topVal); + CHECK_NOTNULL(topIds); + CHECK_NOTNULL(src); + + if (topkSize > dim) topkSize = dim; + + dim3 threads(256, 1); + dim3 grid(numSamples, 1); + KeMatrixTopKClassificationError<5, 256><<>>( + topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult); + + CHECK_SYNC("hl_matrix_top_k classification error failed"); +} diff --git a/paddle/legacy/cuda/src/hl_warpctc_wrap.cc b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc new file mode 100644 index 
0000000000000000000000000000000000000000..31a8652f1f55387ae48cb516cd092442be784cbb --- /dev/null +++ b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_warpctc_wrap.h" +#include +#include "paddle/legacy/utils/DynamicLoader.h" +#include "paddle/legacy/utils/Logging.h" + +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. When PADDLE_USE_DSO is + * false, you need to add the path of libwarp-ctc.so to + * the linked-libs of paddle or to LD_PRELOAD. + */ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ + std::call_once( \ + warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + } __name; // struct DynLoad__##__name + +// include all needed warp-ctc functions +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} /* namespace dynload */ + +#define WARPCTC_GET_VERSION dynload::get_warpctc_version +#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString + +static int g_warpctcVersion = -1; +#ifndef PADDLE_TYPE_DOUBLE +#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss +#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size +#else +hl_warpctc_status_t fatal(...) { + LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion + << "] Error: not support double precision."; + // both of get_warpctc_version() and get_workspace_size() return an ctcStatus + // type value + return CTC_STATUS_EXECUTION_FAILED; +} +#define WARPCTC_COMPUTE_LOSS fatal +#define WARPCTC_GET_WORKSPACE_SIZE fatal +#endif + +/** + * Check build-in warp-ctc function using glog and it also + * support << operator for more details error info. 
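 * Typical call sites (illustrative) wrap any warp-ctc routine that returns a
 * status code, e.g.
 *   CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(...)) << "while computing CTC loss";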
+ */ +#define CHECK_WARPCTC(warpctcStat) \ + CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ + << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " " + +void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(options); + + g_warpctcVersion = WARPCTC_GET_VERSION(); + + if (useGpu) { +#ifdef __NVCC__ + options->loc = CTC_GPU; + options->stream = STREAM_DEFAULT; +#else + LOG(FATAL) << "[warpctc init] GPU is not enabled."; +#endif + } else { + options->loc = CTC_CPU; + options->num_threads = 1; + } + + options->blank_label = blank; +} + +void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(batchInput); + CHECK_NOTNULL(cpuLabels); + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(cpuCosts); + CHECK_NOTNULL(workspace); + CHECK_NOTNULL(options); + + CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput, + batchGrad, + cpuLabels, + cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + cpuCosts, + workspace, + *options)); +} + +void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes) { + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(options); + CHECK_NOTNULL(bytes); + + CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + *options, + bytes)); +} diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/legacy/function/BlockExpandOp.cpp similarity index 100% rename from paddle/function/BlockExpandOp.cpp rename to paddle/legacy/function/BlockExpandOp.cpp diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/legacy/function/BlockExpandOpTest.cpp similarity index 100% rename from paddle/function/BlockExpandOpTest.cpp rename to paddle/legacy/function/BlockExpandOpTest.cpp diff --git a/paddle/legacy/function/BufferArg.cpp b/paddle/legacy/function/BufferArg.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f3d505c31bf8d50503032a4baae6230b9f7241d --- /dev/null +++ b/paddle/legacy/function/BufferArg.cpp @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "BufferArg.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +const SequenceArg& BufferArg::sequence() const { + CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); + return dynamic_cast(*this); +} + +const SparseMatrixArg& BufferArg::sparse() const { + CHECK_EQ(bufferType_, TENSOR_SPARSE); + return dynamic_cast(*this); +} + +SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType) + : BufferArg(sparse, argType), + row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32), + nnz_(sparse.getElementCnt()), + format_(static_cast(sparse.getFormat())), + type_(static_cast(sparse.getValueType())) { + bufferType_ = TENSOR_SPARSE; +} + +SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType) + : BufferArg(sparse, argType), + row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32), + nnz_(sparse.getElementCnt()), + format_(static_cast(sparse.getFormat())), + type_(static_cast(sparse.getValueType())) { + bufferType_ = TENSOR_SPARSE; +} + +} // namespace paddle diff --git a/paddle/legacy/function/BufferArg.h b/paddle/legacy/function/BufferArg.h new file mode 100644 index 0000000000000000000000000000000000000000..1f47ad556d29363d784fde718fdacdf0658ef010 --- /dev/null +++ b/paddle/legacy/function/BufferArg.h @@ -0,0 +1,364 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "TensorShape.h" +#include "TensorType.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +enum BufferType { + TENSOR_UNKNOWN = 0, + TENSOR_NORMAL = 1, + TENSOR_SEQUENCE_ID = 2, + TENSOR_SEQUENCE_DATA = 3, + TENSOR_SPARSE = 4 +}; + +class BufferArg; +class SequenceArg; +class SparseMatrixArg; + +/** + * \brief BufferArg used as the argument type of Function. + * + * The arguments of the Paddle Function have four Buffer types. + * 1. BufferArg for a dense Buffer of any dimension. + * 2. SequenceIdArg for a Buffer of sequence start positions. + * 3. SequenceArg for a Buffer of sequence data. + * 4. SparseMatrixArg for a Buffer of sparse matrix. + * + * Buffer shape + * For most buffers, the first dimension `shape()[0]` represents + * the size of the mini-batch. + * + * Buffer argType + * There is an ArgType property for the BufferArg used as Function Output. + * Whether the result of the Function calculation is assigned to the + * output Buffer or added to the output Buffer is determined by the + * argType_ property of the output BufferArg. + */ + +// ArgType is only used by output BufferArg. +// For input argument, argType_ is ignored. +// For output argument, need to set the argType_ of the BufferArg. 
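// Illustrative use, assuming a float output buffer outPtr of shape
// {batch, dim}: constructing it as
//   BufferArg out(outPtr, VALUE_TYPE_FLOAT, TensorShape{batch, dim}, ADD_TO);
// asks the Function to accumulate its result into the buffer, while
// ASSIGN_TO asks it to overwrite the buffer instead.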
+enum ArgType { + UNSPECIFIED = 0, + ASSIGN_TO = 1, + ADD_TO = 2, +}; +class BufferArg { + public: + void setArgType(ArgType argType) { argType_ = argType; } + + ArgType getArgType() const { return argType_; } + + public: + BufferArg(ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) { + bufferType_ = TENSOR_NORMAL; + } + + BufferArg(void* buf, + ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) { + bufferType_ = TENSOR_NORMAL; + } + + BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) { + bufferType_ = TENSOR_NORMAL; + } + + BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(matrix.getData()))), + valueType_(DataType::value), + shape_(2), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + shape_.setDim(0, matrix.getHeight()); + shape_.setDim(1, matrix.getWidth()); + } + + BufferArg(const Matrix& matrix, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(matrix.getData()))), + valueType_(DataType::value), + shape_(shape), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + CHECK_EQ(matrix.getElementCnt(), shape.getElements()); + } + + BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(vector.getData()))), + valueType_(DataType::value), + shape_(1), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + shape_.setDim(0, vector.getSize()); + } + + BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED) + : buf_( + const_cast(reinterpret_cast(vector.getData()))), + valueType_(VALUE_TYPE_INT32), + shape_(1), + argType_(argType) { + bufferType_ = TENSOR_NORMAL; + shape_.setDim(0, vector.getSize()); + } + + template + typename Tensor::Matrix matrix() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ((size_t)2, shape_.ndims()); + return typename Tensor::Matrix( + reinterpret_cast(buf_), shape_[0], shape_[1]); + } + + template + typename Tensor::Vector vector() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ((size_t)1, shape_.ndims()); + return typename Tensor::Vector( + shape_[0], reinterpret_cast(buf_)); + } + + virtual ~BufferArg() {} + + template + T* data() const { + return reinterpret_cast(buf_); + } + + void* data() const { return buf_; } + ValueType valueType() const { return valueType_; } + BufferType bufferType() const { return bufferType_; } + const TensorShape& shape() const { return shape_; } + bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; } + bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; } + virtual size_t numElements() const { return shape_.getElements(); } + + const SequenceArg& sequence() const; + const SparseMatrixArg& sparse() const; + + protected: + void* buf_; + ValueType valueType_; + TensorShape shape_; + BufferType bufferType_{TENSOR_UNKNOWN}; + ArgType argType_{UNSPECIFIED}; + // TODO(tianbing), add deviceType_ + // leading dimensions. 
The size is dims_.size() + // Dims lds_; +}; + +// sequence start positions in a mini-batch of sequences +// shape_.ndims() == 1 +// valueType_ = int32 +// if a < b then value_.buf_[a] < value_.buf_[b] +class SequenceIdArg : public BufferArg { + public: + SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED) + : BufferArg(VALUE_TYPE_INT32, shape, argType) { + bufferType_ = TENSOR_SEQUENCE_ID; + CHECK_EQ(shape_.ndims(), 1UL); + CHECK_GE(shape_[0], 1UL); + numSeqs_ = shape_[0] - 1; + } + + SequenceIdArg(void* buf, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { + bufferType_ = TENSOR_SEQUENCE_ID; + CHECK_EQ(shape_.ndims(), 1UL); + numSeqs_ = shape_[0] - 1; + } + + SequenceIdArg(const IVector& vector) : BufferArg(vector) { + bufferType_ = TENSOR_SEQUENCE_ID; + numSeqs_ = shape_[0] - 1; + } + + ~SequenceIdArg() {} + + size_t numSeqs() const { return numSeqs_; } + + private: + size_t numSeqs_; +}; + +// sequences data +// For mini-batch calculate, +// one batch can contain more than one sequence of data. +// SequenceArg can be used to represent sequences that contain multiple +// unequal lengths. +class SequenceArg : public BufferArg { + public: + SequenceArg(ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : BufferArg(valueType, shape, argType), + startPositions_(TensorShape({shape[0]})) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } + + SequenceArg(void* buf, + ValueType valueType, + const TensorShape& shape, + const SequenceIdArg& startPositions, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, valueType, shape, argType), + startPositions_(startPositions) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } + + SequenceArg(const Matrix& matrix, + const IVector& vector, + ArgType argType = UNSPECIFIED) + : BufferArg(matrix, argType), startPositions_(vector) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } + + ~SequenceArg() {} + + void* getIdBuf() const { return startPositions_.data(); } + size_t numSeqs() const { return startPositions_.numSeqs(); } + SequenceIdArg& getSequenceId() { return startPositions_; } + const SequenceIdArg& getSequenceId() const { return startPositions_; } + + private: + SequenceIdArg startPositions_; +}; + +// sparse matrix +// valueType_ == float or double +// shape_.ndims() == 2 +class SparseMatrixArg : public BufferArg { + public: + SparseMatrixArg(void* buf, + ValueType valueType, + const TensorShape& shape, + const BufferArg& row, + const BufferArg& col, + size_t nnz, + SparseFormat format, + SparseValueType type, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, valueType, shape, argType), + row_(row), + col_(col), + nnz_(nnz), + format_(static_cast(format)), + type_(static_cast(type)) { + bufferType_ = TENSOR_SPARSE; + CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); + CHECK_EQ(shape_.ndims(), 2UL); + CHECK_EQ(row_.shape().ndims(), 1UL); + CHECK_EQ(col_.shape().ndims(), 1UL); + if (format_ == T_SPARSE_CSR) { + CHECK_EQ(nnz, col.shape()[0]); + } else if (format_ == T_SPARSE_CSC) { + CHECK_EQ(nnz, row.shape()[0]); + } + } + + SparseMatrixArg(ValueType valueType, + const TensorShape& shape, + size_t nnz, + SparseFormat format, + SparseValueType type, + ArgType argType = UNSPECIFIED) + : BufferArg(valueType, shape, argType), + row_(BufferArg(nullptr, VALUE_TYPE_INT32)), + col_(BufferArg(nullptr, VALUE_TYPE_INT32)), + nnz_(nnz), + format_(static_cast(format)), + type_(static_cast(type)) { + bufferType_ = TENSOR_SPARSE; + 
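    // No index buffers were supplied, so row_ and col_ are sized below from
    // the format alone: CSR keeps height + 1 row offsets plus nnz column
    // indices, CSC keeps nnz row indices plus width + 1 column offsets.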
CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); + CHECK_EQ(shape_.ndims(), 2UL); + + /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr + row_ = (format_ == T_SPARSE_CSR + ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1}) + : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})); + /// len of col_ : width + 1 (CSC) or nnz (CSR), buf_ == nullptr + col_ = (format_ == T_SPARSE_CSR + ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}) + : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1})); + } + + SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); + + SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); + + template + typename Tensor::SparseMatrix SparseMatrix() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ(2UL, shape_.ndims()); + return typename Tensor::SparseMatrix( + reinterpret_cast(buf_), + reinterpret_cast(row_.data()), + reinterpret_cast(col_.data()), + shape_[0], + shape_[1], + nnz_, + static_cast(type_), + static_cast(format_), + false); + } + + ~SparseMatrixArg() {} + + void* getRowBuf() const { return row_.data(); } + + void* getColBuf() const { return col_.data(); } + + size_t nnz() const { return nnz_; } + + size_t numElements() const override { return nnz_; } + + SparseDataFormat dataFormat() const { return format_; } + + SparseDataType dataType() const { return type_; } + + private: + BufferArg row_; + BufferArg col_; + size_t nnz_; + SparseDataFormat format_; + SparseDataType type_; +}; + +} // namespace paddle diff --git a/paddle/legacy/function/BufferArgTest.cpp b/paddle/legacy/function/BufferArgTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ec153bea89f25414b0df3088ab0c366c92ecbe0 --- /dev/null +++ b/paddle/legacy/function/BufferArgTest.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "BufferArg.h" +#include +#include "paddle/legacy/math/MemoryHandle.h" + +namespace paddle { + +TEST(BufferTest, BufferArg) { + TensorShape shape({8, 10}); + CpuMemoryHandle memory(shape.getElements() * + sizeOfValuType(VALUE_TYPE_FLOAT)); + BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape); + EXPECT_EQ(buffer.data(), memory.getBuf()); +} + +TEST(BufferTest, SequenceIdArg) { + TensorShape shape({10}); + CpuMemoryHandle memory(shape.getElements() * + sizeOfValuType(VALUE_TYPE_INT32)); + SequenceIdArg buffer(memory.getBuf(), shape); + EXPECT_EQ(buffer.data(), memory.getBuf()); + EXPECT_EQ(buffer.numSeqs(), 9U); +} + +} // namespace paddle diff --git a/paddle/function/CMakeLists.txt b/paddle/legacy/function/CMakeLists.txt similarity index 100% rename from paddle/function/CMakeLists.txt rename to paddle/legacy/function/CMakeLists.txt diff --git a/paddle/legacy/function/ContextProjectionOp.cpp b/paddle/legacy/function/ContextProjectionOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05a3f915862b6657fc0a4300cbbea36721219e10 --- /dev/null +++ b/paddle/legacy/function/ContextProjectionOp.cpp @@ -0,0 +1,412 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ContextProjectionOp.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { +/** + * Context Projection Forward with CPU Matrix Device. 
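 * For every sequence and every context offset j, rows inside the sequence
 * are added into the j-th column block of out_mat; positions that fall
 * outside the sequence borrow the corresponding rows of weight_mat (the
 * learned padding) when it is provided.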
+ * + */ +template <> +void ContextProjectionForward(CpuMatrix& out_mat, + const CpuMatrix& input_mat, + const CpuMatrix& weight_mat, + const CpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t begin_pad) { + const int* starts = seq_vec.getData(); + const size_t num_sequences = seq_vec.getSize() - 1; + for (size_t i = 0; i < num_sequences; ++i) { + for (size_t j = 0; j < context_length; ++j) { + int begin = starts[i] + context_start + j; + int end = starts[i + 1] + context_start + j; + int dst_begin = starts[i]; + int dst_end = starts[i + 1]; + if (begin < starts[i]) { + int64_t pad_size = + std::min(starts[i] - begin, starts[i + 1] - starts[i]); + MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size); + if (weight_mat) { + MatrixPtr sub = + const_cast(weight_mat).subMatrix(j, pad_size); + mat->addAtOffset(*sub, j * input_mat.getWidth()); + } + dst_begin = starts[i] + pad_size; + begin = starts[i]; + } + if (end > starts[i + 1]) { + int64_t pad_size = + std::min(end - starts[i + 1], starts[i + 1] - starts[i]); + MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size); + if (weight_mat) { + MatrixPtr sub = + const_cast(weight_mat) + .subMatrix(begin_pad + context_start + j - pad_size, + pad_size); + mat->addAtOffset(*sub, j * input_mat.getWidth()); + } + dst_end = starts[i + 1] - pad_size; + end = starts[i + 1]; + } + if (end <= begin) continue; + MatrixPtr src = + const_cast(input_mat).subMatrix(begin, end - begin); + MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin); + dst->addAtOffset(*src, j * input_mat.getWidth()); + } + } +} + +/** + * Paddle Function for Context Projection Forward. + * Calculate the output layer value sequence after context projection. + * + * What is Context Projection for a sequence? + * For example, assumed input (x) has 4 words and the dimension of each word + * representation is 2. 
If we use zero to pad instead of learned weight to pad, + * and the context_lenth is 3, the output (y) is: + * + * @code + * x = [a1, a2; + * b1, b2; + * c1, c2; + * d1, d2] + * y = [0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, d1, d2; + * c1, c2, d1, d2, 0, 0] + * @endcode + * + * \param outputs[0].matrix output layer value, n * (d * l) + * \param outputs[0].vector start position sequence, n * 1 + * \param inputs[0].matrix input layer value, n * d + * \param inputs[0].vector start position sequence, n * 1 + * \param inputs[1].matrix input layer weight, pad * d + */ +template +class ContextProjectionForwardFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get("context_length"); + context_start_ = config.get("context_start"); + begin_pad_ = config.get("begin_pad"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK(1UL == inputs.size() || 2UL == inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto val_seqs = dynamic_cast(inputs[0]); + auto out_seq = dynamic_cast(outputs[0]); + + CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data()); + CHECK_EQ(out_seq.shape().ndims(), 2UL); + CHECK_EQ(val_seqs.shape().ndims(), 2UL); + /// dim of output = dim of input * context_length + CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_); + /// input and output has the same batch_size + CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]); + if (2UL == inputs.size()) { + CHECK_EQ(inputs[1].shape().ndims(), 2UL); + /// dim of input == dim of weight + CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]); + } + + CHECK_EQ(out_seq.getArgType(), ADD_TO); + auto out_mat = out_seq.matrix(); + const auto in_mat = val_seqs.matrix(); + const auto w_mat = + (2UL == inputs.size() && inputs[1].data()) + ? inputs[1].matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + const auto seq_vec = val_seqs.getSequenceId().vector(); + + ContextProjectionForward(out_mat, + in_mat, + w_mat, + seq_vec, + context_length_, + context_start_, + begin_pad_); + } + + private: + size_t context_length_; + int context_start_; + size_t begin_pad_; +}; + +/** + * Context Projection Backward with CPU Matrix Device. + * + */ +template <> +void ContextProjectionBackward(const CpuMatrix& out_grad_mat, + CpuMatrix& in_grad_mat, + CpuMatrix& w_grad_mat, + const CpuIVector& seq_vec, + size_t context_length, + int context_start, + size_t begin_pad, + bool is_padding, + size_t total_pad) { + size_t input_dim = in_grad_mat ? in_grad_mat.getWidth() + : w_grad_mat ? 
w_grad_mat.getWidth() : 0; + const int* starts = seq_vec.getData(); + size_t num_sequences = seq_vec.getSize() - 1; + for (size_t i = 0; i < num_sequences; ++i) { + for (size_t j = 0; j < context_length; ++j) { + int begin = starts[i] + context_start + j; + int end = starts[i + 1] + context_start + j; + int dst_begin = starts[i]; + int dst_end = starts[i + 1]; + if (begin < starts[i]) { + int64_t pad_size = + std::min(starts[i] - begin, starts[i + 1] - starts[i]); + if (is_padding && w_grad_mat) { + MatrixPtr mat = const_cast(out_grad_mat) + .subMatrix(starts[i], pad_size); + MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); + sub->addAtOffset(*mat, j * input_dim); + } + dst_begin = starts[i] + pad_size; + begin = starts[i]; + } + if (end > starts[i + 1]) { + int64_t pad_size = + std::min(end - starts[i + 1], starts[i + 1] - starts[i]); + if (is_padding && w_grad_mat) { + MatrixPtr mat = const_cast(out_grad_mat) + .subMatrix(starts[i + 1] - pad_size, pad_size); + MatrixPtr sub = w_grad_mat.subMatrix( + begin_pad + context_start + j - pad_size, pad_size); + sub->addAtOffset(*mat, j * input_dim); + } + dst_end = starts[i + 1] - pad_size; + end = starts[i + 1]; + } + if (end <= begin) continue; + if (!in_grad_mat) continue; + MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); + MatrixPtr dst = const_cast(out_grad_mat) + .subMatrix(dst_begin, dst_end - dst_begin); + src->addAtOffset(*dst, j * input_dim); + } + } +} + +/** + * Context Projection Backward Function. + * Update the weight gradient and input layer gradient with backprop + * + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 + * \param outputs[0].matrix input layer grad, n * d + * \param outputs[0].vector start position sequence, n * 1 + * \param outputs[1] weight grad, pad * d + */ +template +class ContextProjectionBackwardFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get("context_length"); + context_start_ = config.get("context_start"); + begin_pad_ = config.get("begin_pad"); + is_padding_ = config.get("is_padding"); + total_pad_ = config.get("total_pad"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK(1UL == outputs.size() || 2UL == outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto in_seq = dynamic_cast(inputs[0]); + auto out_seq = dynamic_cast(outputs[0]); + CHECK(in_seq.data() && in_seq.getSequenceId().data()); + CHECK_EQ(in_seq.shape().ndims(), 2UL); + CHECK_EQ(out_seq.shape().ndims(), 2UL); + CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL); + + /// input and output grad has the same batch_size + CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]); + /// dim of output grad = dim of input grad * context_length + CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); + CHECK_EQ(out_seq.getArgType(), ADD_TO); + + if (2UL == outputs.size()) { + CHECK_EQ(outputs[1].shape().ndims(), 2UL); + /// dim of input grad == dim of weight + CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]); + CHECK_EQ(outputs[1].getArgType(), ADD_TO); + } + + const auto seq_vec = in_seq.getSequenceId().vector(); + const auto out_grad_mat = in_seq.matrix(); + auto in_grad_mat = + !out_seq.data() ? typename Tensor::Matrix(nullptr, 0, 0) + : out_seq.matrix(); + auto w_grad_mat = + (2UL == outputs.size() && outputs[1].data()) + ? 
outputs[1].matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + + ContextProjectionBackward(out_grad_mat, + in_grad_mat, + w_grad_mat, + seq_vec, + context_length_, + context_start_, + begin_pad_, + is_padding_, + total_pad_); + } + + private: + size_t context_length_; + int context_start_; + size_t begin_pad_; + bool is_padding_; + size_t total_pad_; +}; + +/** + * Context Projection Backward Data Function + * Update input layer grad + * input: sequence of output layer grad + * output: sequence of input layer grad + * + * \param outputs[0].matrix input layer grad, n * d + * \param outputs[0].vector start position sequence, n * 1 + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start positon sequence, n * 1 + */ +template +class ContextProjectionBackwardDataFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get("context_length"); + context_start_ = config.get("context_start"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto in_seq = dynamic_cast(inputs[0]); + const auto out_seq = dynamic_cast(outputs[0]); + + CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data()); + CHECK_EQ(out_seq.shape().ndims(), 2UL); + CHECK_EQ(in_seq.shape().ndims(), 2UL); + CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); + /// output layer grad dim == input layer grad dim * context_length_ + CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_); + /// input and output has the same batch_size + CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + const auto out_grad_mat = in_seq.matrix(); + const auto seq_vec = in_seq.getSequenceId().vector(); + auto in_grad_mat = out_seq.matrix(); + + ContextProjectionBackwardData( + out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_); + } + + private: + size_t context_length_; + int context_start_; +}; + +/** + * Context Projection Backward Weight Function + * Update weight grad by backprop + * input: sequence of output layer grad + * output: weight grad + * + * \param outputs[0] weight grad, pad * d + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vecotr start positon sequence, n * 1 + */ +template +class ContextProjectionBackwardWeightFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + context_length_ = config.get("context_length"); + context_start_ = config.get("context_start"); + begin_pad_ = config.get("begin_pad"); + total_pad_ = config.get("total_pad"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here"; + const auto in_seq = dynamic_cast(inputs[0]); + CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data()); + CHECK_EQ(outputs[0].shape().ndims(), 2UL); + CHECK_EQ(in_seq.shape().ndims(), 2UL); + CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL); + CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]); + /// output layer grad dim == weight dim * context_length_ + CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + + const auto seq_vec = 
in_seq.getSequenceId().vector(); + const auto out_grad_mat = in_seq.matrix(); + auto w_grad_mat = outputs[0].matrix(); + ContextProjectionBackwardWeight(out_grad_mat, + w_grad_mat, + seq_vec, + context_length_, + context_start_, + total_pad_, + begin_pad_); + } + + private: + size_t context_length_; + int context_start_; + size_t begin_pad_; + size_t total_pad_; +}; + +REGISTER_TYPED_FUNC(ContextProjectionForward, + CPU, + ContextProjectionForwardFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackward, + CPU, + ContextProjectionBackwardFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(ContextProjectionForward, + GPU, + ContextProjectionForwardFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackward, + GPU, + ContextProjectionBackwardFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackwardData, + GPU, + ContextProjectionBackwardDataFunc); +REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, + GPU, + ContextProjectionBackwardWeightFunc); +#endif +} // namespace paddle diff --git a/paddle/function/ContextProjectionOp.h b/paddle/legacy/function/ContextProjectionOp.h similarity index 100% rename from paddle/function/ContextProjectionOp.h rename to paddle/legacy/function/ContextProjectionOp.h diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/legacy/function/ContextProjectionOpGpu.cu similarity index 100% rename from paddle/function/ContextProjectionOpGpu.cu rename to paddle/legacy/function/ContextProjectionOpGpu.cu diff --git a/paddle/legacy/function/ContextProjectionOpTest.cpp b/paddle/legacy/function/ContextProjectionOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3b0a34567fe17b466de6186e537243fe8166a77a --- /dev/null +++ b/paddle/legacy/function/ContextProjectionOpTest.cpp @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "FunctionTest.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +void testMatrixProjectionForward(int context_start, + size_t context_length, + bool is_padding, + size_t batch_size, + size_t input_dim) { + size_t pad = std::max(0, -context_start) + + std::max(0, (int)(context_start + context_length - 1)); + if (pad == 0) is_padding = false; + + CpuGpuFuncCompare test( + "ContextProjectionForward", + FuncConfig() + .set("context_length", context_length) + .set("context_start", context_start) + .set("begin_pad", (size_t)std::max(0, -context_start))); + + // prepare input arguments + test.addSequence(SequenceIdArg(TensorShape{batch_size})); + test.addInputs( + SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim})); + if (is_padding) { // weight + test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim})); + } + test.addOutputs( + SequenceArg(VALUE_TYPE_FLOAT, + TensorShape{batch_size, input_dim * context_length}), + ADD_TO); + + // run Function + test.run(); +} + +void testMatrixProjectionBackward(int context_start, + size_t context_length, + bool is_padding, + size_t batch_size, + size_t input_dim) { + size_t pad = std::max(0, -context_start) + + std::max(0, (int)(context_start + context_length - 1)); + if (pad == 0) is_padding = false; + + CpuGpuFuncCompare test( + "ContextProjectionBackward", + FuncConfig() + .set("context_length", context_length) + .set("context_start", context_start) + .set("begin_pad", (size_t)std::max(0, -context_start)) + .set("is_padding", is_padding) + .set("total_pad", pad)); + + // prepare input arguments + test.addSequence(SequenceIdArg(TensorShape{batch_size})); + test.addInputs(SequenceArg( + VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length})); + test.addOutputs( + SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}), + ADD_TO); + if (is_padding) { // weight + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}), + ADD_TO); + } + + // run Function + test.run(); +} + +TEST(ContextProjection, Projection) { + for (auto context_start : {-5, -3, -1, 0, 3}) { + for (auto context_length : {1, 2, 5, 7}) { + for (auto trainable_padding : {false, true}) { + for (auto batch_size : {1, 2, 5, 20, 100}) { + for (auto input_dim : {15, 32, 63, 128, 200}) { + VLOG(3) << " context_start=" << context_start + << " context_length=" << context_length + << " trainable_padding=" << trainable_padding + << " batch_size=" << batch_size + << " input_dim=" << input_dim; + testMatrixProjectionForward(context_start, + context_length, + trainable_padding, + batch_size, + input_dim); + testMatrixProjectionBackward(context_start, + context_length, + trainable_padding, + batch_size, + input_dim); + } + } + } + } + } +} diff --git a/paddle/function/ConvOp.h b/paddle/legacy/function/ConvOp.h similarity index 100% rename from paddle/function/ConvOp.h rename to paddle/legacy/function/ConvOp.h diff --git a/paddle/function/ConvOpTest.h b/paddle/legacy/function/ConvOpTest.h similarity index 100% rename from paddle/function/ConvOpTest.h rename to paddle/legacy/function/ConvOpTest.h diff --git a/paddle/legacy/function/CosSimOp.cpp b/paddle/legacy/function/CosSimOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d04f4396caade803aa846fa81388f95a194845e6 --- /dev/null +++ b/paddle/legacy/function/CosSimOp.cpp @@ -0,0 +1,240 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CosSimOp.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { +/** + * Cosine Similarity for CpuMatrix + * + * \param out_mat, output value, size: nSamples * 1. + * \param in1_mat, input value 1, size: nSamples * dim. + * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples). + * \param scale, default 1.0 + * + */ +template <> +void CosSimForward(CpuMatrix& out_mat, + const CpuMatrix& in1_mat, + const CpuMatrix& in2_mat, + real scale) { + CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData()); + size_t num_samples = out_mat.getHeight(); + size_t dim = in1_mat.getWidth(); + /// column vector [nSamples, 1] + real* out = out_mat.getData(); + const real* x = in1_mat.getData(); + const real* y = in2_mat.getData(); + + /// in2 might only have one row or full rows + CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples); + size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim; + for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) { + real square_sum_x = 0; + real square_sum_y = 0; + real xy = 0; + for (size_t j = 0; j < dim; ++j) { + square_sum_x += x[j] * x[j]; + square_sum_y += y[j] * y[j]; + xy += x[j] * y[j]; + } + CHECK(square_sum_x > 0 && square_sum_y > 0); + out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); + } +} + +/** + * Cosine Similarity + * for each row i, + * out[i] = scale * cos(input1[i], input2[i]) + * = scale * /sqrt(|input1[i]|^2 * |input2[i]|^2) + * when input2 only has one row, then for each row i, + * out[i] = cos(input1[i], input2[0]) + * + * \param inputs[0] input matrix 1, size: nSamples * dim. + * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples). + * \param outputs[0] output matrix, size : nSamples * 1. + */ + +template +class CosSimForwardFunc : public FunctionBase { + void init(const FuncConfig& config) override { + scale_ = config.get("scale"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(inputs.size(), 2UL); + CHECK_EQ(outputs.size(), 1UL); + + CHECK_EQ(inputs[0].shape().ndims(), 2UL); + CHECK_EQ(inputs[1].shape().ndims(), 2UL); + CHECK_EQ(outputs[0].shape().ndims(), 2UL); + + CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); + CHECK_EQ(outputs[0].shape()[1], 1UL); + + CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data()); + + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + auto out_mat = outputs[0].matrix(); + const auto in1_mat = inputs[0].matrix(); + const auto in2_mat = inputs[1].matrix(); + + CosSimForward(out_mat, in1_mat, in2_mat, scale_); + } + + private: + real scale_; +}; + +/** + * Cosine Similarity Derivative for CpuMatrix + * + * \param in1_grad forward input grad 1, size: nSamples * dim. + * \param in2_grad forward input grad 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). 
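Spelled out, the per-row quantity computed by CosSimForward above is out[i] = scale * &lt;x_i, y_i&gt; / (sqrt(|x_i|^2) * sqrt(|y_i|^2)), with the second operand broadcast when in2_mat has a single row (inc == 0). A minimal standalone sketch of that inner loop, with illustrative names rather than the Paddle API:

#include <cmath>
#include <cstddef>

// Cosine similarity of one pair of rows; mirrors the inner loop of
// CosSimForward above (illustrative helper, not part of this change).
float cosSimRow(const float* x, const float* y, std::size_t dim, float scale) {
  float xx = 0.f, yy = 0.f, xy = 0.f;
  for (std::size_t j = 0; j < dim; ++j) {
    xx += x[j] * x[j];  // |x|^2
    yy += y[j] * y[j];  // |y|^2
    xy += x[j] * y[j];  // <x, y>
  }
  // The real kernel CHECKs that both squared norms are positive first.
  return scale * xy / (std::sqrt(xx) * std::sqrt(yy));
}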
+ * + * \param out_grad backward loss output grad, size : nSamples * 1. + * \param out_val forward output value, size: nSamples * 1. + * \param in1_val forward input value 1, size: nSamples * dim. + * \param in2_val forward input value 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). + * \param scale, default 1.0 + */ +template <> +void CosSimBackward(const CpuMatrix& out_grad, + const CpuMatrix& out_val, + const CpuMatrix& in1_val, + const CpuMatrix& in2_val, + CpuMatrix& in1_grad, + CpuMatrix& in2_grad, + real scale) { + CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() && + in2_val.getData() && in1_grad.getData() && in2_grad.getData()); + CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required"; + + const real* grad = out_grad.getData(); + const real* out = out_val.getData(); + const real* prev_out_x = in1_val.getData(); + const real* prev_out_y = in2_val.getData(); + real* prev_grad_x = in1_grad.getData(); + real* prev_grad_y = in2_grad.getData(); + + size_t num_samples = out_grad.getHeight(); + size_t dim = in1_val.getWidth(); + CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight()); + CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples); + size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim; + for (size_t i = 0; i < num_samples; ++i, + prev_out_x += dim, + prev_out_y += inc, + prev_grad_x += dim, + prev_grad_y += inc) { + real square_sum_x = 0; + real square_sum_y = 0; + real xy = 0; + for (size_t j = 0; j < dim; ++j) { + square_sum_x += prev_out_x[j] * prev_out_x[j]; + square_sum_y += prev_out_y[j] * prev_out_y[j]; + xy += prev_out_x[j] * prev_out_y[j]; + } + CHECK(square_sum_x > 0 && square_sum_y > 0); + if (xy == 0) { + real reciprocal = + 1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y)); + for (size_t j = 0; j < dim; ++j) { + prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal; + prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal; + } + } else { + real reciprocal_xy = 1.0f / xy; + real reciprocal_square_sum_x = 1.0f / square_sum_x; + real reciprocal_square_sum_y = 1.0f / square_sum_y; + for (size_t j = 0; j < dim; ++j) { + prev_grad_x[j] += + out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy - + prev_out_x[j] * reciprocal_square_sum_x); + prev_grad_y[j] += + out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy - + prev_out_y[j] * reciprocal_square_sum_y); + } + } + } +} + +/** + * Cosine Similarity backward Derivative + * + * \param outputs[0] forward input grad 1, size: nSamples * dim. + * \param outputs[1] forward input grad 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). + * + * \param inputs[0] backward loss output grad, size : nSamples * 1. + * \param inputs[1] forward output value, size: nSamples * 1. + * \param inputs[2] forward input value 1, size: nSamples * dim. + * \param inputs[3] forward input value 2, + * size: n2 * dim (n2 == 1 or n2 == nSamples). 
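The update above applies the analytic derivative d out / d x_j = scale * y_j / (|x| |y|) - out * x_j / |x|^2, factored as out * (y_j / &lt;x,y&gt; - x_j / |x|^2) when &lt;x,y&gt; is nonzero and scaled by the upstream gradient grad[i]; the &lt;x,y&gt; == 0 branch keeps only the first term. A small, self-contained finite-difference check of that derivative (a hypothetical snippet, not part of the Paddle test suite):

#include <cmath>
#include <cstdio>

// Compares the analytic gradient w.r.t. x[0] against a central difference.
int main() {
  const int dim = 3;
  double x[dim] = {0.5, -1.0, 2.0};
  double y[dim] = {1.0, 0.25, -0.5};
  const double scale = 1.0, eps = 1e-6;

  auto cosSim = [&](const double* a, const double* b) {
    double aa = 0, bb = 0, ab = 0;
    for (int j = 0; j < dim; ++j) {
      aa += a[j] * a[j];
      bb += b[j] * b[j];
      ab += a[j] * b[j];
    }
    return scale * ab / (std::sqrt(aa) * std::sqrt(bb));
  };

  double xx = 0, yy = 0, xy = 0;
  for (int j = 0; j < dim; ++j) {
    xx += x[j] * x[j];
    yy += y[j] * y[j];
    xy += x[j] * y[j];
  }
  const double out = scale * xy / (std::sqrt(xx) * std::sqrt(yy));
  // Analytic: out * (y[0]/<x,y> - x[0]/|x|^2), the same factoring as above.
  const double analytic = out * (y[0] / xy - x[0] / xx);

  const double x0 = x[0];
  x[0] = x0 + eps;
  const double fp = cosSim(x, y);
  x[0] = x0 - eps;
  const double fm = cosSim(x, y);
  std::printf("analytic %.6f  numeric %.6f\n", analytic, (fp - fm) / (2 * eps));
  return 0;
}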
+ */ +template +class CosSimBackwardFunc : public FunctionBase { + void init(const FuncConfig& config) override { + scale_ = config.get("scale"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(inputs.size(), 4UL); + CHECK_EQ(outputs.size(), 2UL); + /// dim of out_grad and out_val == 1, column vector + CHECK_EQ(inputs[0].shape()[1], 1UL); + CHECK_EQ(inputs[1].shape()[1], 1UL); + /// nSamples of out_grad == out_val == in_val1 == in_grad1 + CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]); + CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]); + CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]); + /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2 + CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]); + CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]); + CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]); + + CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() && + inputs[3].data() && outputs[0].data() && outputs[1].data()); + + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + CHECK_EQ(outputs[1].getArgType(), ADD_TO); + + const auto out_grad = inputs[0].matrix(); + const auto out_val = inputs[1].matrix(); + const auto in1_val = inputs[2].matrix(); + const auto in2_val = inputs[3].matrix(); + auto in1_grad = outputs[0].matrix(); + auto in2_grad = outputs[1].matrix(); + + CosSimBackward( + out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_); + } + + private: + real scale_; +}; + +REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc); +REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc); +REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc); +#endif +} // namespace paddle diff --git a/paddle/function/CosSimOp.h b/paddle/legacy/function/CosSimOp.h similarity index 100% rename from paddle/function/CosSimOp.h rename to paddle/legacy/function/CosSimOp.h diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/legacy/function/CosSimOpGpu.cu similarity index 100% rename from paddle/function/CosSimOpGpu.cu rename to paddle/legacy/function/CosSimOpGpu.cu diff --git a/paddle/legacy/function/CosSimOpTest.cpp b/paddle/legacy/function/CosSimOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31bb43e1baa9a6d890d1b8fe2abf15a07a7094c6 --- /dev/null +++ b/paddle/legacy/function/CosSimOpTest.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "FunctionTest.h" +#include "paddle/legacy/math/Matrix.h" + +using namespace paddle; // NOLINT + +void testCosSimForward(size_t height_x, + size_t height_y, + size_t width, + real scale) { + CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale)); + // prepare input arguments + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}), + ASSIGN_TO); + // run Function + test.run(); +} + +void testCosSimBackward(size_t height_x, + size_t height_y, + size_t width, + real scale) { + CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale)); + // prepare input arguments + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}), + ADD_TO); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}), + ADD_TO); + // run Function + test.run(); +} + +TEST(Matrix, cosSim) { + for (auto height_x : {10, 100, 1000}) { + for (auto height_y : {1, height_x}) { + for (auto width : {10, 100, 1000}) { + for (auto scale : {1.0, 2.0}) { + testCosSimForward(height_x, height_y, width, scale); + testCosSimBackward(height_x, height_y, width, scale); + } + } + } + } +} diff --git a/paddle/legacy/function/CropOp.cpp b/paddle/legacy/function/CropOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e22678822f06a323d1e6c17dce63d44d143484a3 --- /dev/null +++ b/paddle/legacy/function/CropOp.cpp @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "CropOp.h" +#include "paddle/legacy/function/TensorShape.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +template <> +void Crop(real* outputs, + const real* inputs, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { + std::vector crop_corner = + conf.get>("crop_corner"); + int cCrop = crop_corner[1]; + int hCrop = crop_corner[2]; + int wCrop = crop_corner[3]; + + int num = inShape[0]; + int inC = inShape[1]; + int inH = inShape[2]; + int inW = inShape[3]; + + int outC = outShape[1]; + int outH = outShape[2]; + int outW = outShape[3]; + + for (int n = 0; n < num; n++) { + for (int c = 0; c < outC; c++) { + for (int h = 0; h < outH; h++) { + int outoff = ((n * outC + c) * outH + h) * outW; + int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop; + memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real)); + } + } + } +} + +template <> +void CropGrad(const real* inGrad, + real* outGrad, + const TensorShape inShape, + const TensorShape outShape, + const FuncConfig& conf) { + std::vector crop_corner = + conf.get>("crop_corner"); + int cCrop = crop_corner[1]; + int hCrop = crop_corner[2]; + int wCrop = crop_corner[3]; + + int num = outShape[0]; + int outC = outShape[1]; + int outH = outShape[2]; + int outW = outShape[3]; + + int inC = inShape[1]; + int inH = inShape[2]; + int inW = inShape[3]; + + for (int n = 0; n < num; n++) { + for (int c = 0; c < inC; c++) { + for (int h = 0; h < inH; h++) { + int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop; + int inoff = ((n * inC + c) * inH + h) * inW; + CpuVector inG = CpuVector(inW, const_cast(inGrad + inoff)); + CpuVector outG = CpuVector(inW, outGrad + outoff); + outG += inG; + } + } + } +} + +/** + * \brief Crop input according to the specify corner and shape. + * The input and output is a 4D tensor. In CropFunc, we only + * crop the 2nd to 4th dimension. + * + * Argument in this Function: + * \param pad_ A struct object contains the cropping corner and shape. + * \param inputs A 4D tensor, only one input. + * \param outputs A 4D tensor, the output value after cropping. + * + * For example, + * Input(2,2,2,3) = [ + * [ [[1,2,3], [3,4,5]], + * [[2,3,5], [1,6,7]] ], + * [ [[4,3,1], [1,8,7]], + * [[3,8,9], [2,3,5]] ] + * ] # the input shape is (2,2,2,3) + * + * pad_: if corner = (0,1,1) and crop_shape = (2,1,2) + * Output(2,2,1,2) = [ + * [ [[4,5]], + * [[6,7]] ], + * [ [[8,7]], + * [[3,5]] ] + * ] # the input shape is (2,2,2,3) + */ +template +class CropFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + TensorShape inShape = inputs[0].shape(); + TensorShape outShape = outputs[0].shape(); + + Crop(outputs[0].data(), + inputs[0].data(), + inShape, + outShape, + conf_); + } + + private: + FuncConfig conf_; +}; + +/** + * \brief The backward propagation of cropping Function. + * + * Argument in this Function: + * \param crop_ The same meaning as it in CropFunc. + * \param inputs The gradient with respect to the output value of CropFunc. + * \param outputs The gradient with respect to the input value of CropFunc. 
+ */ + +template +class CropGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + + TensorShape outShape = outputs[0].shape(); + TensorShape inShape = inputs[0].shape(); + + CropGrad(inputs[0].data(), + outputs[0].data(), + inShape, + outShape, + conf_); + } + + private: + FuncConfig conf_; +}; + +REGISTER_TYPED_FUNC(Crop, CPU, CropFunc); +REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(Crop, GPU, CropFunc); +REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/CropOp.h b/paddle/legacy/function/CropOp.h similarity index 100% rename from paddle/function/CropOp.h rename to paddle/legacy/function/CropOp.h diff --git a/paddle/function/CropOpGpu.cu b/paddle/legacy/function/CropOpGpu.cu similarity index 100% rename from paddle/function/CropOpGpu.cu rename to paddle/legacy/function/CropOpGpu.cu diff --git a/paddle/function/CropOpTest.cpp b/paddle/legacy/function/CropOpTest.cpp similarity index 100% rename from paddle/function/CropOpTest.cpp rename to paddle/legacy/function/CropOpTest.cpp diff --git a/paddle/legacy/function/CrossMapNormalOp.cpp b/paddle/legacy/function/CrossMapNormalOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f28703af00fa4bd7bebd98839cb077798083b61f --- /dev/null +++ b/paddle/legacy/function/CrossMapNormalOp.cpp @@ -0,0 +1,344 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "CrossMapNormalOp.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +template <> +void CrossMapNormal(real* outputs, + real* denoms, + const real* inputs, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow) { + size_t oneImage = height * width; + size_t oneSample = channels * oneImage; + + CpuVector outputsV(numSamples * oneSample, outputs); + CpuVector inputsV(numSamples * oneSample, const_cast(inputs)); + CpuVector denomsV(numSamples * oneSample, denoms); + + // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow) + // x represents inputs + // f(x) represents outputs + // denoms save the intermediate result for backward + denomsV = denomsV.constant(1.0); + const int start = -((int)size - 1) / 2; + const int end = (int)size + start; + for (size_t i = 0; i < numSamples; i++) { + real* oneDenom = denoms + i * oneSample; + real* oneInput = const_cast(inputs) + i * oneSample; + for (int c = 0; c < (int)channels; c++) { + CpuVector denom(oneImage, oneDenom + c * oneImage); + for (int s = start; s < end; s++) { + if (c + s >= 0 && c + s < (int)channels) { + CpuVector input(oneImage, oneInput + (c + s) * oneImage); + denom += input.square() * scale; + } + } + } + } + + outputsV = inputsV * denomsV.pow(-pow); +} + +template <> +void CrossMapNormalGrad(real* inputsGrad, + const real* inputsValue, + const real* outputsValue, + const real* outputsGrad, + const real* denoms, + size_t numSamples, + size_t channels, + size_t height, + size_t width, + size_t size, + real scale, + real pow) { + size_t oneSample = channels * height * width; + std::function oneImage = [=](real* data, + size_t offset) { + return CpuVector(height * width, data + offset); + }; + + const int start = -((int)size) / 2; + const int end = (int)size + start; + const real ratio = -(real)2 * scale * pow; + for (size_t i = 0; i < numSamples; i++) { + size_t sOffset = i * oneSample; + real* oneInputGrad = inputsGrad + sOffset; + real* oneInputValue = const_cast(inputsValue) + sOffset; + real* oneDenom = const_cast(denoms) + sOffset; + real* oneOutputGrad = const_cast(outputsGrad) + sOffset; + real* oneOutputValue = const_cast(outputsValue) + sOffset; + + for (int c = 0; c < (int)channels; c++) { + size_t cOffset = c * height * width; + CpuVector inputGrad = oneImage(oneInputGrad, cOffset); + CpuVector inputValue = oneImage(oneInputValue, cOffset); + CpuVector denom = oneImage(oneDenom, cOffset); + CpuVector outputGrad = oneImage(oneOutputGrad, cOffset); + + inputGrad = inputGrad + denom.pow(-pow) * outputGrad; + for (int s = start; s < end; s++) { + if (c + s >= 0 && c + s < (int)channels) { + size_t offset = (c + s) * height * width; + CpuVector output = oneImage(oneOutputValue, offset); + CpuVector outputGrad = oneImage(oneOutputGrad, offset); + CpuVector denom = oneImage(oneDenom, offset); + + inputGrad += ((outputGrad * output * ratio) / denom) * inputValue; + } + } + } + } +} + +/** + * \brief Normalization with across maps. + * + * This Function comes from the paper + * "ImageNet Classification with Deep Convolutional Neural Networks". + * + * The original formula is: + * + * Input(i, x, y) + * Output(i, x, y) = ---------------------------------------------- + * -- upper + * (k + alpha * > (Input(j, x, y))^2) ^ (beta) + * -- j = lower + * + * upper is `min(C, c + N/2)` + * lower if `max(0, c - N/2)` + * + * Function implementation: + * + * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4. 
+ * And the meaning of each dimension(0-3) is respectively batch size, + * feature maps, rows and columns. + * + * Input and Output in the above formula is for each map(i) of one image, and + * Input(i, x, y), Output(i, x, y) represents an element in an image. + * + * C is the number of feature maps of one image, and N is a hyper-parameters + * is configured when Function is initialized. The sum in the denominator + * is the sum of the same position in the neighboring maps. + * + * In the implementation of Function, k is equal to 1, + * so Function has no argument for k. + * + * Function Arguments: + * + * \param size_ represent N + * \param scale_ represent alpha + * \param pow_ represent beta + * \param inputs[0] represent Input + * \param outputs[0] represent Output + * \param outputs[1] represent The denominator in the formula(except beta) + * + * Note: + * Save output[1] is to simplify the backward calculation. + * TODO, if only consider the forward calculation, we can optimize to + * remove the output[1]. + */ +template +class CrossMapNormalFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + // function arguments + size_ = config.get("size"); + scale_ = config.get("scale"); + pow_ = config.get("pow"); + + // number of inputs and outputs + numInputs_ = 1; + numOutputs_ = 2; + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + check(inputs, outputs); + // ArgType check still on here, + // not sure whether it is better to put inside the check. + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO); + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + CrossMapNormal(outputs[0].data(), + outputs[1].data(), + inputs[0].data(), + batchSize, + maps, + rows, + columns, + size_, + scale_, + pow_); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); + CHECK(inputs[0].shape() == outputs[0].shape()); + CHECK(inputs[0].shape() == outputs[1].shape()); + } + + // Only need the shape of the input, can calculate the + // floating-point operation. + size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ((size_t)numInputs_, inputs.size()); + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + // number of floating-point operations + // an approximate value + size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3); + + return ops; + } + + private: + size_t size_; + real scale_; + real pow_; +}; + +/** + * \brief Backward calculation for normalization with across maps. + * + * Function implementation: + * + * The implementation of this Function is derived from the + * CrossMapNormalFunc implementation. + * + * InputGrad = OutputGrad * denoms ^ (-beta) + * -- upper + * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue + * -- lower + * + * The data of inputs/outputs format is the same as the forward interface + * and is NCHW. + * + * The upper and lower is the same as forward. The logic of the sum + * is also the same as forward. 
+ * + * Function Arguments: + * + * \param size_ represent N + * \param scale_ represent alpha + * \param pow_ represent beta + * \param inputs[0] represent InputValue, inputs[0] of CrossMapNormalFunc + * \param inputs[1] represent OutputValue, outputs[0] of CrossMapNormalFunc + * \param inputs[2] represent OutputGrad + * \param inputs[3] represent denoms, outputs[1] of CrossMapNormalFunc + * This is the intermediate result that is + * preserved in the forward calculation. + * \param outputs[0] represent InputGrad + */ +template +class CrossMapNormalGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + // function arguments + size_ = config.get("size"); + scale_ = config.get("scale"); + pow_ = config.get("pow"); + + // number of inputs and outputs + numInputs_ = 4; + numOutputs_ = 1; + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + check(inputs, outputs); + if (outputs[0].getArgType() != ADD_TO) { + // Currently, some algorithm implementations are ASSIGN_TO mode, + // if need to support the ADD_TO calculation, need to clear the output. + typename Tensor::Vector tmp( + outputs[0].shape().getElements(), outputs[0].data()); + tmp.zero(); + } + + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + CrossMapNormalGrad(outputs[0].data(), + inputs[0].data(), + inputs[1].data(), + inputs[2].data(), + inputs[3].data(), + batchSize, + maps, + rows, + columns, + size_, + scale_, + pow_); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); + CHECK(inputs[0].shape() == inputs[1].shape()); + CHECK(inputs[0].shape() == inputs[2].shape()); + CHECK(inputs[0].shape() == inputs[3].shape()); + CHECK(inputs[0].shape() == outputs[0].shape()); + } + + // Only need the shape of one input, can calculate the + // floating-point operation. 
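As a sense check of the ops() estimates in this file: for a batch of 32 inputs with 64 maps of 56 x 56, the forward count above is 32 * 64 * 56 * 56 * (size_ * 2 + 3), roughly 8.3e7 floating-point operations for size_ = 5, and the backward estimate below (size_ * 4 + 2 per element) is roughly 1.4e8. Both are approximations driven only by the input shape, which is all this interface requires.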
+ size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_LT((size_t)1, inputs.size()); + size_t batchSize = inputs[0].shape()[0]; + size_t maps = inputs[0].shape()[1]; + size_t rows = inputs[0].shape()[2]; + size_t columns = inputs[0].shape()[3]; + + // number of floating-point operations + // an approximate value + size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2); + + return ops; + } + + private: + size_t size_; + real scale_; + real pow_; +}; + +REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc); +REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc); +REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/CrossMapNormalOp.h b/paddle/legacy/function/CrossMapNormalOp.h similarity index 100% rename from paddle/function/CrossMapNormalOp.h rename to paddle/legacy/function/CrossMapNormalOp.h diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/legacy/function/CrossMapNormalOpGpu.cu similarity index 100% rename from paddle/function/CrossMapNormalOpGpu.cu rename to paddle/legacy/function/CrossMapNormalOpGpu.cu diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/legacy/function/CrossMapNormalOpTest.cpp similarity index 100% rename from paddle/function/CrossMapNormalOpTest.cpp rename to paddle/legacy/function/CrossMapNormalOpTest.cpp diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/legacy/function/DepthwiseConvOp.cpp similarity index 100% rename from paddle/function/DepthwiseConvOp.cpp rename to paddle/legacy/function/DepthwiseConvOp.cpp diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/legacy/function/DepthwiseConvOp.h similarity index 100% rename from paddle/function/DepthwiseConvOp.h rename to paddle/legacy/function/DepthwiseConvOp.h diff --git a/paddle/legacy/function/DepthwiseConvOpGpu.cu b/paddle/legacy/function/DepthwiseConvOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..17138cc56390d0fcfb15d4b77a56eda466bcfd3c --- /dev/null +++ b/paddle/legacy/function/DepthwiseConvOpGpu.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "DepthwiseConvOp.h" +#include "paddle/legacy/math/BaseMatrix.h" + +namespace paddle { + +// CUDA kernel to compute the depthwise convolution forward pass +template +__global__ void ConvolutionDepthwiseForward(const int nthreads, + const T* const inputData, + const T* const filterData, + const int batchSize, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const outputData) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if (index < nthreads) { + const int batch = index / outputChannels / outputHeight / outputWidth; + const int c_out = (index / outputHeight / outputWidth) % outputChannels; + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + + const int c_in = c_out / filterMultiplier; + const T* weight = filterData + c_out * filterHeight * filterWidth; + T value = 0; + const int h_in_start = -paddingH + h_out * strideH; + const int w_in_start = -paddingW + w_out * strideW; + const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1; + const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1; + if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) && + (w_in_end < inputWidth)) { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + ++weight; + } + } + } else { + for (int kh = 0; kh < filterHeight; ++kh) { + for (int kw = 0; kw < filterWidth; ++kw) { + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * + inputWidth + + w_in; + value += (*weight) * inputData[offset]; + } + ++weight; + } + } + } + outputData[index] = value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t input. +template +__global__ void ConvolutionDepthwiseInputBackward(const int nthreads, + const T* const top_diff, + const T* const weight_data, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const bottom_diff) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int batch = index / inputChannels / inputHeight / inputWidth; + const int c_in = (index / inputHeight / inputWidth) % inputChannels; + const int h_in = (index / inputWidth) % inputHeight; + const int w_in = index % inputWidth; + + const int c_out_start = c_in * filterMultiplier; + + int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH; + h_out_start = 0 > h_out_start ? 
0 : h_out_start; + int h_out_end = (h_in + paddingH) / strideH; + h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end; + int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW; + w_out_start = 0 > w_out_start ? 0 : w_out_start; + int w_out_end = (w_in + paddingW) / strideW; + w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; + c_out++) { + for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { + const int filter_h = h_in + paddingH - h_out * strideH; + for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { + const int filter_w = w_in + paddingW - w_out * strideW; + const int filter_offset = c_out * filterHeight * filterWidth + + filter_h * filterWidth + filter_w; + const int top_diff_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + value += top_diff[top_diff_offset] * weight_data[filter_offset]; + } + } + } + bottom_diff[index] += value; + } +} + +// CUDA kernel to compute the depthwise convolution backprop w.r.t filter. +template +__global__ void ConvolutionDepthwiseFilterBackward(const int num_i, + const int nthreads, + const T* const top_diff, + const T* const inputData, + const int num, + const int outputChannels, + const int outputHeight, + const int outputWidth, + const int inputChannels, + const int inputHeight, + const int inputWidth, + const int filterMultiplier, + const int filterHeight, + const int filterWidth, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + T* const buffer_data) { + int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + if (index < nthreads) { + const int h_out = (index / outputWidth) % outputHeight; + const int w_out = index % outputWidth; + const int kh = + (index / filterWidth / outputHeight / outputWidth) % filterHeight; + const int kw = (index / outputHeight / outputWidth) % filterWidth; + const int h_in = -paddingH + h_out * strideH + kh; + const int w_in = -paddingW + w_out * strideW + kw; + if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) && + (w_in < inputWidth)) { + const int c_out = + index / (filterHeight * filterWidth * outputHeight * outputWidth); + const int c_in = c_out / filterMultiplier; + const int batch = num_i; + const int top_offset = + ((batch * outputChannels + c_out) * outputHeight + h_out) * + outputWidth + + w_out; + const int bottom_offset = + ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + + w_in; + buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset]; + } else { + buffer_data[index] = 0; + } + } +} + +template +class DepthwiseConvFunctor { + public: + void operator()(const T* inputData, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* outputData) { + int outputSize = batchSize * outputChannels * outputHeight * outputWidth; + + size_t blocks = (outputSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseForward<<>>( + outputSize, + inputData, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + 
filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + outputData); + } +}; + +template +class DepthwiseConvGradInputFunctor { + public: + void operator()(const T* outputGrad, + const T* filterData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* inputGrad) { + int inputSize = batchSize * inputChannels * inputHeight * inputWidth; + + size_t blocks = (inputSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + + ConvolutionDepthwiseInputBackward + // NOLINT_NEXT_LINE(whitespace/operators) + <<>>(inputSize, + outputGrad, + filterData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + inputGrad); + } +}; + +template +class DepthwiseConvGradFilterFunctor { + public: + void operator()(const T* outputGrad, + const T* inputData, + int batchSize, + int outputChannels, + int outputHeight, + int outputWidth, + int inputChannels, + int inputHeight, + int inputWidth, + int filterMultiplier, + int filterHeight, + int filterWidth, + int strideH, + int strideW, + int paddingH, + int paddingW, + T* colData, + T* filterGrad) { + int colDataSize = outputChannels * filterHeight * filterWidth * + outputHeight * outputWidth; + + size_t blocks = (colDataSize + 1024 - 1) / 1024; + size_t blockX = 512; + size_t blockY = (blocks + 512 - 1) / 512; + dim3 threads(1024, 1); + dim3 grid(blockX, blockY); + BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth, + 1, + filterGrad, + false, + true); + + for (int i = 0; i < batchSize; i++) { + ConvolutionDepthwiseFilterBackward< + T><<>>(i, + colDataSize, + outputGrad, + inputData, + batchSize, + outputChannels, + outputHeight, + outputWidth, + inputChannels, + inputHeight, + inputWidth, + filterMultiplier, + filterHeight, + filterWidth, + strideH, + strideW, + paddingH, + paddingW, + colData); + int K = outputHeight * outputWidth; + int M = colDataSize / K; + + BaseMatrix colMatrix(M, K, colData, false, true); + filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0); + } + } +}; + +#ifdef PADDLE_TYPE_DOUBLE +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +#else +template class DepthwiseConvGradInputFunctor; +template class DepthwiseConvFunctor; +template class DepthwiseConvGradFilterFunctor; +#endif + +} // namespace paddle diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/legacy/function/DepthwiseConvOpTest.cpp similarity index 100% rename from paddle/function/DepthwiseConvOpTest.cpp rename to paddle/legacy/function/DepthwiseConvOpTest.cpp diff --git a/paddle/legacy/function/EigenGemm.cpp b/paddle/legacy/function/EigenGemm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5929c5c68ec818c2307580b06f76c63f04e0db5f --- /dev/null +++ b/paddle/legacy/function/EigenGemm.cpp @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
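All three launch helpers above use the same configuration: blocks = ceil(N / 1024), a grid of dim3(512, ceil(blocks / 512)) with 1024 threads per block, and the flattened thread id recovered as (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, with the excess masked by the index < nthreads guard. For example, a forward pass with batchSize = 2, outputChannels = 32 and a 28 x 28 output has N = 50176 elements, so blocks = 49 and blockY = 1; the 512 x 1 grid launches 524288 threads, only those with index < 50176 write an output, and each surviving index decodes to NCHW coordinates exactly as outputData is laid out.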
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/legacy/function/EigenThreadDevice.h" + +namespace paddle { + +template +struct EigenBlasGemm { + typedef Eigen::TensorMap, + Eigen::Aligned> + EigenMatrix; + + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { + Eigen::array sizeA; + if (transA) { + sizeA[0] = K; + sizeA[1] = M; + CHECK_EQ(M, lda); + } else { + sizeA[0] = M; + sizeA[1] = K; + CHECK_EQ(K, lda); + } + Eigen::array sizeB; + if (transB) { + sizeB[0] = N; + sizeB[1] = K; + CHECK_EQ(K, ldb); + } else { + sizeB[0] = K; + sizeB[1] = N; + CHECK_EQ(N, ldb); + } + Eigen::array sizeC = {{M, ldc}}; + Eigen::array offsetC = {{0, 0}}; + Eigen::array extentC = {{M, N}}; + + const EigenMatrix a(const_cast(A), sizeA); + const EigenMatrix b(const_cast(B), sizeB); + EigenMatrix c(C, sizeC); + + typedef typename Eigen::Tensor::DimensionPair DimPair; + Eigen::array dims; + dims[0] = DimPair(1, 0); + dims[0].first = transA ? 0 : 1; + dims[0].second = transB ? 1 : 0; + + auto* device = EigenDeviceWarpper::device(); + if (N == ldc) { + if (alpha == T(1) && beta == T(0)) { + c.device(*device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.device(*device) += a.contract(b, dims); + } else { + c.device(*device) = alpha * a.contract(b, dims) + beta * c; + } + } else { + if (alpha == T(1) && beta == T(0)) { + c.slice(offsetC, extentC).device(*device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.slice(offsetC, extentC).device(*device) += a.contract(b, dims); + } else { + c.slice(offsetC, extentC).device(*device) = + alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC); + } + } + EigenDeviceWarpper::free_device(device); + } +}; + +#ifdef PADDLE_TYPE_DOUBLE +template struct EigenBlasGemm; +#else +template struct EigenBlasGemm; +#endif + +} // namespace paddle diff --git a/paddle/function/EigenThreadDevice.h b/paddle/legacy/function/EigenThreadDevice.h similarity index 100% rename from paddle/function/EigenThreadDevice.h rename to paddle/legacy/function/EigenThreadDevice.h diff --git a/paddle/function/Function.cpp b/paddle/legacy/function/Function.cpp similarity index 100% rename from paddle/function/Function.cpp rename to paddle/legacy/function/Function.cpp diff --git a/paddle/legacy/function/Function.h b/paddle/legacy/function/Function.h new file mode 100644 index 0000000000000000000000000000000000000000..bc5ef7e6f20b63a120a577ded876820aafecff19 --- /dev/null +++ b/paddle/legacy/function/Function.h @@ -0,0 +1,214 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
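EigenBlasGemm::compute above implements the usual GEMM contract C = alpha * op(A) * op(B) + beta * C, where op() transposes according to transA/transB; the lda/ldb CHECKs only accept tightly packed operands, and the ldc != N case goes through the slice() path so a sub-block of a wider C can be updated in place. As a worked instance: transA = transB = false, M = 2, N = 2, K = 3, lda = 3, ldb = 2, ldc = 2, alpha = 1, beta = 0 passes the checks and simply fills C with C[i][j] = sum over k of A[i][k] * B[k][j]; alpha = 1, beta = 1 takes the accumulate fast path, analogous to the ADD_TO output convention used elsewhere in this directory.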
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "BufferArg.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Any.h" +#include "paddle/legacy/utils/ClassRegistrar.h" +#include "paddle/legacy/utils/Error.h" + +namespace paddle { + +/** + * Function Configuration. + * The argument type of Function::init. + */ +class FuncConfig { + public: + template + T get(const std::string& key, Error* err = nullptr) const { + try { + return any_cast(valueMap_.at(key)); + } catch (std::exception& e) { // could be cast or out of range exception. + if (err) { + *err = Error(e.what()); + } else { + LOG(FATAL) << "Cannot get key " << key << " with error " << e.what(); + } + return T(); + } + } + + template + FuncConfig& set(const std::string& key, T v, Error* err = nullptr) { + auto it = valueMap_.find(key); + if (it != valueMap_.end()) { // already contains key. + if (err) { + *err = Error("Key %s is already set in FuncConfig", key.c_str()); + } else { + LOG(FATAL) << "Key " << key << " is already set in FuncConfig."; + } + return *this; + } + valueMap_[key] = any(v); + return *this; + } + + protected: + mutable std::unordered_map valueMap_; +}; + +/** + * Argument type for Function::calc(). + * A BufferArgs contains a set of BufferArg, + * because Function can have multiple inputs and outputs. + * + * addArg() with Matix object used to adapt Layer Argument. + * Will create a BufferArg object in addArg(), + * and free in destructor of BufferArgs. + * + * addArg() with BufferArg object, just save BufferArg object address, + * and the caller needs to guarantee the validity of the BufferArg object + * in the BufferArgs life time. + */ +class BufferArgs { + public: + BufferArgs() {} + + ~BufferArgs() { + for (auto arg : _args_) { + delete arg; + } + } + + size_t size() const { return args_.size(); } + + // add argument into BufferArgs + // Tensor can be Matrix, Vector, IVector. + // For inputs, do not need argType. + // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO. + void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) { + _args_.push_back(new BufferArg(arg, argType)); + addArg(*_args_.back()); + } + + // Add arg into BufferArgs and reshape the arg. + // + // For example, arg represents an image buffer, + // but Matrix can only represent a two-dimensional Tensor. + // So need an extra argument to describe the shape of the image buffer. 
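A short sketch of the FuncConfig round trip used by the operators in this change (set and get are templates, e.g. set&lt;size_t&gt; / get&lt;real&gt;; assumes the Function.h above is on the include path):

#include "paddle/legacy/function/Function.h"

// Illustrative only: store typed arguments and read them back, mirroring how
// the init() implementations in this directory consume FuncConfig.
void funcConfigExample() {
  using namespace paddle;  // NOLINT
  FuncConfig config;
  config.set("context_length", (size_t)3)
      .set("context_start", -1)
      .set("begin_pad", (size_t)1);

  size_t len = config.get<size_t>("context_length");  // 3
  int start = config.get<int>("context_start");       // -1

  Error err;
  real scale = config.get<real>("scale", &err);  // missing key: err is set,
                                                 // scale is value-initialized
  (void)len;
  (void)start;
  (void)scale;
}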
+ void addArg(const Matrix& arg, + const TensorShape& shape, + ArgType argType = UNSPECIFIED); + + void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); + void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); + + void addArg(const Matrix& matrix, + const IVector& vector, + ArgType argType = UNSPECIFIED); + + // get argument + const BufferArg& operator[](size_t num) const { + CHECK_LT(num, args_.size()); + return *args_[num]; + } + + void addArg(BufferArg& arg) { args_.push_back(&arg); } + + void addArg(SequenceIdArg& arg) { args_.push_back(&arg); } + + void addArg(SequenceArg& arg) { args_.push_back(&arg); } + + void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); } + + private: + std::vector args_; + // The BufferArg object is constructed and freed by BufferArgs. + std::vector _args_; +}; + +/** + * \brief Base class for Function. + * The basic Function implementation requires override init and calc interfaces. + * + * The caller needs to ensure the validity of the arguments + * during Function execution. + * + * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO + * and ADD_TO. + * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation + * result of Function assigned to the output BufferArg. + * If output.getArgType() == ADD_TO, this is add mode, and the calculation + * result of Function need added to the output BufferArg. + * + * For example: + * ASSIGN_TO: output = Function(inputs) + * ADD_TO: output += Function(inputs) + * If Function has more than one output, each output can have different modes. + */ +class FunctionBase { + public: + virtual ~FunctionBase() {} + + virtual void init(const FuncConfig& config) {} + + virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} + + // This member function is used to check whether the BufferType and shape of + // the inputs and outputs arguments of the Function are correct. + // General calc function which will call this check to do arguments check. + // And before the calc called, the caller can also check their own arguments. + virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {} + + // Calculate the number of floating-point operations of this Function. + // The inputs and outputs arguments do not need to contain the actual data, + // only the shape. + // And some Functions have the same input and output shapes, + // so you may not need to enter the complete number of arguments. + // But entering the full arguments is always correct for this interface. + virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) { + return 0; + } + + int getNumInputs() const { return numInputs_; } + + int getNumOutputs() const { return numOutputs_; } + + static ClassRegistrar funcRegistrar_; + + protected: + // numInputs_ and numOutputs_ represents the maximum + // input and output supported by Function. 
+ // Some functions are optimized for input and output, + // so when comparing the number of arguments, for these functions + // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_ + size_t numInputs_; + size_t numOutputs_; +}; + +#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName + +#define REGISTER_TYPED_FUNC(typeName, deviceName, className) \ + static InitFunction __reg_type_##typeName##deviceName([]() { \ + FunctionBase::funcRegistrar_ \ + .registerClass>( \ + FUNC_NAME(typeName, deviceName)); \ + }) + +} // namespace paddle diff --git a/paddle/legacy/function/FunctionTest.cpp b/paddle/legacy/function/FunctionTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1a0993e3135bcad9eb8a431e079ed56a267174ea --- /dev/null +++ b/paddle/legacy/function/FunctionTest.cpp @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +template +void FunctionApi(typename Tensor::Matrix& output, + const typename Tensor::Matrix& input); + +template <> +void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 100U); + EXPECT_EQ(output.getWidth(), 200U); +} + +template <> +void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 10U); + EXPECT_EQ(output.getWidth(), 20U); +} + +template +void Function(const BufferArgs& arguments) { + const auto input = arguments[0].matrix(); + auto output = arguments[1].matrix(); + FunctionApi(output, input); +} + +TEST(Function, BufferArgs) { + CpuMatrix cpuInput = CpuMatrix(100, 200); + CpuMatrix cpuOutput = CpuMatrix(100, 200); + BufferArgs cpuArgments; + cpuArgments.addArg(cpuInput); + cpuArgments.addArg(cpuOutput); + Function(cpuArgments); + + GpuMatrix gpuInput = GpuMatrix(10, 20); + GpuMatrix gpuOutput = GpuMatrix(10, 20); + BufferArgs gpuArgments; + gpuArgments.addArg(gpuInput); + gpuArgments.addArg(gpuOutput); + Function(gpuArgments); +} + +/** + * Some tests case are used to check the consistency between the BufferArg type + * argument received by Function and the original type argument. 
+ * + * Use Case: + * TEST() { + * Matrix matrix(...); + * CheckBufferArg lambda = [=](const BufferArg& arg) { + * // check matrix and arg are equivalent + * EXPECT_EQ(matrix, arg); + * } + * + * BufferArgs argments{matrix...}; + * std::vector checkFunc{lambda...}; + * testBufferArgs(argments, checkFunc); + * } + */ +typedef std::function CheckBufferArg; + +void testBufferArgs(const BufferArgs& inputs, + const std::vector& check) { + EXPECT_EQ(inputs.size(), check.size()); + for (size_t i = 0; i < inputs.size(); i++) { + check[i](inputs[i]); + } +} + +void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) { + EXPECT_EQ(inputs.size(), 1U); + check(inputs[0]); +} + +TEST(Arguments, Matrix) { + MatrixPtr matrix = Matrix::create(100, 200); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 2U); + EXPECT_EQ(arg.shape()[0], 100U); + EXPECT_EQ(arg.shape()[1], 200U); + EXPECT_EQ(arg.data(), matrix->getData()); + + EXPECT_EQ(arg.matrix().getHeight(), matrix->getHeight()); + EXPECT_EQ(arg.matrix().getWidth(), matrix->getWidth()); + EXPECT_EQ(arg.matrix().getData(), matrix->getData()); + }; + + BufferArgs argments; + argments.addArg(*matrix); + std::vector checkFunc; + checkFunc.push_back(check); + testBufferArgs(argments, checkFunc); +} + +TEST(Arguments, Vector) { + VectorPtr vector = Vector::create(100, false); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 1U); + EXPECT_EQ(arg.shape()[0], 100U); + EXPECT_EQ(arg.data(), vector->getData()); + + CpuVector inVector = arg.vector(); + EXPECT_EQ(inVector.getSize(), vector->getSize()); + EXPECT_EQ(inVector.getData(), vector->getData()); + }; + + BufferArgs argments; + argments.addArg(*vector); + std::vector checkFunc; + checkFunc.push_back(check); + testBufferArgs(argments, checkFunc); +} + +TEST(Arguments, CpuSparseMatrix) { + CpuSparseMatrix sparse(200, 300, 50); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 2U); + EXPECT_EQ(arg.shape()[0], 200U); + EXPECT_EQ(arg.shape()[1], 300U); + EXPECT_EQ(arg.data(), sparse.getData()); + // CHECK_EQ(arg.sparse().nnz(), 50); + // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT); + // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE); + EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows()); + EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols()); + }; + + BufferArgs argments; + argments.addArg(sparse); + std::vector checkFunc; + checkFunc.push_back(check); + testBufferArgs(argments, checkFunc); +} + +TEST(Arguments, BufferArg) { + BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3}); + CheckBufferArg check = [=](const BufferArg& arg) { + EXPECT_EQ(arg.shape().ndims(), 3U); + EXPECT_EQ(arg.shape()[0], 1U); + EXPECT_EQ(arg.shape()[1], 2U); + EXPECT_EQ(arg.shape()[2], 3U); + }; + + BufferArgs argments; + argments.addArg(arg); + testBufferArgs(argments, check); +} + +} // namespace paddle diff --git a/paddle/legacy/function/FunctionTest.h b/paddle/legacy/function/FunctionTest.h new file mode 100644 index 0000000000000000000000000000000000000000..6f01981a34bff0a7d9bb04d0a0012117ecf5f803 --- /dev/null +++ b/paddle/legacy/function/FunctionTest.h @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/math/tests/TensorCheck.h" +#include "paddle/testing/TestUtil.h" + +namespace paddle { + +typedef std::shared_ptr BufferArgPtr; + +namespace test { +template +struct Allocator; + +template <> +struct Allocator { + using type = CpuMemoryHandle; +}; + +template <> +struct Allocator { + using type = GpuMemoryHandle; +}; + +// Copy argument1 to argument2 +template +class CopyArgument { + public: + void operator()(const BufferArg& arg1, BufferArg& arg2) { + CHECK_EQ(arg1.valueType(), arg2.valueType()); + CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements()); + + if (arg1.valueType() == VALUE_TYPE_INT32) { + IVectorPtr vector1 = + IVector::create((int*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + IVectorPtr vector2 = + IVector::create((int*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } else { + VectorPtr vector1 = + Vector::create((real*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + VectorPtr vector2 = + Vector::create((real*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } + } +}; +} // namespace test + +/** + * \brief A class for comparing two Functions of different implementations. + * For example, can be used to compare the CPU and GPU implementation + * of the function is consistent. + * + * Use case: + * // Initializes a test object, the corresponding cpu and gpu Function + * // are constructed according to FunctionName and FuncConfig. + * CpuGpuFuncCompare test(FunctionName, FuncConfig); + * // Prepare inputs and outputs arguments. + * // Here the input and output can not contain real data, + * // only contains the argument type and shape. + * test.addInputs(input1); + * test.addInputs(input2); + * test.addOutputs(output1); + * test.addOutputs(output2); + * // Run. + * // Will according to the type and shape of arguments(inputs_/outputs_), + * // automatic initialization cpu and gpu function required arguments + * // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_). + * // Call the CPU and GPU Function calculation results. + * // Compares CPU and GPU calculation results for consistency. 
+ * test.run(); + */ +template +class Compare2Function { + public: + typedef typename test::Allocator::type Allocator1; + typedef typename test::Allocator::type Allocator2; + typedef typename Tensor::Vector Vector1; + typedef typename Tensor::Vector Vector2; + typedef typename Tensor::SparseMatrix SparseMatrix1; + typedef typename Tensor::SparseMatrix SparseMatrix2; + + Compare2Function(const std::string& name1, + const std::string& name2, + const FuncConfig& config) + : function1_(FunctionBase::funcRegistrar_.createByType(name1)), + function2_(FunctionBase::funcRegistrar_.createByType(name2)) { + function1_->init(config); + function2_->init(config); + initArgsCallback_ = nullptr; + } + + ~Compare2Function() {} + + // input need only contains shape, do not contains data. + void addInputs(const BufferArg& input) { + size_t size = + input.shape().getElements() * sizeOfValuType(input.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + func1Inputs_.emplace_back(std::make_shared( + func1Memory_.back()->getBuf(), input.valueType(), input.shape())); + func2Inputs_.emplace_back(std::make_shared( + func2Memory_.back()->getBuf(), input.valueType(), input.shape())); + } + + // assume one copy of sequence is shared by different SequenceArgs + void addSequence(const SequenceIdArg& input) { + CHECK_EQ(input.shape().ndims(), 1UL); + size_t batchSize = input.shape()[0]; + size_t numSeqs = batchSize / 10 + 1; + size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); + func1Memory_.emplace_back(std::make_shared(sizeId)); + func2Memory_.emplace_back(std::make_shared(sizeId)); + seq1_ = std::make_shared(func1Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); + seq2_ = std::make_shared(func2Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); + /// init sequence Id + initArg(*seq1_, batchSize); + + copyArg_(*seq1_, *seq2_); + } + + void addInputs(const SequenceArg& input) { + CHECK_EQ(input.shape().ndims(), 2UL); + size_t batchSize = input.shape()[0]; + if (!seq1_ || !seq2_) { // sequence not exist + addSequence(SequenceIdArg(TensorShape{batchSize})); + } + + size_t size = + input.shape().getElements() * sizeOfValuType(input.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + /// SequenceArg + func1Inputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), + input.valueType(), + input.shape(), + *seq1_)); + func2Inputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), + input.valueType(), + input.shape(), + *seq2_)); + } + + void registerInitCallback(std::function callback) { + initArgsCallback_ = callback; + } + + // output need only contains shape, do not contains data. 
+ void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) { + size_t size = + output.shape().getElements() * sizeOfValuType(output.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + argType)); + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + argType)); + } + + /// add and init output sparse matrix + void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) { + sparse1_ = std::make_shared( + output.shape()[0], + output.shape()[1], + output.nnz(), + static_cast(output.dataType()), + static_cast(output.dataFormat())); + + sparse2_ = std::make_shared( + output.shape()[0], + output.shape()[1], + output.nnz(), + static_cast(output.dataType()), + static_cast(output.dataFormat())); + + /// init sparse matrix + hl_stream_t stream(HPPL_STREAM_1); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); + hl_stream_synchronize(stream); + + func1Outputs_.emplace_back( + std::make_shared(*sparse1_, argType)); + func2Outputs_.emplace_back( + std::make_shared(*sparse2_, argType)); + } + + void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) { + CHECK_EQ(output.shape().ndims(), 2UL); + size_t batchSize = output.shape()[0]; + + if (!seq1_ || !seq2_) { // sequence not exist + addSequence(SequenceIdArg(TensorShape{batchSize})); + } + size_t size = + output.shape().getElements() * sizeOfValuType(output.valueType()); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); + + /// SequenceArg + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + *seq1_, + argType)); + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), + output.valueType(), + output.shape(), + *seq2_, + argType)); + } + + void addInputs(const SparseMatrixArg& input) { + sparse1_ = std::make_shared( + input.shape()[0], + input.shape()[1], + input.nnz(), + static_cast(input.dataType()), + static_cast(input.dataFormat())); + + sparse2_ = std::make_shared( + input.shape()[0], + input.shape()[1], + input.nnz(), + static_cast(input.dataType()), + static_cast(input.dataFormat())); + + /// init sparse matrix + hl_stream_t stream(HPPL_STREAM_1); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); + hl_stream_synchronize(stream); + + func1Inputs_.emplace_back(std::make_shared(*sparse1_)); + func2Inputs_.emplace_back(std::make_shared(*sparse2_)); + } + + void run() { + // prepare cpu/gpu arguments + initInputs(); + + initOutputs(); + // function calculate + auto callFunction = [](FunctionBase* function, + std::vector& inputs, + std::vector& outputs) { + BufferArgs inArgs; + BufferArgs outArgs; + for (auto arg : inputs) { + inArgs.addArg(*arg); + } + for (auto arg : outputs) { + outArgs.addArg(*arg); + } + function->calc(inArgs, outArgs); + }; + + callFunction(function1_.get(), func1Inputs_, func1Outputs_); + callFunction(function2_.get(), func2Inputs_, func2Outputs_); + + // check outputs + compareOutputs(); + } + + std::shared_ptr getFunction1() const { return function1_; } + + std::shared_ptr getFunction2() const { return function2_; } + + protected: + // only init cpu argument, gpu argument copy from cpu argument. 
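+  /**
+   * A minimal usage sketch (hypothetical shapes; it mirrors the real tests
+   * in MulOpTest.cpp later in this change): compare the CPU and GPU "MulOp"
+   * kernels on a dense 100x100 = 100x200 * 200x100 multiplication.
+   *
+   *   CpuGpuFuncCompare test(
+   *       "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
+   *   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{100, 200}));
+   *   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{200, 100}));
+   *   test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{100, 100}),
+   *                   ASSIGN_TO);
+   *   test.run();  // fills the inputs, runs both kernels, compares outputs
+   */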
+ void initArg(BufferArg& arg) { + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); + vector.uniform(0.001, 1); + } + + void initArg(SequenceArg& arg) { + /// init only matrix + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); + vector.uniform(0.001, 1); + } + + void initArg(SequenceIdArg& arg, size_t batchSize) { + size_t numSeqs = arg.numSeqs(); + int* buf = reinterpret_cast(arg.data()); + int pos = 0; + size_t maxLen = 2 * batchSize / numSeqs; + for (int i = 0; i < (int)numSeqs; ++i) { + int len = 1 + uniformRandom(std::min( + maxLen, batchSize - pos - numSeqs + i)); + buf[i] = pos; + pos += len; + VLOG(1) << " len=" << len; + } + buf[numSeqs] = batchSize; + } + + void initInputs() { + for (size_t i = 0; i < func1Inputs_.size(); i++) { + if (func1Inputs_[i]->isSparseArg()) { + continue; /// sparse matrix already init + } + + if (func1Inputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Inputs_[i])); + } else { + initArg(*func1Inputs_[i]); + } + + if (initArgsCallback_ != nullptr) { + initArgsCallback_(*func1Inputs_[i], i); + } + + copyArg_(*func1Inputs_[i], *func2Inputs_[i]); + } + } + + void initOutputs() { + for (size_t i = 0; i < func1Outputs_.size(); i++) { + if (func1Outputs_[i]->isSparseArg()) { + continue; /// sparse matrix already init + } + + if (func1Outputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Outputs_[i])); + } else { + initArg(*func1Outputs_[i]); + } + + copyArg_(*func1Outputs_[i], *func2Outputs_[i]); + } + } + + void compareOutputs() { + for (size_t i = 0; i < func1Outputs_.size(); i++) { + // TODO, Need a BufferCheck used to compare the two buffers. + const auto cpu = func1Outputs_[i]; + const auto gpu = func2Outputs_[i]; + CHECK_EQ(cpu->numElements(), gpu->numElements()); + Vector1 cpuVector(cpu->numElements(), (real*)cpu->data()); + Vector2 gpuVector(gpu->numElements(), (real*)gpu->data()); + autotest::TensorCheckErr(cpuVector, gpuVector); + } + } + + protected: + std::shared_ptr function1_; + std::shared_ptr function2_; + std::vector> func1Memory_; + std::vector> func2Memory_; + std::vector func1Inputs_; + std::vector func1Outputs_; + std::vector func2Inputs_; + std::vector func2Outputs_; + std::shared_ptr sparse1_; + std::shared_ptr sparse2_; + std::shared_ptr seq1_; + std::shared_ptr seq2_; + test::CopyArgument copyArg_; + std::function initArgsCallback_; +}; + +class CpuGpuFuncCompare + : public Compare2Function { + public: + CpuGpuFuncCompare(const std::string& name, const FuncConfig& config) + : Compare2Function(name + "-CPU", name + "-GPU", config) {} + + ~CpuGpuFuncCompare() {} +}; + +} // namespace paddle diff --git a/paddle/legacy/function/GemmConvOp.cpp b/paddle/legacy/function/GemmConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a81315661dc2843a648315ca4a6b590f217a657 --- /dev/null +++ b/paddle/legacy/function/GemmConvOp.cpp @@ -0,0 +1,522 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ConvOp.h" +#include "GemmFunctor.h" +#include "Im2Col.h" +#include "paddle/legacy/math/MemoryHandle.h" + +namespace paddle { + +/* + * \brief Forward calculation of convolution. + */ +template +class GemmConvFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // TODO(hedaoyuan): Need to define some index macros, + // to avoid useing 0 and 1. + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } + + Im2ColFunctor im2col; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW()); + } else { + colData = inputData + g * inputOffset; + } + int M = outputChannels / groups_; + int N = outputHeight * outputWidth; + int K = inputChannels / groups_ * filterHeight * filterWidth; + BlasGemm::compute(false, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + K, + colData, + N, + beta, + outputData + g * outputOffset, + N); + } + inputData += inputChannels * inputHeight * inputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifdef PADDLE_MOBILE_INFERENCE + +/* + * \brief Forward calculation of convolution, optimized for mobile. 
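+ *
+ * Implementation note (a rough summary of the code below): instead of
+ * materializing the full im2col buffer, the input is expanded and multiplied
+ * tile by tile, looping over blocks of input channels (channelSteps) and
+ * blocks of output rows (outputHeightSteps) so that the temporary col buffer
+ * stays within the ~4 MB cap noted in the code; the partial GEMM results are
+ * accumulated into the output tile.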
+ */ +template +class GemmConvMobileFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // TODO(hedaoyuan): Need to define some index macros, + // to avoid useing 0 and 1. + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* inputData = inputs[0].data(); + real* filterData = inputs[1].data(); + real* outputData = outputs[0].data(); + real* colData = NULL; + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape; + + // Max col matrix width 4096, Max col matrix size 4M. + size_t outputHeightSteps = + std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight); + size_t maxColWidth = outputHeightSteps * outputWidth; + size_t channelSteps = + std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth, + (size_t)1), + inputChannels / groups_); + size_t maxColHeight = channelSteps * filterHeight * filterWidth; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(maxColHeight * maxColWidth * sizeof(real)); + colData = reinterpret_cast(memory_->getBuf()); + } + + Im2ColMobileFunctor im2col; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + int nStride = outputHeight * outputWidth; + int kStride = inputChannels / groups_ * filterHeight * filterWidth; + for (size_t i = 0; i < batchSize; i++) { + filterData = inputs[1].data(); + for (size_t g = 0; g < groups_; g++) { + if (needIm2col) { + real beta_ = beta; + for (size_t ic = 0; ic < inputChannels / groups_; + ic += channelSteps) { + int channels = std::min(inputChannels / groups_ - ic, channelSteps); + for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) { + int height = std::min(outputHeight - oh, outputHeightSteps); + + int M = outputChannels / groups_; + int N = height * outputWidth; + int K = channels * filterHeight * filterWidth; + // im2col + im2col(inputData, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW(), + channels, + oh, + height, + N); + + // gemm + BlasGemm::compute( + false, + false, + M, + N, + K, + 1.0f, + filterData + ic * filterHeight * filterWidth, + kStride, + colData, + N, + beta_, + outputData + oh * outputWidth, + nStride); + } + beta_ = 1.0; 
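+          // After the first block of input channels has been written with
+          // the caller's beta, the remaining blocks must accumulate into the
+          // same output tile, so beta_ is switched to 1.0 here.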
+ } + } else { + int M = outputChannels / groups_; + int N = outputHeight * outputWidth; + int K = inputChannels / groups_ * filterHeight * filterWidth; + BlasGemm::compute(false, + false, + M, + N, + K, + 1.0f, + filterData, + K, + inputData, + N, + beta, + outputData, + N); + } + inputData += inputOffset; + outputData += outputOffset; + filterData += filterOffset; + } + } + + memory_.reset(); + } +}; + +#endif + +/* + * \brief Backward input calculation of convolution. + */ +template +class GemmConvGradInputFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& output = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& input = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* filterData = inputs[1].data(); + real* inputGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } + + Col2ImFunctor col2im; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + int K = outputChannels / groups_; + int N = outputHeight * outputWidth; + int M = inputChannels / groups_ * filterHeight * filterWidth; + real scale = 0.0f; + if (!needIm2col) { + colData = inputGrad + g * inputOffset; + scale = 1.0f; + } + BlasGemm::compute(true, + false, + M, + N, + K, + 1.0f, + filterData + g * filterOffset, + M, + outputGrad + g * outputOffset, + N, + scale, + colData, + N); + if (needIm2col) { + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW()); + } + } + inputGrad += inputChannels * inputHeight * inputWidth; + outputGrad += outputChannels * outputHeight * outputWidth; + } + } +}; + +/* + * \brief Backward filter calculation of convolution. 
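+ *
+ * Sketch of the computation below (per sample and per group): with
+ * M = outputChannels / groups, K = outputHeight * outputWidth and
+ * N = inputChannels / groups * filterHeight * filterWidth, the filter
+ * gradient is dW(M x N) = dY(M x K) * col(N x K)^T, where col is the im2col
+ * expansion of the input; results are accumulated over the batch (beta is
+ * 1.0 after the first sample).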
+ */ +template +class GemmConvGradFilterFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + const TensorShape& output = inputs[0].shape(); + const TensorShape& input = inputs[1].shape(); + const TensorShape& filter = outputs[0].shape(); + + real beta; + if (outputs[0].getArgType() == ADD_TO) { + beta = 1.0; + } else { + beta = 0.0; + } + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + real* outputGrad = inputs[0].data(); + real* inputData = inputs[1].data(); + real* filterGrad = outputs[0].data(); + bool needIm2col = isNeedIm2col(filter); + + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + + TensorShape colShape; + real* colData = NULL; + + if (needIm2col) { + colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + resizeBuffer(colShape.getElements()); + colData = reinterpret_cast(memory_->getBuf()); + } + + Im2ColFunctor im2col; + size_t inputOffset = imShape.getElements(); + size_t outputOffset = + (outputChannels / groups_) * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + for (size_t i = 0; i < batchSize; i++) { + for (size_t g = 0; g < groups_; g++) { + if (needIm2col) { + im2col(inputData + g * inputOffset, + imShape, + colData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW(), + dilationH(), + dilationW()); + } else { + colData = inputData + g * inputOffset; + } + int M = outputChannels / groups_; + int K = outputHeight * outputWidth; + int N = inputChannels / groups_ * filterHeight * filterWidth; + BlasGemm::compute(false, + true, + M, + N, + K, + 1.0f, + outputGrad + g * outputOffset, + K, + colData, + K, + i == 0 ? 
beta : 1.0f, + filterGrad + g * filterOffset, + N); + } + inputData += inputChannels * inputHeight * inputWidth; + outputGrad += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifdef PADDLE_MOBILE_INFERENCE +REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction); +#else +REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); +#endif +REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); +REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); +REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction); +REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction); +#endif + +} // namespace paddle diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/legacy/function/GemmConvOpTest.cpp similarity index 100% rename from paddle/function/GemmConvOpTest.cpp rename to paddle/legacy/function/GemmConvOpTest.cpp diff --git a/paddle/legacy/function/GemmFunctor.cpp b/paddle/legacy/function/GemmFunctor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..450293dfeea170e287cfc90226dabad25c76e537 --- /dev/null +++ b/paddle/legacy/function/GemmFunctor.cpp @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "GemmFunctor.h" +#include "paddle/legacy/math/MathFunctions.h" + +namespace paddle { + +template +struct BlasGemm { + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { +#ifdef PADDLE_USE_EIGEN_FOR_BLAS + EigenBlasGemm::compute( + transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +#else + gemm(transA == false ? CblasNoTrans : CblasTrans, + transB == false ? CblasNoTrans : CblasTrans, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +#endif + } +}; + +template +struct BlasGemm { + static void compute(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc) { + hl_matrix_mul((T*)A, + transA == false ? HPPL_OP_N : HPPL_OP_T, + (T*)B, + transB == false ? 
HPPL_OP_N : HPPL_OP_T, + C, + M, + N, + K, + alpha, + beta, + lda, + ldb, + ldc); + } +}; + +template struct BlasGemm; +template struct BlasGemm; + +} // namespace paddle diff --git a/paddle/function/GemmFunctor.h b/paddle/legacy/function/GemmFunctor.h similarity index 100% rename from paddle/function/GemmFunctor.h rename to paddle/legacy/function/GemmFunctor.h diff --git a/paddle/function/GruFunctor.h b/paddle/legacy/function/GruFunctor.h similarity index 100% rename from paddle/function/GruFunctor.h rename to paddle/legacy/function/GruFunctor.h diff --git a/paddle/function/Im2Col.h b/paddle/legacy/function/Im2Col.h similarity index 100% rename from paddle/function/Im2Col.h rename to paddle/legacy/function/Im2Col.h diff --git a/paddle/function/Im2ColOp.cpp b/paddle/legacy/function/Im2ColOp.cpp similarity index 100% rename from paddle/function/Im2ColOp.cpp rename to paddle/legacy/function/Im2ColOp.cpp diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/legacy/function/Im2ColOpGpu.cu similarity index 100% rename from paddle/function/Im2ColOpGpu.cu rename to paddle/legacy/function/Im2ColOpGpu.cu diff --git a/paddle/legacy/function/Im2ColTest.cpp b/paddle/legacy/function/Im2ColTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c5f06f38991497963cfbe1e12825f1bc39dffa6 --- /dev/null +++ b/paddle/legacy/function/Im2ColTest.cpp @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Im2Col.h" +#include +#include "Function.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/tests/TensorCheck.h" + +namespace paddle { + +template +void TestIm2ColFunctor() { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t filterHeight : {1, 5}) { + for (size_t filterWidth : {3, 7}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + for (size_t dilation : {1, 3}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + TensorShape colShape2 = TensorShape({outputHeight, + outputWidth, + channels, + filterHeight, + filterWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(width, height, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding, + dilation, + dilation); + + // The transposition of the result of ColFormat == kCFO + // is equal to the result of ColFormat == kOCF. 
+ MatrixPtr test; + output2->transpose(test, true); + autotest::TensorCheckErr(*output1, *test); + + Col2ImFunctor col2Im1; + Col2ImFunctor col2Im2; + + col2Im1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + col2Im2(input2->getData(), + imShape, + output2->getData(), + colShape2, + stride, + stride, + padding, + padding, + dilation, + dilation); + autotest::TensorCheckErr(*input1, *input2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor(); } + +#ifdef PADDLE_WITH_CUDA + +TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor(); } + +#endif + +template +void TestIm2ColMobileFunctor() { + for (size_t channels : {32}) { + for (size_t inputHeight : {33, 100}) { + for (size_t inputWidth : {32, 96}) { + for (size_t filterHeight : {5}) { + for (size_t filterWidth : {7}) { + for (size_t stride : {2}) { + for (size_t padding : {1}) { + for (size_t dilation : {1, 3}) { + size_t filterSizeH = (filterHeight - 1) * dilation + 1; + size_t filterSizeW = (filterWidth - 1) * dilation + 1; + if (inputHeight + 2 * padding < filterSizeH || + inputWidth + 2 * padding < filterSizeW) + break; + if (padding >= filterSizeH || padding >= filterSizeW) break; + size_t outputHeight = + (inputHeight - filterSizeH + 2 * padding) / stride + 1; + size_t outputWidth = + (inputWidth - filterSizeW + 2 * padding) / stride + 1; + + TensorShape imShape = + TensorShape({channels, inputHeight, inputWidth}); + TensorShape colShape1 = TensorShape({channels, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + size_t height = channels * filterHeight * filterWidth; + size_t width = outputHeight * outputWidth; + VectorPtr input1 = + Vector::create(imShape.getElements(), false); + VectorPtr input2 = + Vector::create(imShape.getElements(), false); + MatrixPtr output1 = + Matrix::create(height, width, false, false); + MatrixPtr output2 = + Matrix::create(height, width, false, false); + input1->uniform(0.001, 1); + input2->copyFrom(*input1); + + Im2ColFunctor im2Col1; + Im2ColMobileFunctor im2Col2; + im2Col1(input1->getData(), + imShape, + output1->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation); + im2Col2(input2->getData(), + imShape, + output2->getData(), + colShape1, + stride, + stride, + padding, + padding, + dilation, + dilation, + channels, + 0, + outputHeight, + outputHeight * outputWidth); + + autotest::TensorCheckEqual(*output1, *output2); + } + } + } + } + } + } + } + } +} + +TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor(); } + +} // namespace paddle diff --git a/paddle/legacy/function/MulOp.cpp b/paddle/legacy/function/MulOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..750978fc90201ccdc0a32f93fc01a2170d3f39d5 --- /dev/null +++ b/paddle/legacy/function/MulOp.cpp @@ -0,0 +1,347 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MulOp.h" +#include "GemmFunctor.h" +#include "paddle/legacy/math/SIMDFunctions.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace { +inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { + for (unsigned int i = 0; i < len; ++i) { + a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i]; + } +} + +inline void colVecAddTo( + real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { + for (unsigned int i = 0; i < len; ++i) { + a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c; + } +} +} // namespace + +namespace paddle { +/// sparse matrix (+)= dense matrix * dense matrix +template <> +void MulOp(CpuSparseMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK_EQ(out.getValueType(), FLOAT_VALUE); + if (scaleT == 0) { + out.zeroMem(); + } + const real* A = a.getData(); + const real* B = b.getData(); + real* C = out.getValue(); + int* rows = out.getRows(); + int* cols = out.getCols(); + size_t width = out.getWidth(); + size_t height = out.getHeight(); + + /// SPARSE_CSC, {a any, b not trans} + if (out.getFormat() == SPARSE_CSC) { + /// b not trans and a any + CHECK(!bTrans); + size_t m = !aTrans ? a.getWidth() : a.getHeight(); + for (size_t i = 0; i < width; i++) { + size_t start = out.getColStartIdx(i); + size_t end = out.getColStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t rowIdx = rows[j]; + for (size_t k = 0; k < m; k++) { + sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) * + B[k * width + i]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + return; + } + + /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans} + if (out.getFormat() == SPARSE_CSR) { + /// a and b can not both transpose + CHECK(!(aTrans && bTrans)); + size_t m = a.getWidth(); + for (size_t i = 0; i < height; i++) { + size_t start = out.getRowStartIdx(i); + size_t end = out.getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += (!aTrans ? A[i * m + k] : A[k * height + i]) * + (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]); + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + return; + } +} + +/// dense matrix (+)= dense matrix * dense matrix +template <> +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + BlasGemm::compute( + aTrans, + bTrans, + out.getHeight(), + out.getWidth(), + !aTrans ? a.getWidth() : a.getHeight(), + scaleAB, + a.getData(), + a.getStride(), + b.getData(), + b.getStride(), + scaleT, + out.getData(), + out.getStride()); +} + +/// dense matrix (+)= sparse matrix * dense matrix +template <> +void MulOp(CpuMatrix& out, + const CpuSparseMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + if (scaleT == 0) { + out.zeroMem(); + } + const real* B = b.getData(); + real* C = out.getData(); + if (out.getWidth() % 32 == 0) { + CHECK_EQ((size_t)B % 32, 0UL); + CHECK_EQ((size_t)C % 32, 0UL); + } + + int* cols = a.getCols(); + real* values = a.getValue(); + for (size_t i = 0; i < a.getHeight(); ++i) { + const int start = a.getRowStartIdx(i); + const int end = a.getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]), + !aTrans ? const_cast(b).getRow(cols[j]) + : const_cast(b).getRow(i), + (a.getValueType() == FLOAT_VALUE) ? 
values[j] : (real)1.0, + out.getWidth()); + } + } +} + +/// dense matrix (+)= dense matrix * sparse matrix +template <> +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + if (scaleT == 0) { + out.zeroMem(); + } + real* A = const_cast(a.getData()); + real* B = const_cast(b.getValue()); + real* C = out.getData(); + int* rows = b.getRows(); + int* cols = b.getCols(); + + /// SPARSE_CSC format + if (b.getFormat() == SPARSE_CSC) { + for (size_t j = 0; j < b.getWidth(); ++j) { + int start = b.getColStartIdx(j); + int end = b.getColStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(!bTrans ? C + j : C + rows[i], + !bTrans ? A + rows[i] : A + j, + (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], + out.getHeight(), + out.getWidth(), + a.getWidth()); + } + } + return; + } + + /// SPARSE_CSR format + if (b.getFormat() == SPARSE_CSR) { + for (size_t j = 0; j < b.getHeight(); ++j) { + int start = b.getRowStartIdx(j); + int end = b.getRowStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(!bTrans ? C + cols[i] : C + j, + !bTrans ? A + j : A + cols[i], + (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i], + out.getHeight(), + out.getWidth(), + a.getWidth()); + } + } + return; + } +} + +/** + * mul operator + * out = scaleT * out + scaleAB * (A * B) + * here, scaleT in {0, 1}, scaleAB == 1, + * out = A * B, ASSIGN_TO + * out += A * B, ADD_TO + * + * + * \param outputs[0] output matrix (out), M * N, + * could be either Sparse or Dense Matrix + * M is num of rows, N is num of columns + * \param inputs[0] first input matrix (A), M * K (if non-trans) + * could be either Sparse or Dense Matrix + * M is num of rows, K is num of columns + * \param inputs[1] second input matrix (B), K * N (if non-trans) + * could be either Sparse or Dense Matrix + * K is num of rows, N is num of columns + * + * Support eight Mul operators, with both GPU and CPU devices + * For each device, four Mul operators are supported: + * 1. dense (out) = dense (A) * dense (B) + * 2. dense (out) = sparse (A) * dense (B) + * sparse matrix only support SPARSE_CSR format + * 3. dense (out) = dense (A) * sparse (B) + * sparse matrix support SPARSE_CSC and SPARSE_CSR formats + * 4. sparse (out) = dense (A) * dense (B) + * sparse matrix support SPARSE_CSC and SPARSE_CSR formats + * + */ +template +class MulFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { + aTrans_ = config.get("aTrans"); + bTrans_ = config.get("bTrans"); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK(!aTrans_ || !bTrans_) + << "Not support both a and b are transpose matrices"; + + CHECK_EQ((size_t)2, inputs.size()); + CHECK_EQ((size_t)1, outputs.size()); + CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data()); + CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); + CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); + CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); + + size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1]; + size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0]; + size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1]; + size_t bCol = !bTrans_ ? 
inputs[1].shape()[1] : inputs[1].shape()[0]; + /// C = A * B, or C += A * B, for matrix format + CHECK_EQ(aCol, bRow); + CHECK_EQ(aRow, outputs[0].shape()[0]); + CHECK_EQ(bCol, outputs[0].shape()[1]); + + /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO) + real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0; + + /// support dense = not both sparse * sparse + /// or sparse = dense * dense + CHECK((!outputs[0].isSparseArg() && + !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) || + (outputs[0].isSparseArg() && !inputs[0].isSparseArg() && + !inputs[1].isSparseArg())); + + auto outMat = outputs[0].matrix(); + /// dense matrix = dense matrix * dense matrix + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { + MulOp(outMat, + inputs[0].matrix(), + inputs[1].matrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + + /// dense matrix = dense matrix * sparse matrix + if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { + CHECK(!aTrans_) << "Not supported a transpose"; + MulOp(outMat, + inputs[0].matrix(), + inputs[1].sparse().SparseMatrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + + /// dense matrix = sparse matrix * dense matrix + if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + !outputs[0].isSparseArg()) { + CHECK(!bTrans_) << "Not supported b transpose"; + CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR) + << "Only supported SPARSE_CSR format for sparse matrix a"; + MulOp(outMat, + inputs[0].sparse().SparseMatrix(), + inputs[1].matrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + + /// sparse matrix = dense matrix * dense matrix + auto outSparseMat = outputs[0].sparse().SparseMatrix(); + if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() && + outputs[0].isSparseArg()) { + MulOp(outSparseMat, + inputs[0].matrix(), + inputs[1].matrix(), + 1.0, // scaleAB + scaleT, + aTrans_, + bTrans_); + return; + } + } + + private: + bool aTrans_; + bool bTrans_; +}; + +REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc); +#endif +} // namespace paddle diff --git a/paddle/legacy/function/MulOp.h b/paddle/legacy/function/MulOp.h new file mode 100644 index 0000000000000000000000000000000000000000..ab33bde17296cd2b17ac45c5a936cfd2727919a5 --- /dev/null +++ b/paddle/legacy/function/MulOp.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "Function.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { +/// CPU, dense matrix (+)= dense matrix * dense matrix +template +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// CPU, dense matrix (+)= sparse matrix * dense matrix +template +void MulOp(CpuMatrix& out, + const CpuSparseMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// CPU, dense matrix (+)= dense matrix * sparse matrix +template +void MulOp(CpuMatrix& out, + const CpuMatrix& a, + const CpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// CPU, sparse matrix (+)= dense matrix * dense matrix +template +void MulOp(CpuSparseMatrix& out, + const CpuMatrix& a, + const CpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, dense matrix (+)= dense matrix * dense matrix +template +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, dense matrix (+)= sparse matrix * dense matrix +template +void MulOp(GpuMatrix& out, + const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, dense matrix (+)= dense matrix * sparse matrix +template +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +/// GPU, sparse matrix (+)= dense matrix * dense matrix +template +void MulOp(GpuSparseMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans); + +} // namespace paddle diff --git a/paddle/legacy/function/MulOpGpu.cu b/paddle/legacy/function/MulOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..217c983cb75dfcbc0e17f752a66847c5e92fcc91 --- /dev/null +++ b/paddle/legacy/function/MulOpGpu.cu @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MulOp.h" +#include "hl_base.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { +/// dense matrix (+)= dense matrix * dense matrix +template <> +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + hl_matrix_mul(const_cast(a.getData()), + !aTrans ? HPPL_OP_N : HPPL_OP_T, + const_cast(b.getData()), + !bTrans ? HPPL_OP_N : HPPL_OP_T, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + !aTrans ? 
a.getWidth() : a.getHeight(), + scaleAB, + scaleT, + a.getStride(), + b.getStride(), + out.getStride()); +} + +/// dense matrix (+)= sparse matrix * dense matrix +template <> +void MulOp(GpuMatrix& out, + const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(out.isContiguous()); + CHECK(b.isContiguous()); + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + hl_matrix_csr_mul_dense(a.sMatrix_.get(), + aTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(b.getData()), + HPPL_OP_N, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + b.getHeight(), + scaleAB, + scaleT); +} + +/// dense matrix (+)= dense matrix * sparse matrix +template <> +void MulOp(GpuMatrix& out, + const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(out.isContiguous()); + CHECK(a.isContiguous()); + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + + if (b.format_ == SPARSE_CSC) { + hl_matrix_dense_mul_csc(const_cast(a.getData()), + HPPL_OP_N, + b.sMatrix_.get(), + bTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + a.getWidth(), + scaleAB, + scaleT); + } else { + hl_matrix_dense_mul_csr(const_cast(a.getData()), + HPPL_OP_N, + b.sMatrix_.get(), + bTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(out.getData()), + out.getHeight(), + out.getWidth(), + a.getWidth(), + scaleAB, + scaleT); + } +} + +/// sparse matrix (+)= dense matrix * dense matrix +template <> +void MulOp(GpuSparseMatrix& out, + const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT, + bool aTrans, + bool bTrans) { + CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match"; + hl_sparse_matrix_mul(const_cast(a.getData()), + aTrans ? HPPL_OP_T : HPPL_OP_N, + const_cast(b.getData()), + bTrans ? HPPL_OP_T : HPPL_OP_N, + out.sMatrix_.get(), + out.getHeight(), + out.getWidth(), + !bTrans ? b.getHeight() : b.getWidth(), + scaleAB, + scaleT); +} + +} // namespace paddle diff --git a/paddle/legacy/function/MulOpTest.cpp b/paddle/legacy/function/MulOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab08b6f8696ff4aefd2dbdda591b20730b46898c --- /dev/null +++ b/paddle/legacy/function/MulOpTest.cpp @@ -0,0 +1,212 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "FunctionTest.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/math/tests/test_matrixUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +/** + * C += A * B, A, B, C dense matrix + * dense = dense * dense + */ +void testFuncDDDMatrix( + bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) { + real scaleT = 1.0; + size_t heightA = (transa == false) ? dimM : dimK; + size_t widthA = (transa == false) ? dimK : dimM; + size_t heightB = (transb == false) ? 
dimK : dimN; + size_t widthB = (transb == false) ? dimN : dimK; + size_t heightC = dimM; + size_t widthC = dimN; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb)); + // prepare input arguments + /// matrix A : HA * WA + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA})); + /// matrix B: HB * WB + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB})); + + /// output matrix C: HC * WC + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}), + scaleT == 1.0 ? ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MulOp, DDDMatrixMul) { + LOG(INFO) << "function test for dense = dense * dense matrix"; + for (const auto transa : {false, true}) { + for (const auto transb : {false, true}) { + for (const auto dimM : {1, 10, 100}) { + for (const auto dimN : {1, 10}) { + for (const auto dimK : {8}) { + if (transa && transb) { + continue; + } + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " transa=" << transa << " transb=" << transb + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK; + testFuncDDDMatrix(transa, transb, dimM, dimN, dimK); + } + } + } + } + } +} + +/** + * C += A * B, B, C dense, A sparse + * dense = sparse * dense + */ +void testFuncDSparseDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real scaleT = 1.0; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); + // prepare input arguments + /// sparse matrix A : M * K + test.addInputs(SparseMatrixArg( + VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE)); + /// matrix B: K * N + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); + + /// output matrix C: M * N + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), + scaleT == 1.0 ? ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MuLOp, DSparseDMul) { + LOG(INFO) << "function test for dense = sparse * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSR}) { + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} + +/** + * C += A * B, A, C dense, B sparse + * dense = dense * sparse + */ +void testFuncDDSparseMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real scaleT = 1.0; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); + // prepare input arguments + /// matrix A : M * K + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); + + /// matrix B: K * N + test.addInputs(SparseMatrixArg( + VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE)); + + /// output matrix C: M * N + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}), + scaleT == 1.0 ? 
ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MulOp, DDSparseMul) { + LOG(INFO) << "function test for dense = dense * sparse matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) { + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} + +/** + * C += A * B, A sparse, B, C dense + * sparse = dense * dense + */ +void testFuncSparseDDMatrix( + size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { + real scaleT = 1.0; + // init Test object + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); + // prepare input arguments + /// matrix A : M * K + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); + + /// matrix B: K * N + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN})); + + /// output sparse matrix C: M * N + test.addOutputs( + SparseMatrixArg( + VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE), + scaleT == 1.0 ? ADD_TO : ASSIGN_TO); + // run Function + test.run(); +} + +TEST(MulOp, SparseDDMul) { + LOG(INFO) << "function test for sparse = dense * dense matrix"; + for (const auto dimM : {10, 100, 1000}) { + for (const auto dimN : {10, 100}) { + for (const auto dimK : {3, 10}) { + for (const auto nnz : {3, 10}) { + for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) { + VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ') + << " dimM=" << std::setw(5) << dimM + << " dimN=" << std::setw(5) << dimN + << " dimK=" << std::setw(5) << dimK + << " nnz=" << std::setw(5) << nnz + << " format=" << std::setw(5) << FORMAT; + testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT); + } + } + } + } + } +} diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/legacy/function/NaiveConvOp.cpp similarity index 100% rename from paddle/function/NaiveConvOp.cpp rename to paddle/legacy/function/NaiveConvOp.cpp diff --git a/paddle/legacy/function/PadOp.cpp b/paddle/legacy/function/PadOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d011d28e6938fac6980bed88f774abdbf3532d4 --- /dev/null +++ b/paddle/legacy/function/PadOp.cpp @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "PadOp.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +template <> +void Pad(real* outputs, + const real* inputs, + const int num, + const int inC, + const int inH, + const int inW, + const PadConf& pad) { + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; + int outC = inC + cstart + cend; + int outH = inH + hstart + hend; + int outW = inW + wstart + wend; + for (int i = 0; i < num; i++) { + for (int c = 0; c < inC; c++) { + for (int h = 0; h < inH; h++) { + int inoff = ((i * inC + c) * inH + h) * inW; + int outoff = + ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; + memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real)); + } + } + } +} + +template <> +void PadGrad(real* inGrad, + const real* outGrad, + const int num, + const int inC, + const int inH, + const int inW, + const PadConf& pad) { + int cstart = pad.channel[0], cend = pad.channel[1]; + int hstart = pad.height[0], hend = pad.height[1]; + int wstart = pad.width[0], wend = pad.width[1]; + int outC = inC + cstart + cend; + int outH = inH + hstart + hend; + int outW = inW + wstart + wend; + for (int i = 0; i < num; i++) { + for (int c = 0; c < inC; c++) { + for (int h = 0; h < inH; h++) { + int inoff = ((i * inC + c) * inH + h) * inW; + int outoff = + ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart; + CpuVector inG = CpuVector(inW, inGrad + inoff); + CpuVector outG = CpuVector(inW, const_cast(outGrad + outoff)); + inG += outG; + } + } + } +} + +static inline PadConf castToPadConf(const FuncConfig& conf) { + return {conf.get>("channel"), + conf.get>("height"), + conf.get>("width")}; +} + +/** + * \brief Padding zeros to input according to the specify dimension. + * The struct pad_ contains the padding size in each dimension. + * The input and output is a 4D tensor. In PadFunc, we only + * pad zeros to the 2nd to 4th dimension. + * + * Argument in this Function: + * \param pad_ A struct object contains the padding size in each dimension. + * It has six integers. The channelStart and channelEnd indicate + * how many zeros to add before and after the input in channel + * dimension. And the heightStart and heightEnd indicate padding + * in height dimension. The widthStart and widthEnd indicate the + * padding in width dimension. + * \param inputs A 4D tensor, only one input. + * \param outputs A 4D tensor, the output value after padding. + * + * For example, + * Input(2,2,2,3) = [ + * [ [[1,2,3], [3,4,5]], + * [[2,3,5], [1,6,7]] ], + * [ [[4,3,1], [1,8,7]], + * [[3,8,9], [2,3,5]] ] + * ] # the shape is (1,2,2,3) + * + * pad_: if channelStart = channelEnd = 1, others are 0. + * Output(2,4,2,3) = [ + * [ [[0,0,0], [0,0,0]], + * [[1,2,3], [3,4,5]], + * [[2,3,5], [1,6,7]], + * [[0,0,0], [0,0,0]] ], + * [ [[0,0,0], [0,0,0]], + * [[4,3,1], [1,8,7]], + * [[3,8,9], [2,3,5]], + * [[0,0,0], [0,0,0]] ] + * ] # the shape is (2,4,2,3) + * + * pad_: if widthStart = 1, widthEnd = 2, others are 0. + * Output(2,2,2,6) = [ + * [ [[0,1,2,3,0,0], [0,3,4,5,0,0]], + * [[0,2,3,5,0,0], [0,1,6,7,0,0]] ], + * [ [[0,4,3,1,0,0], [0,1,8,7,0,0]], + * [[0,3,8,9,0,0], [0,2,3,5,0,0]] ], + * ] # the shape is (2,2,2,6) + * + * pad_: if heightStart = 1, heightEnd = 1, others are 0. 
+ * Output(2,2,4,3) = [ + * [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]], + * [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ], + * [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]], + * [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ], + * ] # the shape is (2,2,4,3) + */ + +template +class PadFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + size_t num = inputs[0].shape()[0]; + size_t inC = inputs[0].shape()[1]; + size_t inH = inputs[0].shape()[2]; + size_t inW = inputs[0].shape()[3]; + typename Tensor::Vector vec(outputs[0].shape().getElements(), + outputs[0].data()); + vec.zero(); + + Pad(outputs[0].data(), + inputs[0].data(), + num, + inC, + inH, + inW, + pad_); + } + + private: + PadConf pad_; +}; + +/** + * \brief The backward propagation of padding Function. Remove the elements + * in the padding positions of forward. + * + * Argument in this Function: + * \param pad_ The same meaning as it in PadFunc. + * \param inputs The gradient with respect to the output value of PadFunc. + * \param outputs The gradient with respect to the input value of PadFunc. + */ + +template +class PadGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { pad_ = castToPadConf(config); } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + + size_t num = outputs[0].shape()[0]; + size_t inC = outputs[0].shape()[1]; + size_t inH = outputs[0].shape()[2]; + size_t inW = outputs[0].shape()[3]; + + if (outputs[0].getArgType() != ADD_TO) { + // for unit test + typename Tensor::Vector tmp( + outputs[0].shape().getElements(), outputs[0].data()); + tmp.zero(); + } + + PadGrad(outputs[0].data(), + inputs[0].data(), + num, + inC, + inH, + inW, + pad_); + } + + private: + PadConf pad_; +}; + +REGISTER_TYPED_FUNC(Pad, CPU, PadFunc); +REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(Pad, GPU, PadFunc); +REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/PadOp.h b/paddle/legacy/function/PadOp.h similarity index 100% rename from paddle/function/PadOp.h rename to paddle/legacy/function/PadOp.h diff --git a/paddle/function/PadOpGpu.cu b/paddle/legacy/function/PadOpGpu.cu similarity index 100% rename from paddle/function/PadOpGpu.cu rename to paddle/legacy/function/PadOpGpu.cu diff --git a/paddle/function/PadOpTest.cpp b/paddle/legacy/function/PadOpTest.cpp similarity index 100% rename from paddle/function/PadOpTest.cpp rename to paddle/legacy/function/PadOpTest.cpp diff --git a/paddle/legacy/function/RowConvOp.cpp b/paddle/legacy/function/RowConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3be50e80d71fabdb3e7a22bfc061da09412c132d --- /dev/null +++ b/paddle/legacy/function/RowConvOp.cpp @@ -0,0 +1,225 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "RowConvOp.h" +#include +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +template <> +void RowConv(CpuMatrix& out, + const CpuMatrix& in, + const CpuMatrix& filter, + const CpuIVector& seq) { + const int* starts = seq.getData(); + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + for (size_t i = 0; i < numSeq; ++i) { + size_t begin = starts[i]; + size_t end = starts[i + 1]; + for (size_t j = begin; j < end; ++j) { + MatrixPtr x; + MatrixPtr w; + if ((j + contextLength) < end) { + x = (const_cast(in)).subMatrix(j, contextLength); + w = (const_cast(filter)).subMatrix(0, contextLength); + } else { + x = (const_cast(in)).subMatrix(j, end - j); + w = (const_cast(filter)).subMatrix(0, end - j); + } + MatrixPtr y = out.subMatrix(j, 1); + y->addDotMulVMM(*x, *w); + } + } +} + +template <> +void RowConvGrad(const CpuMatrix& outG, + const CpuMatrix& in, + const CpuMatrix& filter, + CpuMatrix& inG, + CpuMatrix& filterG, + const CpuIVector& seq) { + // gradient w.r.t filter + const int* starts = seq.getData(); + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + if (filterG) { + for (size_t i = 0; i < numSeq; ++i) { + size_t begin = starts[i]; + size_t end = starts[i + 1]; + size_t steps = end - begin; + for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) { + MatrixPtr x = + (const_cast(in)).subMatrix(begin + j, steps - j); + MatrixPtr dy = + (const_cast(outG)).subMatrix(begin, steps - j); + MatrixPtr dw = filterG.subMatrix(j, 1); + dw->addDotMulVMM(*dy, *x); + } + } + } + + // gradient w.r.t input feature + if (inG) { + for (size_t i = 0; i < numSeq; ++i) { + size_t begin = starts[i]; + size_t end = starts[i + 1]; + size_t steps = end - begin; + for (size_t j = 0; j < steps; ++j) { + MatrixPtr dx = inG.subMatrix(begin + j, 1); + for (size_t t = 0; t < contextLength; ++t) { + if (int(j - t) >= 0) { + MatrixPtr dy = + (const_cast(outG)).subMatrix(begin + j - t, 1); + MatrixPtr w = (const_cast(filter)).subMatrix(t, 1); + dx->addDotMul(*dy, *w, 1.0, 1.0); + } + } + } + } + } +} + +/** + * \brief The row convolution is called lookahead convolution. It is firstly + * introduced in deep-speech2 system. The bidirectional RNN that learns + * representation for a sequence by performing a forward and a backward pass + * through the entire sequence. However, unlike unidirectional RNNs, + * bidirectional RNNs are challenging to deploy in an online and low-latency + * setting. The lookahead convolution incorporates information from future + * subsequences in a computationally efficient manner to improve unidirectional + * recurrent neural networks. + * + * The connection of row convolution is different form the 1D sequence + * convolution. Assumed that, the future context-length is k, that is to say, + * it can get the output at timestep t by using the the input feature from t-th + * timestep to (t+k)-th timestep. 
Assumed that the hidden dim of input + * activations are d, the activations r_t for the new layer at time-step t are: + * + * + * -- k + 1 + * r(t,i) = > W(i,j) * h(t+j-1, i), for (1 <= i <= d) + * -- j = 1 + * + * + * The weight shape is: (k + 1) x d + * Function Arguments: + * + * \param inputs[0] The input activations. + * \param inputs[0] The filter (or weight) and shape is (k+1) x d. + * \param outputs[1] The output activations. + * + * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in + * English + * and Mandarin. https://arxiv.org/abs/1512.02595 + */ + +template +class RowConvFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + // check + CHECK_EQ(2UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + // TODO(qingqing): support ASSIGN_TO. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here."; + const auto in = dynamic_cast(inputs[0]); + auto out = dynamic_cast(outputs[0]); + auto w = inputs[1]; + CHECK(in.data() && out.data() && in.getSequenceId().data()); + CHECK_EQ(in.shape().ndims(), 2UL); + CHECK(in.shape() == out.shape()); + CHECK_EQ(w.shape()[1], in.shape()[1]); + + auto outMat = out.matrix(); + const auto inMat = in.matrix(); + const auto wMat = w.matrix(); + const auto seqId = in.getSequenceId().vector(); + + RowConv(outMat, inMat, wMat, seqId); + } +}; + +/** + * \brief The backward of row convolution function. This function calculated + * the gradient w.r.t filter and the gradient w.r.t input activations(or data). + * + * Argument in this Function: + * + * \param inputs[0] The gradient w.r.t output activations. + * \param inputs[1] The input activations. + * \param inputs[2] The filter (or weight) and shape is (k+1) x d. + * \param outputs[0] The gradient w.r.t input activations. + * \param outputs[1] The gradient w.r.r filter. + * + * Abbreviation: + * w.r.t: with respect to. + */ + +template +class RowConvGradFunc : public FunctionBase { + // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc + public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + // check + CHECK_EQ(3UL, inputs.size()); + CHECK_EQ(2UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + CHECK_EQ(outputs[1].getArgType(), ADD_TO); + CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() && + outputs[0].isSequenceArg()) + << "SequenceArg required here."; + + const auto outGrad = dynamic_cast(inputs[0]); + const auto in = dynamic_cast(inputs[1]); + const auto w = inputs[2]; + auto inGrad = dynamic_cast(outputs[0]); + auto wGrad = outputs[1]; + + CHECK_EQ(in.shape().ndims(), 2UL); + CHECK(in.shape() == inGrad.shape()); + CHECK(in.shape() == outGrad.shape()); + CHECK_EQ(wGrad.shape()[1], in.shape()[1]); + + const auto outGMat = outGrad.matrix(); + const auto inMat = in.matrix(); + const auto wMat = w.matrix(); + auto inGMat = inGrad.data() + ? inGrad.matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + auto wGMat = wGrad.data() + ? 
wGrad.matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + const auto seqId = in.getSequenceId().vector(); + + RowConvGrad(outGMat, inMat, wMat, inGMat, wGMat, seqId); + } +}; + +REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc); +REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc); +REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/RowConvOp.h b/paddle/legacy/function/RowConvOp.h similarity index 100% rename from paddle/function/RowConvOp.h rename to paddle/legacy/function/RowConvOp.h diff --git a/paddle/legacy/function/RowConvOpGpu.cu b/paddle/legacy/function/RowConvOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..a6d2e4c7e38b12bcd448a85f9e74df226e6984af --- /dev/null +++ b/paddle/legacy/function/RowConvOpGpu.cu @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/cuda/include/hl_base.h" +#include "paddle/legacy/function/RowConvOp.h" + +namespace paddle { + +template +__global__ void KeRowConv(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sw[BLOCK_H][BLOCK_W]; + + for (int i = tidy; i < context; i += blky) { + sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; + } + + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context; ++t) { + if ((start + j + t) < end) { + int xoff = off + t * width; + real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; + sum += sw[t][tidx] * xVal; + } + } + if (gidx + tidx < width) { + y[off + gidx + tidx] += sum; + } + } + } +} + +__global__ void KeRowConv2(real* y, + const real* x, + const real* w, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + int off = (start + j) * width; + real sum = 0; + for (int t = 0; t < context && (start + j + t) < end; ++t) { + int xoff = off + t * width; + real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; + real wd = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; + sum += wd * xd; + } + if (gidx + tidx < width) { + y[off + gidx + tidx] += sum; + } + } + } +} + +template <> +void RowConv(GpuMatrix& out, // NOLINT + const GpuMatrix& in, + const GpuMatrix& filter, + const GpuIVector& seq) { + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + const size_t height = in.getHeight(); + const size_t width = in.getWidth(); + + real* y = out.getData(); + const real* x = in.getData(); + const real* w = filter.getData(); + const int* starts = seq.getData(); + + dim3 dimBlock(32, 32); + dim3 dimGrid(DIVUP(width, dimBlock.x), 1); + + if (contextLength <= 32) { + KeRowConv<32, 32><<>>( + y, x, w, starts, height, width, numSeq, contextLength); + } else { + KeRowConv2<<>>( + y, x, w, starts, height, width, numSeq, contextLength); + } + CHECK_SYNC("RowConv"); +} + +template +__global__ void KeRowConvBwWeight(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sh_x[BLOCK_W][BLOCK_H]; + __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1]; + __shared__ real sh_dw[CONTEXT][BLOCK_W]; + + if (tidy < context) { + sh_dw[tidy][tidx] = 0.0; + } + __syncthreads(); + + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; + for (int j = tidy; j < size; j += BLOCK_H) { + int xoff = gidx + tidx; + int yoff = start + j; + + // transpose + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = + (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + __syncthreads(); + if (tidy < (context - 1)) { + yoff = yoff - context + 1; + sh_dy[tidx][tidy] = + (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + } + __syncthreads(); + + for (int t = 0; t < context; t++) { + real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t]; + __syncthreads(); + // warp size and blockDim.x is 32. + + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + + __syncthreads(); + if (tidx == 0) { + sh_dw[t][tidy] += val; + } + __syncthreads(); + } + } + } + + for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) { + dw[t * width + gidx + tidx] += sh_dw[t][tidx]; + } +} + +template +__global__ void KeRowConvBwWeight2(real* dw, + const real* x, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sh_x[BLOCK_H][BLOCK_W]; + __shared__ real sh_dy[BLOCK_H][BLOCK_W]; + + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + + const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H; + for (int j = tidy; j < size; j += BLOCK_H) { + int xoff = gidx + tidx; + int yoff = start + j; + + // transpose + sh_x[tidx][tidy] = + (xoff < width && yoff < end) ? 
x[yoff * width + xoff] : 0.0; + __syncthreads(); + + for (int t = 0; t < context; t++) { + sh_dy[tidx][tidy] = + (xoff < width && (yoff - t) >= start && yoff - t < end) + ? dy[(yoff - t) * width + xoff] + : 0.0; + __syncthreads(); + + real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; + __syncthreads(); + // warp size and blockDim.x is 32. + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + + __syncthreads(); + + if (tidx == 0 && (gidx + tidy) < width) { + dw[t * width + gidx + tidy] += val; + } + } + } + } +} + +template +__global__ void KeRowConvBwData(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sw[BLOCK_H][BLOCK_W]; + + for (int i = tidy; i < context; i += blky) { + sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0; + } + + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context && (j - t) >= 0; ++t) { + int dyOff = off - t * width; + real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; + sum += sw[t][tidx] * dyVal; + } + if (gidx + tidx < width) { + dx[off + gidx + tidx] += sum; + } + } + } +} + +__global__ void KeRowConvBwData2(real* dx, + const real* w, + const real* dy, + const int* starts, + const int height, + const int width, + const int numSeq, + const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context && (j - t) >= 0; ++t) { + int dyOff = off - t * width; + real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; + real wVal = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; + sum += wVal * dyVal; + } + if (gidx + tidx < width) { + dx[off + gidx + tidx] += sum; + } + } + } +} + +template <> +void RowConvGrad(const GpuMatrix& outG, + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, // NOLINT + GpuMatrix& filterG, // NOLINT + const GpuIVector& seq) { + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + const size_t height = in.getHeight(); + const size_t width = in.getWidth(); + + const real* dy = outG.getData(); + const real* x = in.getData(); + const real* w = filter.getData(); + const int* starts = seq.getData(); + + if (filterG) { + dim3 dimBlock(32, 32); + dim3 dimGrid(DIVUP(width, dimBlock.x), 1); + real* dw = filterG.getData(); + if (contextLength <= 32) { + KeRowConvBwWeight<32, 32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); + } else { + KeRowConvBwWeight2<32, 32><<>>( + dw, x, dy, starts, height, width, numSeq, contextLength); + } + } + + if (inG) { + real* dx = inG.getData(); + dim3 dimBlock2(32, 32); + dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); + if (contextLength <= 64) { + KeRowConvBwData<32, 64><<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); + } else { + KeRowConvBwData2<<>>( + dx, w, dy, starts, height, width, numSeq, contextLength); + } + } + + CHECK_SYNC("RowConvGrad"); +} + +} // namespace paddle diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/legacy/function/RowConvOpTest.cpp similarity index 100% rename from paddle/function/RowConvOpTest.cpp rename to paddle/legacy/function/RowConvOpTest.cpp diff --git a/paddle/legacy/function/ScaleSubRegionOp.cpp b/paddle/legacy/function/ScaleSubRegionOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03a422a740dca4499532cdb1bdfbf3d3ab272a9a --- /dev/null +++ b/paddle/legacy/function/ScaleSubRegionOp.cpp @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ScaleSubRegionOp.h" +#include "paddle/legacy/function/TensorShape.h" + +namespace paddle { + +template <> +void ScaleSubRegion(real* outputs, + const real* inputs, + const real* indices, + const TensorShape shape, + const FuncConfig& conf) { + real value = conf.get("value"); + + int number = shape[0]; + int channel = shape[1]; + int height = shape[2]; + int width = shape[3]; + + memcpy(outputs, inputs, number * channel * height * width * sizeof(real)); + + for (int n = 0; n < number; ++n) { + // indices start from 1 + int offset = n * 6; + for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) { + for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) { + for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) { + int idx = ((n * channel + c) * height + h) * width + w; + outputs[idx] *= value; + } + } + } + } +} + +template <> +void ScaleSubRegionGrad(const real* inGrad, + real* outGrad, + const real* indices, + const TensorShape shape, + const FuncConfig& conf) { + real value = conf.get("value"); + + int number = shape[0]; + int channel = shape[1]; + int height = shape[2]; + int width = shape[3]; + + for (int n = 0; n < number; ++n) { + for (int c = 0; c < channel; ++c) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + int idx = ((n * channel + c) * height + h) * width + w; + int offset = n * 6; + if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) && + h >= (indices[offset + 2] - 1) && + h <= (indices[offset + 3] - 1) && + w >= (indices[offset + 4] - 1) && + w <= (indices[offset + 5] - 1)) { + outGrad[idx] += inGrad[idx] * value; + } else { + outGrad[idx] += inGrad[idx]; + } + } + } + } + } +} + +/** + * \brief For each instance, ScaleSubRegion can be used to multiply a value to + * a specified sub continuous region. By providing start index and end + * index for C/H/W, you can specify the location and shape of the region. + * + * Argument in this Function: + * \param inputs A 4-D tensor with shape [N, C, H, W], only one input. + * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. + * \param outputs A 4-D tensor with same shape as inputs, output value. + */ +template +class ScaleSubRegionFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(2UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + + TensorShape shape = inputs[0].shape(); + + ScaleSubRegion(outputs[0].data(), + inputs[0].data(), + inputs[1].data(), + shape, + conf_); + } + + private: + FuncConfig conf_; +}; + +/** + * \brief The backward propagation of ScaleSubRegion Function. + * + * Argument in this Function: + * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient. + * \param indices A 2-D tensor with shape [N, 6], indicates the sub region. + * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value. 
+ */ + +template +class ScaleSubRegionGradFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override { conf_ = config; } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(2UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + + TensorShape shape = inputs[0].shape(); + + ScaleSubRegionGrad(inputs[0].data(), + outputs[0].data(), + inputs[1].data(), + shape, + conf_); + } + + private: + FuncConfig conf_; +}; + +REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc); +REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc); +REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/legacy/function/ScaleSubRegionOp.h similarity index 100% rename from paddle/function/ScaleSubRegionOp.h rename to paddle/legacy/function/ScaleSubRegionOp.h diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/legacy/function/ScaleSubRegionOpGpu.cu similarity index 100% rename from paddle/function/ScaleSubRegionOpGpu.cu rename to paddle/legacy/function/ScaleSubRegionOpGpu.cu diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/legacy/function/ScaleSubRegionOpTest.cpp similarity index 100% rename from paddle/function/ScaleSubRegionOpTest.cpp rename to paddle/legacy/function/ScaleSubRegionOpTest.cpp diff --git a/paddle/legacy/function/SwitchOp.cpp b/paddle/legacy/function/SwitchOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c6accd18039180aa521c18193e576d22e11f5a97 --- /dev/null +++ b/paddle/legacy/function/SwitchOp.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SwitchOp.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +template <> +void NCHW2NHWC(real* outputs, + const real* inputs, + const int num, + const int inC, + const int inH, + const int inW, + const int argType) { + for (int n = 0; n < num; ++n) { + for (int c = 0; c < inC; ++c) { + for (int h = 0; h < inH; ++h) { + for (int w = 0; w < inW; ++w) { + if (argType == ADD_TO) { + outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++); + } else { + outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++); + } + } + } + } + } +} + +template <> +void NHWC2NCHW(real* outputs, + const real* inputs, + const int num, + const int inH, + const int inW, + const int inC, + const int argType) { + for (int n = 0; n < num; ++n) { + for (int h = 0; h < inH; ++h) { + for (int w = 0; w < inW; ++w) { + for (int c = 0; c < inC; ++c) { + if (argType == ADD_TO) { + outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++); + } else { + outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++); + } + } + } + } + } +} + +/** + * \brief Switch dimension order of image input. + * The input and output is a 4D tensor. 
Switch order + * 'batch_size,channels, height, width' to + * order 'batch_size, height, width, channels'. + * + * Argument in this Function: + * \param inputs input data with order 'batch_size,channels, height, width'. + * \param outputs output data with order 'batch_size, height, width, channels'. + */ +template +class NCHW2NHWCFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + + size_t num = inputs[0].shape()[0]; + size_t inC = inputs[0].shape()[1]; + size_t inH = inputs[0].shape()[2]; + size_t inW = inputs[0].shape()[3]; + NCHW2NHWC(outputs[0].data(), + inputs[0].data(), + num, + inC, + inH, + inW, + outputs[0].getArgType()); + } +}; + +/** + * \brief Switch dimension order of image input. + * The input and output is a 4D tensor. Switch order + * 'batch_size, height, width, channels' to + * order 'batch_size, channels, height, width'. + * + * Argument in this Function: + * \param inputs input data with order 'batch_size, height, width, channels'. + * \param outputs output data with order 'batch_size, channels, height, width'. + */ +template +class NHWC2NCHWFunc : public FunctionBase { + public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + + size_t num = inputs[0].shape()[0]; + size_t inH = inputs[0].shape()[1]; + size_t inW = inputs[0].shape()[2]; + size_t inC = inputs[0].shape()[3]; + + NHWC2NCHW(outputs[0].data(), + inputs[0].data(), + num, + inH, + inW, + inC, + outputs[0].getArgType()); + } +}; + +REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc); +REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc); +#ifdef PADDLE_WITH_CUDA +REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc); +REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/SwitchOp.h b/paddle/legacy/function/SwitchOp.h similarity index 100% rename from paddle/function/SwitchOp.h rename to paddle/legacy/function/SwitchOp.h diff --git a/paddle/function/SwitchOpGpu.cu b/paddle/legacy/function/SwitchOpGpu.cu similarity index 100% rename from paddle/function/SwitchOpGpu.cu rename to paddle/legacy/function/SwitchOpGpu.cu diff --git a/paddle/function/SwitchOpTest.cpp b/paddle/legacy/function/SwitchOpTest.cpp similarity index 100% rename from paddle/function/SwitchOpTest.cpp rename to paddle/legacy/function/SwitchOpTest.cpp diff --git a/paddle/function/TensorShape.h b/paddle/legacy/function/TensorShape.h similarity index 100% rename from paddle/function/TensorShape.h rename to paddle/legacy/function/TensorShape.h diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/legacy/function/TensorShapeTest.cpp similarity index 100% rename from paddle/function/TensorShapeTest.cpp rename to paddle/legacy/function/TensorShapeTest.cpp diff --git a/paddle/legacy/function/TensorType.h b/paddle/legacy/function/TensorType.h new file mode 100644 index 0000000000000000000000000000000000000000..13994821be7ba7264f43d8550e6800cdc5b93875 --- /dev/null +++ b/paddle/legacy/function/TensorType.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +enum ValueType { + VALUE_TYPE_INT32 = 0, + VALUE_TYPE_FLOAT = 1, + VALUE_TYPE_DOUBLE = 2, + VALUE_TYPE_BYTE = 3 +}; + +enum DeviceType { + DEVICE_TYPE_UNSPECIFIED = 0, + DEVICE_TYPE_CPU = 1, + DEVICE_TYPE_GPU = 2 +}; + +enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 }; + +enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 }; + +inline int sizeOfValuType(ValueType valueType) { + if (valueType == VALUE_TYPE_INT32) { + return 4; + } else if (valueType == VALUE_TYPE_FLOAT) { + return 4; + } else if (valueType == VALUE_TYPE_DOUBLE) { + return 8; + } else { + LOG(FATAL) << "Unknown type: " << valueType; + return 0; + } +} + +template +struct DataType; + +template <> +struct DataType { + static const ValueType value = VALUE_TYPE_FLOAT; +}; + +template <> +struct DataType { + static const ValueType value = VALUE_TYPE_DOUBLE; +}; + +template <> +struct DataType { + static const ValueType value = VALUE_TYPE_INT32; +}; + +namespace detail { + +template +struct MatrixT; + +template <> +struct MatrixT { + using type = CpuMatrix; +}; + +template <> +struct MatrixT { + using type = GpuMatrix; +}; + +template <> +struct MatrixT { + using type = void; // Not implemented +}; + +template <> +struct MatrixT { + using type = void; // Not implemented +}; + +template +struct SparseMatrixT; + +template <> +struct SparseMatrixT { + using type = CpuSparseMatrix; +}; + +template <> +struct SparseMatrixT { + using type = GpuSparseMatrix; +}; + +template <> +struct SparseMatrixT { + using type = void; // Not implemented +}; + +template <> +struct SparseMatrixT { + using type = void; // Not implemented +}; + +template +struct VectorT; + +template <> +struct VectorT { + using type = CpuVector; +}; + +template <> +struct VectorT { + using type = GpuVector; +}; + +template <> +struct VectorT { + using type = CpuIVector; +}; + +template <> +struct VectorT { + using type = GpuIVector; +}; + +} // namespace detail + +template +struct Tensor { + typedef typename detail::VectorT::type Vector; + typedef typename detail::MatrixT::type Matrix; + typedef typename detail::SparseMatrixT::type SparseMatrix; +}; + +} // namespace paddle diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/legacy/function/TensorTypeTest.cpp similarity index 100% rename from paddle/function/TensorTypeTest.cpp rename to paddle/legacy/function/TensorTypeTest.cpp diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6179635a9fec4afecf53fabdc6a818588b54c808 --- /dev/null +++ b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NeonDepthwiseConv.h" +#include "paddle/legacy/function/ConvOp.h" + +namespace paddle { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +class NeonDepthwiseConvFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; + CHECK_EQ(static_cast(inputChannels), groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. 
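    // (The NEON kernels dispatched below are specialised for 3x3 and 4x4
    // filters with stride 1 or 2; any other configuration aborts with
    // LOG(FATAL) "Not supported" after the checks that follow.)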
+ CHECK_EQ(strideH(), strideW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input + float* inputPadding = inputData; + int padInputHeight = inputHeight + 2 * paddingH(); + int padInputWidth = inputWidth + 2 * paddingW(); + int newSize = + batchSize * (inputChannels + 1) * padInputHeight * padInputWidth; + + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + + std::function + DepthWiseConv; + + if (filterWidth == 3 && strideW() == 1) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 3 && strideW() == 2) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run; + } else if (filterWidth == 4 && strideW() == 1) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; + } else if (filterWidth == 4 && strideW() == 2) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (int i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + padInputHeight, + padInputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * padInputHeight * padInputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifndef PADDLE_TYPE_DOUBLE +REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); +#endif + +#endif + +} // namespace paddle diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/legacy/function/neon/NeonDepthwiseConv.h similarity index 100% rename from paddle/function/neon/NeonDepthwiseConv.h rename to paddle/legacy/function/neon/NeonDepthwiseConv.h diff --git a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp new file mode 100644 index 0000000000000000000000000000000000000000..feb77e1ff9f591d63dbf86a05313d65025f7c65d --- /dev/null +++ b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "NeonDepthwiseConv.h" +#include "paddle/legacy/function/ConvOp.h" + +namespace paddle { + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +template +class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + int batchSize = input[0]; + int inputChannels = input[1]; + int inputHeight = input[2]; + int inputWidth = input[3]; + int filterHeight = getFilterHeight(filter); + int filterWidth = getFilterWidth(filter); + int outputChannels = output[1]; + int outputHeight = output[2]; + int outputWidth = output[3]; + int filterMultiplier = outputChannels / groups_; + CHECK_EQ(inputChannels, groups_); + + // only support strideH() == strideW() and filterHeight == filterWidth. + CHECK_EQ(strideH(), strideW()); + CHECK_EQ(paddingH(), paddingW()); + CHECK_EQ(filterHeight, filterWidth); + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + // padding the input, input -> inputPadding + float* inputPadding = inputData; + int padInputHeight = + (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH(); + int padInputWidth = + (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW(); + + if (padInputHeight > inputHeight || padInputWidth > inputWidth) { + int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; + resizeBuffer(newSize); + inputPadding = reinterpret_cast(memory_->getBuf()); + if (strideH() == 1) { + neon::Padding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else if (strideH() == 2) { + neon::StridePadding::run(inputData, + inputPadding, + batchSize * inputChannels, + inputHeight, + inputWidth, + padInputHeight, + padInputWidth); + } else { + LOG(FATAL) << "Not supported"; + } + } + + std::function + DepthWiseConv; + + if (filterWidth == 3) { + DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run; + } else if (filterWidth == 4) { + DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run; + } else { + LOG(FATAL) << "Not supported"; + } + + for (int i = 0; i < batchSize; i++) { + DepthWiseConv(inputPadding, + filterData, + padInputHeight, + padInputWidth, + outputChannels, + outputHeight, + outputWidth, + filterMultiplier, + outputData); + inputPadding += inputChannels * padInputHeight * padInputWidth; + outputData += outputChannels * outputHeight * outputWidth; + } + } +}; + +#ifndef PADDLE_TYPE_DOUBLE + +REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose, + CPU, + NeonDepthwiseConvTransposeFunction); + +#endif + +#endif + +} // namespace paddle diff --git a/paddle/function/neon/neon_util.h b/paddle/legacy/function/neon/neon_util.h similarity index 100% rename from paddle/function/neon/neon_util.h rename to paddle/legacy/function/neon/neon_util.h diff --git 
a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81c832e7747f8e75d322891476e08dacc435f5d4 --- /dev/null +++ b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp @@ -0,0 +1,247 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "nnpack.h" +#include "paddle/legacy/function/ConvOp.h" + +DEFINE_bool(nnpack_allocate_outside, + true, + "Allocate and free workspace memory outside the NNPACK interface."); +DEFINE_int32(nnpack_num_threads, + 0, + "The number of nnpack threads" + "default: 0; 0 to disable threadpool."); + +namespace paddle { + +nnp_convolution_algorithm get_nnp_convolution_algorithm( + const std::string& algorithm) { + if (algorithm == "auto") { + return nnp_convolution_algorithm_auto; + } else if (algorithm == "ft8x8") { + return nnp_convolution_algorithm_ft8x8; + } else if (algorithm == "ft16x16") { + return nnp_convolution_algorithm_ft16x16; + } else if (algorithm == "wt8x8") { + return nnp_convolution_algorithm_wt8x8; + } else if (algorithm == "implicit-gemm") { + return nnp_convolution_algorithm_implicit_gemm; + } else if (algorithm == "direct") { + return nnp_convolution_algorithm_direct; + } else { + return nnp_convolution_algorithm_auto; + } +} + +template +class NNPACKConvFunction : public ConvFunctionBase { + public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + algorithm_ = get_nnp_convolution_algorithm(config.get("algo")); + transform_strategy_ = nnp_convolution_transform_strategy_compute; + nnp_status status = nnp_initialize(); + CHECK_EQ(status, nnp_status_success); + workspaceBuffer_ = nullptr; + workspaceSize_ = 0; + + create_nnpack_threadpool(); + } + + ~NNPACKConvFunction() { + if (workspaceBuffer_) { + free(workspaceBuffer_); + } + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + check(inputs, outputs); + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + size_t outputHeight = output[2]; + size_t outputWidth = output[3]; + + nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; + nnp_padding padding = {.top = (size_t)paddingH(), + .right = (size_t)paddingW(), + .bottom = 
(size_t)paddingH(), + .left = (size_t)paddingW()}; + nnp_size kernelSize = {.width = filterWidth, .height = filterHeight}; + nnp_size outputSubsampling = {.width = (size_t)strideW(), + .height = (size_t)strideH()}; + + float* inputData = inputs[0].data(); + float* filterData = inputs[1].data(); + float* outputData = outputs[0].data(); + + void* bufferPtr = nullptr; + size_t* sizePtr = nullptr; + size_t needSize; + if (FLAGS_nnpack_allocate_outside) { + if (batchSize == 1) { + nnp_status status = nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + outputSubsampling, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &needSize, + nnp_activation_identity, + nullptr, + nullptr, + nullptr); + CHECK_EQ(status, nnp_status_success); + } else { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + nnp_status status = nnp_convolution_output(algorithm_, + batchSize, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &needSize, + nnp_activation_identity, + nullptr, + nullptr, + nullptr); + CHECK_EQ(status, nnp_status_success); + } + + VLOG(3) << "workspace size is " << needSize; + if (needSize > workspaceSize_) { + workspaceSize_ = needSize; + if (workspaceBuffer_) { + free(workspaceBuffer_); + } else { + posix_memalign(&workspaceBuffer_, 64, needSize); + } + } + + if (needSize) { + bufferPtr = workspaceBuffer_; + sizePtr = &needSize; + } + } + + size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth; + size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth; + size_t filterOffset = filter.getElements() / groups_; + + if (batchSize == 1) { + for (size_t g = 0; g < groups_; g++) { + nnp_status status = + nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels / groups_, + outputChannels / groups_, + inputSize, + padding, + kernelSize, + outputSubsampling, + inputData + inputOffset * g, + filterData + filterOffset * g, + nullptr, /* bias */ + outputData + outputOffset * g, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } + } else { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + + // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1. 
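      // Until that is fixed, the batched path treats the whole input as a
      // single group and issues one nnp_convolution_output call, so grouped
      // convolution with batchSize > 1 is rejected by the check below.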
+ CHECK_EQ(groups_, static_cast(1)); + nnp_status status = nnp_convolution_output(algorithm_, + batchSize, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + inputData, + filterData, + nullptr, /* bias */ + outputData, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } + } + + static void create_nnpack_threadpool() { + if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + + private: + nnp_convolution_algorithm algorithm_; + nnp_convolution_transform_strategy transform_strategy_; + void* workspaceBuffer_; + size_t workspaceSize_; + static pthreadpool_t threadpool_; +}; + +template +pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; + +REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); + +} // namespace paddle diff --git a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a2db83f5a36310ca6f173d6e6501118b34060761 --- /dev/null +++ b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/legacy/function/ConvOpTest.h" + +namespace paddle { + +TEST(NNPACK, Forward) { + Convolution( + "GemmConv-CPU", "NNPACKConv-CPU", forward); +} + +TEST(NNPACK, Depthwise) { + DepthwiseConvolution( + "GemmConv-CPU", "NNPACKConv-CPU", forward); +} + +} // namespace paddle diff --git a/paddle/gserver/CMakeLists.txt b/paddle/legacy/gserver/CMakeLists.txt similarity index 100% rename from paddle/gserver/CMakeLists.txt rename to paddle/legacy/gserver/CMakeLists.txt diff --git a/paddle/legacy/gserver/activations/ActivationFunction.cpp b/paddle/legacy/gserver/activations/ActivationFunction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae07c7e6d7fd9fe28a00dd209ae834cd28a327f7 --- /dev/null +++ b/paddle/legacy/gserver/activations/ActivationFunction.cpp @@ -0,0 +1,509 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ActivationFunction.h" + +#include +#include +#include +#include +#include +#include +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/legacy/utils/ClassRegistrar.h" +#include "paddle/legacy/utils/Logging.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "MKLDNNActivation.h" +#endif + +namespace paddle { + +static ClassRegistrar gActivationRegistrar; +/** + * @def ACTIVATION_CLASS_NAME + * @brief Macro for getting derived activation class name + * @note ACTIVATION_CLASS_NAME(softmax) softmax_; + * means softmaxActivation softmax_; + */ +#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation +/** + * @def BEGIN_DEFINE_ACTIVATION + * @brief Macro for defining a devried activation class + */ +#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME) \ + class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \ + private: \ + static const std::string name; \ + \ + public: \ + const std::string& getName() const { return name; } +/** + * @def END_DEFINE_ACTIVATION + * @brief Macro for registering a derived activation class + */ +#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \ + } \ + ; \ + const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \ + #ACTIVATION_NAME; \ + static InitFunction __reg_activation__##ACTIVATION_NAME([] { \ + gActivationRegistrar \ + .registerClass( \ + #ACTIVATION_NAME); \ + }); + +/** + * @brief The IdentityActivation class + * + * Do nothing when forward/backward. + */ +class IdentityActivation : public ActivationFunction { + public: + static const std::string name; + Error __must_check forward(Argument& act) { + (void)act; + return Error(); + } + Error __must_check backward(Argument& act) { + (void)act; + return Error(); + } + const std::string& getName() const { return name; } +}; +const std::string IdentityActivation::name = ""; +static InitFunction __reg_activation__identity([] { + gActivationRegistrar.registerClass(""); + gActivationRegistrar.registerClass("linear"); +}); + +/** + * @brief Sigmoid Activation + * \f[ + * f(z) = \frac{1}{1+exp(-z)} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(sigmoid) +Error __must_check forward(Argument& act) { + act.value->sigmoid(*act.value); + return Error(); +} +Error __must_check backward(Argument& act) { + act.grad->sigmoidDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(sigmoid) + +/** + * @brief Softmax Activation + * \f[ + * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(softmax) +private: +MatrixPtr sftMaxSum_; +MatrixPtr sftMaxDot_; + +public: +Error __must_check forward(Argument& act) { + act.value->softmax(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + MatrixPtr outputV = act.value; + MatrixPtr outputG = act.grad; + + if (outputG->useGpu()) { + outputG->softmaxBackward(*outputV); + } else { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(sftMaxDot_, + outputG->getHeight(), + outputG->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + Matrix::resizeOrCreate(sftMaxSum_, + outputG->getHeight(), + 1, + /* trans */ false, + useGpu(act.deviceId)); + + sftMaxDot_->dotMul(*outputG, *outputV); + sftMaxSum_->colMerge(*sftMaxDot_); + + act.grad->softmaxDerivative(*act.value, *sftMaxSum_); + } + return Error(); +} +END_DEFINE_ACTIVATION(softmax) + +/** + * @brief Sequence_softmax Activation + * @note Softmax on all frames of one sequence. + * Width of frame must be one. 
+ */ +BEGIN_DEFINE_ACTIVATION(sequence_softmax) +private: +ACTIVATION_CLASS_NAME(softmax) softmax_; +Argument argument_; + +public: +Error __must_check forward(Argument& act) { + if (act.value->getWidth() != 1UL) { + return Error( + "Input width for each timestep of sequence softmax should be 1"); + } + + if (!argument_.value) { + argument_.value = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + useGpu(act.deviceId)); + argument_.grad = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + useGpu(act.deviceId)); + } + + auto starts = + act.hasSubseq() + ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId)) + : act.sequenceStartPositions->getVector(useGpu(act.deviceId)); + act.value->sequenceSoftmax(*act.value, *starts); + return Error(); +} + +Error __must_check backward(Argument& act) { + if (act.value->getWidth() != 1UL) { + return Error( + "Input width for each timestep of sequence softmax should be 1"); + } + + size_t numSequences = + act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences(); + const int* starts = act.getCpuStartPositions(); + + for (size_t i = 0; i < numSequences; ++i) { + // TODO(Dangqingqing) optimization for GPU + size_t offset = starts[i]; + size_t size = starts[i + 1] - starts[i]; + argument_.value->setData(act.value->getData() + offset, 1UL, size); + argument_.grad->setData(act.grad->getData() + offset, 1UL, size); + + Error err = softmax_.backward(argument_); + if (!err.isOK()) return err; + } + return Error(); +} +END_DEFINE_ACTIVATION(sequence_softmax) + +/* + * @brief SoftSign Activation. + * \f[ + * f(z) = \frac{z}{1 + |z|} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(softsign) +private: +MatrixPtr denominator_; + +Error __must_check forward(Argument& act) { + size_t height = act.value->getHeight(); + size_t width = act.value->getWidth(); + Matrix::resizeOrCreate( + denominator_, height, width, false, useGpu(act.deviceId)); + denominator_->assign(*act.value); + denominator_->abs2(); + denominator_->add(1.); + + act.value->dotDiv(*act.value, *denominator_); + return Error(); +} + +Error __must_check backward(Argument& act) { + denominator_->square2(); + denominator_->scalarDiv(*denominator_, 1.); + act.grad->dotMul(*act.grad, *denominator_); + return Error(); +} +END_DEFINE_ACTIVATION(softsign) + +/** + * @brief Relu Activation. + * forward. y = max(0, z) + * + * derivative of relu is: + * + * 1 if z > 0 + * + * 0 otherwise. + */ +BEGIN_DEFINE_ACTIVATION(relu) +Error __must_check forward(Argument& act) { + act.value->relu(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->reluDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(relu) + +/** + * @brief BRelu Activation. + * + * forward. y = min(24, max(0, z)) + * + * derivative of brelu is: + * + * 1 if 0 < z < 24 + * + * 0 otherwise. + * + * TODO(yuyang18): Remove magic number 24 or make it configuable. + */ +BEGIN_DEFINE_ACTIVATION(brelu) +Error __must_check forward(Argument& act) { + act.value->brelu(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->breluDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(brelu) + +/** + * @brief Tanh Activation. 
+ * \f[ + * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}} + * \f] + */ +BEGIN_DEFINE_ACTIVATION(tanh) +Error __must_check forward(Argument& act) { + act.value->tanh(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->tanhDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(tanh) + +/** + * @brief Scaled Tanh Activation + * \f[ + * f(z) = 1.7159 * tanh(2/3*z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(stanh) +private: +real a, b; + +public: +ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {} +Error __must_check forward(Argument& act) { + act.value->scaledTanh(*act.value, a, b); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->scaledTanhDerivative(*act.value, a, b); + return Error(); +} +END_DEFINE_ACTIVATION(stanh) + +/** + * @brief Soft Relu Activation. + * \f[ + * f(z) = ln(1+e^z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(softrelu) +Error __must_check forward(Argument& act) { + act.value->softrelu(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->softreluDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(softrelu) + +/** + * @brief Abs Activation. + * Forward: f(z) = abs(z) + * + * Derivative: + * + * 1 if z>0 + * + * -1 if z<0 + * + * 0 if z=0 + */ +BEGIN_DEFINE_ACTIVATION(abs) +Error __must_check forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->abs2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->absDerivative(*act.in); + return Error(); +} +END_DEFINE_ACTIVATION(abs) + +/** + * @brief Square Activation. + * \f[ + * f(z) = z^2. + * \f] + */ +BEGIN_DEFINE_ACTIVATION(square) +Error __must_check forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->square2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->squareDerivative(*act.in); + return Error(); +} +END_DEFINE_ACTIVATION(square) + +/** + * @brief Exponential Activation. + * \f[ + * f(z) = e^z + * \f] + */ +BEGIN_DEFINE_ACTIVATION(exponential) +Error __must_check forward(Argument& act) { + act.value->exp2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->expDerivative(*act.value); + return Error(); +} +END_DEFINE_ACTIVATION(exponential) + +/** + * @brief Reciprocal Activation. + * \f[ + * f(z) = 1/z + * \f] + */ +BEGIN_DEFINE_ACTIVATION(reciprocal) +Error __must_check forward(Argument& act) { + act.value->reciprocal2(); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->dotMulSquare(*act.value); + act.grad->neg(); + return Error(); +} +END_DEFINE_ACTIVATION(reciprocal) + +/** + * @brief Square Root Activation. + * \f[ + * f(z) = sqrt(z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(sqrt) +Error __must_check forward(Argument& act) { + act.value->sqrt2(); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->dotDiv(*act.grad, *act.value); + act.grad->mulScalar(0.5); + return Error(); +} +END_DEFINE_ACTIVATION(sqrt) + +/** + * @brief Logarithm Activation. 
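Aside: several activations in this file (abs, square, and log below) overwrite act.value in place, so forward first copies the input into act.in and backward evaluates the derivative at that saved input. A minimal sketch of the pattern for square, with std::vector standing in for act.in / act.value / act.grad:

```cpp
#include <cstddef>
#include <vector>

// "Save the input for backward" pattern, shown for f(x) = x^2.
struct SquareActivation {
  std::vector<float> savedInput;                  // plays the role of act.in

  void forward(std::vector<float>& value) {
    savedInput = value;                           // copyFrom(*act.value)
    for (float& v : value) v = v * v;             // square2 in place
  }
  void backward(std::vector<float>& grad) const {
    for (std::size_t i = 0; i < grad.size(); ++i)
      grad[i] *= 2.f * savedInput[i];             // squareDerivative(*act.in)
  }
};
```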
+ * \f[ + * f(z) = log(z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(log) +Error __must_check forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->log2(*act.value); + return Error(); +} + +Error __must_check backward(Argument& act) { + act.grad->dotDiv(*act.grad, *act.in); + return Error(); +} +END_DEFINE_ACTIVATION(log) + +ActivationFunction* ActivationFunction::create(const std::string& type) { +#ifdef PADDLE_WITH_MKLDNN + if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { + return MKLDNNActivation::create(type); + } +#endif + + return gActivationRegistrar.createByType(type); +} + +std::vector ActivationFunction::getAllRegisteredTypes() { + std::vector types; + gActivationRegistrar.forEachType( + [&](const std::string& type) { types.push_back(type); }); + return types; +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/activations/ActivationFunction.h b/paddle/legacy/gserver/activations/ActivationFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..8bc5b0f529a6358fba8b6c9d1e1f6ee2358dbbf9 --- /dev/null +++ b/paddle/legacy/gserver/activations/ActivationFunction.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/legacy/utils/Error.h" + +namespace paddle { + +struct Argument; +/** + * @brief Activation function is a function that transforms a set of input + * signals into an output signals. The purpose of the activation function + * is to introduce non-liearilty into the network. + * + * @note Common activation function are provieded, including linear, + * sigmoid, softmax, sequence_max, relu, brelu, tanh, stanh, + * softrelu, abs, square, exponential. + * + */ +class ActivationFunction { + public: + static ActivationFunction* create(const std::string& type); + static std::vector getAllRegisteredTypes(); + + ActivationFunction() {} + + virtual ~ActivationFunction() {} + + /** + * @brief Foward propagation + * + * act.value <- f(act.value), + * where f is the activation function. + * Suppose that before calling forward(), act.value is x and + * after forward() is called, act.value is y, then y = f(x). + * + * Usually, act is Layer::output_ + */ + virtual Error __must_check forward(Argument& act) = 0; + + /** + * @brief Backward propagaion + * + * x and y are defined in the above comment for forward(). 
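Aside: ActivationFunction::create above dispatches on the type name: anything prefixed with "mkldnn_" is routed to the MKL-DNN registrar, everything else to gActivationRegistrar. A rough sketch of that two-level lookup with plain std::map factories standing in for ClassRegistrar; the types and names here are illustrative, not Paddle's.

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Activation { virtual ~Activation() = default; };
using Factory = std::function<std::unique_ptr<Activation>()>;

// Route "mkldnn_*" names to one registry, everything else to the default one.
std::unique_ptr<Activation> createActivation(
    const std::string& type,
    const std::map<std::string, Factory>& defaultRegistry,
    const std::map<std::string, Factory>& mkldnnRegistry) {
  const auto& registry =
      (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) ? mkldnnRegistry
                                                            : defaultRegistry;
  auto it = registry.find(type);
  if (it == registry.end()) return nullptr;
  return it->second();
}
```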
+ * - Before calling backward(), act.grad = dE / dy, where E is the error/cost + * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx) + */ + virtual Error __must_check backward(Argument& act) = 0; + + virtual const std::string& getName() const = 0; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.cpp b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2eed7af70a8a3cc305a79bbe23177ea71d15d252 --- /dev/null +++ b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp @@ -0,0 +1,249 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNActivation.h" +#include "mkldnn.hpp" +#include "paddle/legacy/utils/ClassRegistrar.h" + +namespace paddle { + +static ClassRegistrar gMKLDNNActivationRegistrar; +/** + * @def MKLDNN_ACTIVATION_CLASS_NAME + * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_; + * means mkldnn_reluActivation relu_; + */ +#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation + +/** + * @def BEGIN_MKLDNN_ACTIVATION + */ +#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ + class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS { +/** + * @def END_MKLDNN_ACTIVATION + */ +#define END_MKLDNN_ACTIVATION(ACT_TYPE) \ + private: \ + static const std::string name; \ + \ + public: \ + const std::string& getName() const { return name; } \ + } \ + ; \ + const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \ + "mkldnn_" #ACT_TYPE; \ + static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] { \ + gMKLDNNActivationRegistrar \ + .registerClass( \ + "mkldnn_" #ACT_TYPE); \ + }); + +/** + * @def DEFINE_MKLDNN_ACTIVATION + */ +#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ + BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ + END_MKLDNN_ACTIVATION(ACT_TYPE) + +/** + * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION + */ +#define DEFINE_MKLDNN_ELTWISE_ACTIVATION( \ + ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA) \ + BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \ + private: \ + static const float alpha; \ + static const float bwdAlpha; \ + \ + public: \ + float getAlpha() const { return alpha; } \ + float getBwdAlpha() const { return bwdAlpha; } \ + END_MKLDNN_ACTIVATION(ACT_TYPE) \ + const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \ + const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; + +/** + * @brief MKLDNN Relu Activation. + * Actually mkldnn_relu is Leaky Relu. + * f(x) = x (x >= 0) + * f(x) = negative_slope * x (x < 0) + * @note the negative_slope should be -0.f in forward + */ +DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f) + +/** + * @brief MKLDNN Tanh Activation. + */ +DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f) + +/** + * @brief MKLDNN ELU(Exponential Linear Unit) Activation. 
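Aside: the forward/backward contract documented above is purely in place: forward turns x into y inside act.value, and backward turns dE/dy into dE/dx inside act.grad. A small sketch for sigmoid, where dy/dx = y(1 - y) can be computed from the stored output alone, which is exactly why the sigmoid backward earlier only needs act.value:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// forward: value holds x on entry and y = f(x) on exit.
void sigmoidForward(std::vector<float>& value) {
  for (float& v : value) v = 1.f / (1.f + std::exp(-v));
}

// backward: grad holds dE/dy on entry and dE/dx = dE/dy * y * (1 - y) on exit.
void sigmoidBackward(const std::vector<float>& value /* = y */,
                     std::vector<float>& grad) {
  for (std::size_t i = 0; i < grad.size(); ++i)
    grad[i] *= value[i] * (1.f - value[i]);
}
```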
+ * f(x) = x (x >= 0) + * f(x) = negative_slope * (exp(x) - 1) (x < 0) + */ +DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f) + +mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const { + const std::map algoMap = { + {"relu", algorithm::eltwise_relu}, + {"tanh", algorithm::eltwise_tanh}, + {"elu", algorithm::eltwise_elu}}; + type.erase(0, 7); // remove mkldnn_ + algorithm algo = (algorithm)0; + mapGet(type, algoMap, &algo); + return algo; +} + +void MKLDNNEltwiseActivation::resetFwd(Argument& act) { + if (cnt_ == act.value->getElementCnt()) { + return; + } + MKLDNNActivation::resetFwd(act); + // note: alpha represents the NegativeSlope when used in relu. + float alpha = getAlpha(); + float beta = getBeta(); + algorithm algo = getAlgo(this->getName()); + auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training, + algo, + val_->getMemoryDesc(), + alpha, + beta); + fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_)); + // use inplace for forward but save input value before submit + inVal_ = val_; + copyInVal_ = nullptr; + if (act.grad && algo == algorithm::eltwise_tanh) { + // tanh need save src input for backward + inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc()); + copyInVal_ = std::make_shared(*val_, *inVal_); + CHECK(copyInVal_) << "should not be emptry"; + pipelineFwd_.push_back(*copyInVal_); + } + fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_)); + pipelineFwd_.push_back(*fwd_); + needResetBwd_ = true; +} + +void MKLDNNEltwiseActivation::resetBwd(Argument& act) { + if (!needResetBwd_) { + return; + } + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; + needResetBwd_ = false; + algorithm algo = getAlgo(this->getName()); + float alpha = getBwdAlpha(); + float beta = getBeta(); + grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad); + auto eng = CPUEngine::Instance().getEngine(); + auto bwdDesc = eltwise_bwd::desc( + algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta); + auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_); + CHECK(inVal_); + bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_)); + pipelineBwd_.clear(); + pipelineBwd_.push_back(*bwd_); +} + +/** + * @brief MKLDNN Softmax Activation + */ +DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation) + +void MKLDNNSoftmaxActivation::resetFwd(Argument& act) { + if (cnt_ == act.value->getElementCnt()) { + return; + } + MKLDNNActivation::resetFwd(act); + int axis = 1; + auto fwdDesc = softmax_fwd::desc( + mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis); + auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_); + fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_)); + pipelineFwd_.push_back(*fwd_); +} + +Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) { + resetFwd(act); + stream_->submit(pipelineFwd_); + real* v = act.value->getData(); + real threshold = exp(-64); +#pragma omp parallel for + for (size_t i = 0; i < act.value->getElementCnt(); ++i) { + v[i] = v[i] < threshold ? 
threshold : v[i]; + } + return Error(); +} + +Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) { + MatrixPtr outputV = act.value; + MatrixPtr outputG = act.grad; + Matrix::resizeOrCreate(sftMaxDot_, + outputG->getHeight(), + outputG->getWidth(), + /* trans */ false, + /* useGpu */ false); + Matrix::resizeOrCreate(sftMaxSum_, + outputG->getHeight(), + 1, + /* trans */ false, + /* useGpu */ false); + sftMaxDot_->dotMul(*outputG, *outputV); + sftMaxSum_->colMerge(*sftMaxDot_); + act.grad->softmaxDerivative(*act.value, *sftMaxSum_); + return Error(); +} + +ActivationFunction* MKLDNNActivation::create(const std::string& type) { + return gMKLDNNActivationRegistrar.createByType(type); +} + +std::vector MKLDNNActivation::getAllRegisteredTypes() { + std::vector types; + gMKLDNNActivationRegistrar.forEachType( + [&](const std::string& type) { types.push_back(type); }); + return types; +} + +void MKLDNNActivation::resetFwd(Argument& act) { + VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; + cnt_ = act.value->getElementCnt(); + pipelineFwd_.clear(); + stream_.reset(new MKLDNNStream()); + engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0)); + val_ = std::dynamic_pointer_cast(act.value); + if (val_ == nullptr) { + int bs = act.getBatchSize(); + int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1; + int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1; + int ic = cnt_ / bs / ih / iw; + CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw); + val_ = MKLDNNMatrix::create( + {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value); + CHECK(val_); + val_->downSpatial(); + } +} + +Error __must_check MKLDNNActivation::forward(Argument& act) { + resetFwd(act); + stream_->submit(pipelineFwd_); + return Error(); +} +Error __must_check MKLDNNActivation::backward(Argument& act) { + resetBwd(act); + stream_->submit(pipelineBwd_); + return Error(); +} +} // namespace paddle diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.h b/paddle/legacy/gserver/activations/MKLDNNActivation.h new file mode 100644 index 0000000000000000000000000000000000000000..59c447ad07398c0b6ca7d78766dd533963744d1b --- /dev/null +++ b/paddle/legacy/gserver/activations/MKLDNNActivation.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "ActivationFunction.h" +#include "mkldnn.hpp" +#include "paddle/legacy/gserver/layers/MKLDNNBase.h" +#include "paddle/legacy/math/MKLDNNMatrix.h" +#include "paddle/legacy/parameter/Argument.h" + +namespace paddle { + +/** + * @brief Base class of MKLDNN Activation. + * Common activation function are provieded, + * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax + */ +class MKLDNNActivation : public ActivationFunction { + protected: + // input value element count + size_t cnt_; + // should not merge the resetBwd into resetFwd, + // because the grad data would be changing before backward. 
+ bool needResetBwd_; + // mkldnn matrix, primitive, stream and pipeline + MKLDNNMatrixPtr val_; + MKLDNNMatrixPtr grad_; + std::shared_ptr engine_; + std::shared_ptr stream_; + std::shared_ptr fwd_; + std::shared_ptr bwd_; + std::vector pipelineFwd_; + std::vector pipelineBwd_; + + public: + MKLDNNActivation() : cnt_(0), needResetBwd_(true) {} + ~MKLDNNActivation() {} + static ActivationFunction* create(const std::string& type); + static std::vector getAllRegisteredTypes(); + virtual const std::string& getName() const = 0; + /** + * reset the forward primitives + */ + virtual void resetFwd(Argument& act); + /** + * reset the backward primitives, + * can not merge this functions into resetFwd as the grad data + * would be changing before backward. + */ + virtual void resetBwd(Argument& act) {} + virtual Error __must_check forward(Argument& act); + virtual Error __must_check backward(Argument& act); +}; + +/** + * @brief Base class of MKLDNN Eltwise Activation, + * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh. + */ +class MKLDNNEltwiseActivation : public MKLDNNActivation { + typedef mkldnn::eltwise_forward eltwise_fwd; + typedef mkldnn::eltwise_backward eltwise_bwd; + typedef mkldnn::algorithm algorithm; + + protected: + // save the forward primitive desc, which can be used backward + std::shared_ptr fwdPD_; + // eltwise_bwd need src input value + MKLDNNMatrixPtr inVal_; + // use for copy data + std::shared_ptr copyInVal_; + + public: + MKLDNNEltwiseActivation() {} + ~MKLDNNEltwiseActivation() {} + virtual const std::string& getName() const = 0; + + // in common, the alpha of forward and backward should be equal. + // but for relu, to avoid negative value, they should be opposite + virtual float getAlpha() const = 0; + virtual float getBwdAlpha() const = 0; + virtual float getBeta() const { return 0.f; } + virtual algorithm getAlgo(std::string type) const; + void resetFwd(Argument& act) override; + void resetBwd(Argument& act) override; +}; + +/** + * @brief Base class of MKLDNN softmax Activation, + * only have mkldnn forward, use cpu implement for backward. + */ +class MKLDNNSoftmaxActivation : public MKLDNNActivation { + typedef mkldnn::softmax_forward softmax_fwd; + + private: + // for backward + MatrixPtr sftMaxSum_; + MatrixPtr sftMaxDot_; + + public: + MKLDNNSoftmaxActivation() {} + ~MKLDNNSoftmaxActivation() {} + virtual const std::string& getName() const = 0; + void resetFwd(Argument& act) override; + Error __must_check forward(Argument& act) override; + Error __must_check backward(Argument& act) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.cpp b/paddle/legacy/gserver/dataproviders/DataProvider.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b67af8a326bdfd211ee5720bf67828040b19e5c1 --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/DataProvider.cpp @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
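Aside: DEFINE_MKLDNN_ELTWISE_ACTIVATION threads a single (ALPHA, BWD_ALPHA) pair into the MKL-DNN eltwise primitive; for mkldnn_relu that scalar is the leaky-ReLU negative slope (with slope 0 it reduces to an ordinary ReLU, and as the comment above notes the forward and backward values carry opposite signs for relu). A plain-loop sketch of what the slope means, not the MKL-DNN primitive itself:

```cpp
#include <cstddef>
#include <vector>

// Leaky ReLU: f(x) = x for x >= 0, f(x) = negativeSlope * x otherwise.
void leakyReluForward(std::vector<float>& value, float negativeSlope) {
  for (float& v : value) v = v >= 0.f ? v : negativeSlope * v;
}

// Derivative is 1 for positive activations and negativeSlope otherwise; for a
// non-negative slope the sign of the stored output matches the sign of the input.
void leakyReluBackward(const std::vector<float>& output,
                       std::vector<float>& grad, float negativeSlope) {
  for (std::size_t i = 0; i < grad.size(); ++i)
    grad[i] *= output[i] > 0.f ? 1.f : negativeSlope;
}
```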
*/ + +#include "DataProvider.h" + +#include +#include +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/StringUtil.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +void BufferBatch::swap(BufferBatch* bufBatch) { + DataBatch* batchData = bufBatch->getDataBatch(); + hl_event_t hlEvent = bufBatch->getCuEvent(); + hl_stream_t hlStream = bufBatch->getCuStream(); + bufBatch->setDataBatch(batchData_); + bufBatch->setCuStream(hlStream_); + bufBatch->setCuEvent(hlEvent_); + + batchData_ = batchData; + hlEvent_ = hlEvent; + hlStream_ = hlStream; +} + +void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) { + if (batchData_ == NULL) { + batchData_ = new DataBatch(); + } + std::vector& destData = batchData_->getStreams(); + int numStreams = srcBatch->getNumStreams(); + destData.resize(numStreams); + batchData_->setSize(srcBatch->getSize()); + if (useGpu) { + createCuEvent(); + } + + for (int i = 0; i < numStreams; i++) { + destData[i].resizeAndCopyFrom(srcBatch->getStream(i), useGpu, hlStream_); + } + if (useGpu) { + hl_stream_record_event(hlStream_, hlEvent_); + } +} + +DoubleBuffer::DoubleBuffer(DataProvider* dataPool, + bool useGpu, + int64_t batchSize) { + batchSize_ = batchSize; + dataPool_ = dataPool; + useGpu_ = useGpu; + dataQueue_ = new BufferBatchQueue(); + bufferQueue_ = new BufferBatchQueue(); + + // insert a empty buffer + bufferQueue_->enqueue(new BufferBatch()); + stopping_ = false; + pending_ = true; +} + +DoubleBuffer::~DoubleBuffer() { + finishAsyncLoad(); + while (dataQueue_->size()) { + BufferBatch* dataBtch = dataQueue_->dequeue(); + delete dataBtch; + dataBtch = NULL; + } + while (bufferQueue_->size()) { + BufferBatch* bufBtch = bufferQueue_->dequeue(); + delete bufBtch; + bufBtch = NULL; + } + delete dataQueue_; + dataQueue_ = NULL; + delete bufferQueue_; + bufferQueue_ = NULL; +} + +void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) { + // get data + BufferBatch* batch = dataQueue_->dequeue(); + batch->syncEvent(); // when use GPU, need synchronized with the cuEvent + *dataBatch = *(batch->getDataBatch()); + + // push anothor buffer + if (*usingBatch_ == nullptr) { + *usingBatch_ = std::make_shared(); + } + + // Mark the using-batch + batch->swap((*usingBatch_).get()); + bufferQueue_->enqueue(batch); + + if (0 == dataBatch->getSize()) { + setPending(true); + } +} + +void DoubleBuffer::insertOneBatch(DataBatch* batch) { + while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out + if (stopping_) return; + } + BufferBatch* bufBatch = bufferQueue_->dequeue(); + // clone and copy the data from an Threadlocal Variable + bufBatch->clone(batch, useGpu_); + dataQueue_->enqueue(bufBatch); +} + +void DoubleBuffer::asyncLoadBatch() { + int64_t actualSize = 0; + if (useGpu_) { + hl_set_device(FLAGS_gpu_id); + } + setPending(false); + + while (true) { + taskReadySem_.wait(); + if (stopping_) break; + + while (batchSize_ == 0 && !stopping_) { + usleep(5); + } + if (stopping_) break; + + do { + DataBatch newBatch; + { + REGISTER_TIMER("getNextBatchInternal"); + actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch); + } + insertOneBatch(&newBatch); + } while (actualSize > 0 && !stopping_); + } +} + +void DoubleBuffer::startAsyncLoad() { + if (asyncLoader_ == nullptr) { + asyncLoader_.reset(new std::thread([this]() { this->asyncLoadBatch(); })); + } + taskReadySem_.post(); +} + +ClassRegistrar + DataProvider::registrar_; + +DataProvider* DataProvider::create(const DataConfig& 
config, + const ModelConfig& modelConfig, + bool useGpu) { + return registrar_.createByType(config.type(), config, modelConfig, useGpu); +} + +REGISTER_DATA_PROVIDER(simple, SimpleDataProvider); +REGISTER_DATA_PROVIDER(dummy, DummyDataProvider); + +int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) { + int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch) + : getNextBatchInternal(size, batch); + + if (!batchSize) return 0; + + if (!config_.constant_slots_size()) return batchSize; + + auto& constantSlots = *constantSlots_; + constantSlots.resize(config_.constant_slots_size()); + + for (int i = 0; i < config_.constant_slots_size(); ++i) { + MemoryHandlePtr handle = + constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr; + Matrix::resizeOrCreate(constantSlots[i], + batchSize, + 1, // = width + false, // = trans + useGpu_); // = useGpu + if (handle != constantSlots[i]->getMemoryHandle()) { + // memory buf was reallocated. We need to initialize the value + constantSlots[i]->assign(config_.constant_slots(i)); + } + batch->appendData(constantSlots[i], + batch->getStream(0).sequenceStartPositions); + } + + return batchSize; +} + +int64_t DataProvider::getNextBatchFromBuffer(int64_t size, DataBatch* batch) { + CHECK(doubleBuffer_ != nullptr); + + if (doubleBuffer_->getBatchSize() != size) { + doubleBuffer_->setBatchSize(size); + } + + doubleBuffer_->removeOneBatch(batch); + return batch->getSize(); +} + +void DataProvider::initAsyncLoader() { + if (doubleBuffer_ == nullptr) { + doubleBuffer_.reset(new DoubleBuffer(this, useGpu_)); + } + useGpu_ = false; // Avoid D2D copy, it will delay the computing performance +} + +SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config, + bool useGpu, + bool withInfo) + : DataProvider(config, useGpu) { + /* initialize the size of a sample, and the buffer */ + sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1); + bufferCapacity_ = config_.buffer_capacity(); + withInfo_ = withInfo; + sampleNumInBuf_ = 0; + nextItemIndex_ = 0; + + /* malloc buffer in cpu */ + hInputDataBuf_ = std::make_shared(bufferCapacity_, sampleDim_); + hInputLabelBuf_ = std::make_shared(bufferCapacity_); + hInputInfoBuf_ = std::make_shared(bufferCapacity_); +} + +void SimpleDataProviderBase::shuffle() { + int i, t; + int len = sampleNumInBuf_; + std::vector temp(sampleDim_); + real* data = hInputDataBuf_->getData(); + int* label = hInputLabelBuf_->getData(); + int* info = hInputInfoBuf_->getData(); + int sampleSz = sizeof(real) * sampleDim_; + for (i = 0; i < len; i++) { + int randNum = rand(); // NOLINT TODO(yuyang18): Use rand_r instead? 
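Aside: SimpleDataProviderBase::shuffle above permutes rows of a flat [len x sampleDim] host buffer together with the parallel label/info arrays, swapping one row at a time. A sketch of the same Fisher-Yates-style pass using <random> in place of the rand() call flagged in the TODO; info handling is omitted.

```cpp
#include <algorithm>
#include <random>
#include <utility>
#include <vector>

// Shuffle samples stored row-major in `data` (sampleDim floats per row) together
// with their labels, keeping row i of data aligned with label[i].
void shuffleRows(std::vector<float>& data, std::vector<int>& label, int sampleDim) {
  std::mt19937 rng(std::random_device{}());
  const int len = static_cast<int>(label.size());
  for (int i = 0; i < len - 1; ++i) {
    std::uniform_int_distribution<int> pick(i, len - 1);
    const int t = pick(rng);
    if (t != i) {
      std::swap_ranges(data.begin() + i * sampleDim,
                       data.begin() + (i + 1) * sampleDim,
                       data.begin() + t * sampleDim);
      std::swap(label[i], label[t]);
    }
  }
}
```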
+ t = randNum % (len - i) + i; + // swap + if (i != t) { + // swap data + memcpy(&temp[0], &data[i * sampleDim_], sampleSz); + memcpy(&data[i * sampleDim_], &data[t * sampleDim_], sampleSz); + memcpy(&data[t * sampleDim_], &temp[0], sampleSz); + std::swap(label[i], label[t]); + if (withInfo_) { + std::swap(info[i], info[t]); + } + } + } +} + +int64_t SimpleDataProviderBase::getNextBatchInternal(int64_t size, + DataBatch* batch) { + CHECK(batch != NULL); + batch->clear(); + + int64_t startIndex; + int64_t cpySize; + + std::lock_guard guard(lock_); + if (sampleNumInBuf_ - nextItemIndex_ < size) { + int64_t n = fillBuffer(); + VLOG(1) << "fillBuffer return " << n << " samples.\n"; + } + + startIndex = nextItemIndex_; + cpySize = std::min(size, sampleNumInBuf_ - nextItemIndex_); + nextItemIndex_ += cpySize; + + if (cpySize > 0) { + real* data = hInputDataBuf_->getData() + startIndex * sampleDim_; + int* label = hInputLabelBuf_->getData() + startIndex; + int* info = hInputInfoBuf_->getData() + startIndex; + + MatrixPtr& dataBatch = *dataBatch_; // get the thread local object + IVectorPtr& labelBatch = *labelBatch_; // get the thread local object + IVectorPtr& infoBatch = *infoBatch_; // get the thread local object + if (!dataBatch) { + dataBatch = Matrix::create(cpySize, sampleDim_, false, useGpu_); + labelBatch = IVector::create(cpySize, useGpu_); + if (withInfo_) { + infoBatch = IVector::create(cpySize, 0); + } + } else { + dataBatch->resize(cpySize, sampleDim_); + labelBatch->resize(cpySize); + if (withInfo_) { + infoBatch->resize(cpySize); + } + } + dataBatch->copyFrom(data, cpySize * sampleDim_); + labelBatch->copyFrom(label, cpySize); + batch->appendData(dataBatch); + batch->appendLabel(labelBatch); + if (withInfo_) { + infoBatch->copyFrom(info, cpySize); + batch->appendLabel(infoBatch); + } + } + + batch->setSize(cpySize); + return cpySize; +} + +void SimpleDataProviderBase::reset() { + sampleNumInBuf_ = 0; + nextItemIndex_ = 0; + DataProvider::reset(); +} + +int64_t SimpleDataProviderBase::getSize() { + LOG(FATAL) << "Currently, not implemented"; + return 0; +} + +int64_t SimpleDataProviderBase::fillBuffer() { + int64_t n = sampleNumInBuf_ - nextItemIndex_; + + /* flash the remaining data to the beginning of the buffer */ + if (n > 0) { + hInputDataBuf_->copyFrom( + hInputDataBuf_->getData() + nextItemIndex_ * sampleDim_, + n * sampleDim_); + hInputLabelBuf_->copyFrom(hInputLabelBuf_->getData() + nextItemIndex_, n); + if (withInfo_) { + hInputInfoBuf_->copyFrom(hInputInfoBuf_->getData() + nextItemIndex_, n); + } + } + + sampleNumInBuf_ = + n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_, + hInputLabelBuf_->getData() + n, + hInputInfoBuf_->getData() + n, + bufferCapacity_ - n); + + /* for stachastic gradient training */ + if (!skipShuffle_) { + shuffle(); + } + + nextItemIndex_ = 0; + + return sampleNumInBuf_; +} + +SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu) + : SimpleDataProviderBase(config, useGpu, /* withInfo= */ false), + currentSampleIndex_(0) { + loadData(config_.files()); +} + +SimpleDataProvider::~SimpleDataProvider() {} + +int64_t SimpleDataProvider::fillBufferImp(real* data, + int* label, + int* info, + int64_t size) { + (void)info; + int64_t n = std::min(labels_.size() - currentSampleIndex_, size); + memcpy(data, + &data_[currentSampleIndex_ * sampleDim_], + n * sampleDim_ * sizeof(real)); + memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n); + currentSampleIndex_ += n; + + return n; +} + +void 
SimpleDataProvider::reset() { + currentSampleIndex_ = 0; + SimpleDataProviderBase::reset(); +} + +void SimpleDataProvider::loadData(const std::string& fileName) { + std::ifstream is(fileName); + CHECK(is) << "Fail to open " << fileName; + std::string line; + while (is) { + if (!getline(is, line)) break; + LOG(INFO) << "load data file " << line; + loadDataFile(line); + } + LOG(INFO) << "read done, num of instance=" << labels_.size() + << " data size=" << data_.size(); +} + +void SimpleDataProvider::loadDataFile(const std::string& fileName) { + std::ifstream is(fileName); + std::string line; + std::vector pieces; + while (is) { + if (!getline(is, line)) break; + str::split(line, ' ', &pieces); + CHECK_EQ((uint64_t)(sampleDim_ + 1), pieces.size()) + << " Dimension mismatch, " << pieces.size() - 1 << " in " << fileName + << " " << sampleDim_ << " from config"; + labels_.push_back(atoi(pieces[0].c_str())); + for (int i = 0; i < sampleDim_; ++i) { + data_.push_back(atof(pieces[i + 1].c_str())); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.h b/paddle/legacy/gserver/dataproviders/DataProvider.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e1c5fdd6d504b77873aaeeba3611dff6d8f738 --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/DataProvider.h @@ -0,0 +1,480 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "DataConfig.pb.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/legacy/utils/ClassRegistrar.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Queue.h" +#include "paddle/legacy/utils/ThreadLocal.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { +/** + * @def REGISTER_DATA_PROVIDER + * @brief Macro for registering a data provider. The class type should contain + * a consturctor with parameter (DataConfig, bool). + */ +#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + DataProvider::registrar_.registerClass( \ + #__type_name, \ + [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ + DataProvider* dp = new __class_name(conf, useGpu); \ + return dp; \ + }); \ + }) + +/** + * @def REGISTER_DATA_PROVIDER_EX + * @brief Macro for registering a data provider, which contains a constructor + * with parameter (DataConfig, ModelConfig, bool). 
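Aside: SimpleDataProvider::loadDataFile above expects one sample per line, "label feat_1 ... feat_D", and checks that every line carries exactly sampleDim_ features. A sketch of the same parsing with iostreams instead of str::split/atoi/atof; the function name and error handling here are illustrative only.

```cpp
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Parse a whitespace-separated text file: first column is the integer label,
// followed by exactly sampleDim float features per line.
bool loadPlainTextFile(const std::string& path, int sampleDim,
                       std::vector<int>& labels, std::vector<float>& data) {
  std::ifstream is(path);
  if (!is) return false;
  std::string line;
  while (std::getline(is, line)) {
    std::istringstream ss(line);
    int label = 0;
    if (!(ss >> label)) return false;          // first column: the label
    labels.push_back(label);
    for (int i = 0; i < sampleDim; ++i) {      // then exactly sampleDim features
      float v = 0.f;
      if (!(ss >> v)) return false;            // dimension mismatch
      data.push_back(v);
    }
  }
  return true;
}
```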
+ */ +#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([] { \ + DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ + }) + +class DataBatch; +class BufferBatch; +typedef std::shared_ptr DataBatchPtr; +typedef std::shared_ptr BufferBatchPtr; +/** + * @brief Data for batch training a neural network + */ +class DataBatch { + public: + DataBatch() : size_(0) { data_.clear(); } + /** + * @brief Get batch size + * @return batch size + */ + int64_t getSize() const { return size_; } + /** + * @brief Get num of sequences of sequence data + * @return num of sequences + */ + int64_t getNumSequences() const { + if (data_.empty()) return size_; + return data_[0].sequenceStartPositions + ? data_[0].sequenceStartPositions->getSize() - 1 + : size_; + } + /** + * @brief Set batch size + * @param[in] size size + */ + void setSize(int64_t size) { size_ = size; } + /** + * @brief Get size of argument vector + * @return size of argument vector + * @note For usual supervised learning, input data and label is needed, + * then there will be two argument. + */ + int64_t getNumStreams() const { return data_.size(); } + + /** + * @brief Get a argument with index i + * @param[in] i index in argument vector + * @return a argument with index i + */ + const Argument& getStream(int i) const { return data_[i]; } + /** + * @brief Get all argument + * @return an argument vector + */ + std::vector& getStreams() { return data_; } + /** + * @brief Get all argument const + * @return an argument vector + */ + std::vector getStreams() const { return data_; } + /** + * @brief Clear DataBatch + */ + void clear() { + data_.clear(); + size_ = 0; + } + + /** + * @brief Append data to DataBatch + * @param[in] data matrix data + * @note The order in which each data stream is appended must match the order + * specified in stream_names of DataConfig. The stream_names can be obtained + * using DataProvider::getStreamNames(). + */ + void appendData(MatrixPtr data) { + Argument argu; + argu.value = data; + data_.push_back(argu); + } + + /** + * @brief Append sequence data to DataBatch + * @param[in] data matrix data + * @param[in] sequenceStartPositions sequence data + * @note The order in which each data stream is appended must match the order + * specified in stream_names of DataConfig. The stream_names can be obtained + * using DataProvider::getStreamNames(). + */ + void appendData(const MatrixPtr& data, + const ICpuGpuVectorPtr& sequenceStartPositions) { + Argument argu; + argu.value = data; + argu.sequenceStartPositions = sequenceStartPositions; + data_.push_back(argu); + } + /** + * @brief Append label data + * @param[in] label label data + * @param[in] value matrix data, default null + */ + void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) { + Argument argu; + argu.ids = label; + argu.value = value; + data_.push_back(argu); + } + + /* + * @brief Append argument + * @param[in] argus DataBatch.getStreams() + * @param[in] size DataBatch.getSize() + * @param[in] dataId sub dataprovider id (in MultiDataProvider) + */ + void appendArguments(const std::vector& argus, + int size, + int dataId) { + size_ += size; + for (const auto& argu : argus) { + data_.push_back(argu); + data_.back().dataId = dataId; + } + } + + protected: + /** + * @brief batch size + */ + int64_t size_; + /** + * @brief A batch data consist of a Argument vector, + * An argument corresponds to a type of input data. 
+ */ + std::vector data_; +}; + +class BufferBatch { + public: + BufferBatch() { + hlStream_ = HPPL_STREAM_DEFAULT; + hlEvent_ = NULL; + batchData_ = NULL; + } + ~BufferBatch() { + if (hlEvent_) { + hl_destroy_event(hlEvent_); + hlEvent_ = NULL; + } + delete batchData_; + batchData_ = NULL; + } + + void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } + DataBatch* getDataBatch() { return batchData_; } + + void setCuStream(hl_stream_t stream) { hlStream_ = stream; } + hl_stream_t getCuStream() const { return hlStream_; } + + void setCuEvent(hl_event_t event) { hlEvent_ = event; } + + hl_event_t getCuEvent() const { return hlEvent_; } + + void createCuEvent() { + if (!hlEvent_) { + hlStream_ = HPPL_STREAM_1; + hl_create_event(&hlEvent_); + } + } + + void syncEvent() { + if (hlEvent_) { + hl_stream_wait_event(hlStream_, hlEvent_); + } + } + + void swap(BufferBatch* bufBatch); + void clone(DataBatch* srcBatch, bool useGpu); + + protected: + DataBatch* batchData_; + hl_stream_t hlStream_; + hl_event_t hlEvent_; +}; + +class DataProvider; +typedef std::shared_ptr DataProviderPtr; + +typedef Queue BufferBatchQueue; + +class DoubleBuffer { + public: + DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); + virtual ~DoubleBuffer(); + void removeOneBatch(DataBatch* dataBatch); + + void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; } + + int64_t getBatchSize() { return batchSize_; } + + void startAsyncLoad(); + void finishAsyncLoad() { + stopping_ = true; + taskReadySem_.post(); + if (asyncLoader_) { + asyncLoader_->join(); + } + } + + void setPending(bool pending) { pending_ = pending; } + + protected: + virtual void asyncLoadBatch(); + void insertOneBatch(DataBatch* batch); + + DataProvider* dataPool_; + bool useGpu_; + int32_t batchSize_; + ThreadLocal usingBatch_; + BufferBatchQueue* dataQueue_; + BufferBatchQueue* bufferQueue_; + std::unique_ptr asyncLoader_; + Semaphore taskReadySem_; + bool stopping_; + bool pending_; +}; + +/** + * @brief Base class for DataProvider, which supplies data for training + * @note It can supplies multiple streams of data. + * For typical supervised training, there are two streams: + * one is for input, one is for label. + */ +class DataProvider { + public: + static ClassRegistrar registrar_; + static DataProvider* create(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu = FLAGS_use_gpu); + + /** + * @brief create only used for unittest. 
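Aside: DoubleBuffer above overlaps data loading with training: asyncLoadBatch() runs on its own thread and pushes filled batches into dataQueue_ while the trainer pops them and recycles empty BufferBatch objects through bufferQueue_. A stripped-down sketch of that producer/consumer idea with a single blocking queue; the GPU event/stream handling and batch recycling are omitted, and all names here are illustrative.

```cpp
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>
#include <vector>

template <typename T>
class BlockingQueue {
 public:
  void push(T v) {
    { std::lock_guard<std::mutex> g(m_); q_.push(std::move(v)); }
    cv_.notify_one();
  }
  T pop() {
    std::unique_lock<std::mutex> l(m_);
    cv_.wait(l, [this] { return !q_.empty(); });
    T v = std::move(q_.front());
    q_.pop();
    return v;
  }
 private:
  std::mutex m_;
  std::condition_variable cv_;
  std::queue<T> q_;
};

// The loader thread plays the role of asyncLoadBatch(); an empty batch marks the
// end of the pass, like the size-0 batch handled in removeOneBatch().
void runOneEpoch(BlockingQueue<std::vector<float>>& queue, int numBatches) {
  std::thread loader([&] {
    for (int i = 0; i < numBatches; ++i) queue.push(std::vector<float>(128, 0.f));
    queue.push({});
  });
  for (;;) {
    std::vector<float> batch = queue.pop();
    if (batch.empty()) break;                  // otherwise: run the training step
  }
  loader.join();
}
```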
+ */ + inline static DataProvider* create(const DataConfig& config, + bool useGpu = FLAGS_use_gpu) { + return create(config, ModelConfig(), useGpu); + } + + DataProvider(const DataConfig& config, bool useGpu) + : config_(config), + skipShuffle_(false), + usageRatio_(config.usage_ratio()), + useGpu_(useGpu) { + if (config_.async_load_data()) { + initAsyncLoader(); + } + } + virtual ~DataProvider() {} + + const DataConfig& getConfig() const { return config_; } + + void setSkipShuffle() { skipShuffle_ = true; } + + /** + * @brief Get next batch of training samples + * @param[in] size size of training samples to get + * @param[out] batch a batch of training samples + * @return actual size of obtained training samples + */ + int64_t getNextBatch(int64_t size, DataBatch* batch); + + /** + * @brief Shuffle the data set + */ + virtual void shuffle() = 0; + + /** + * @brief reset all the value of index + * @note reset() must be called before any calls to getNextBatch() + * IMPORTANT: subclass reset() should always call the base class reset() + * at the end of the function + */ + virtual void reset() { + if (doubleBuffer_ != nullptr) { + doubleBuffer_->startAsyncLoad(); + } + } + + /** + * @brief Get the size of training samples + * @return the number of training samples in the data set. + * @note return -1 to indicate unlimited number of samples. + */ + virtual int64_t getSize() = 0; + + /** + * @brief Get next batch training samples internally + * @param[in] size size of training samples to get + * @param[out] batch a batch of training samples + * @return actual size of obtained training samples + */ + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0; + + protected: + DataConfig config_; + bool skipShuffle_; + float usageRatio_; + bool useGpu_; + std::unique_ptr doubleBuffer_; + ThreadLocal> constantSlots_; + /** + * @@brief Get next batch training samples from buffer + * @param[in] size size of training samples to get + * @param[out] batch a batch of training samples + * @return actual size of obtained training samples + */ + int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch); + + void initAsyncLoader(); +}; + +/** + * A data provider which does nothing. It only serves as providing + * necessary configurations such as stream_names + */ +class DummyDataProvider : public DataProvider { + public: + DummyDataProvider(const DataConfig& config, bool useGpu) + : DataProvider(config, useGpu) {} + virtual void shuffle() {} + virtual void reset() { DataProvider::reset(); } + virtual int64_t getSize() { return 0; } + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) { + (void)size; + (void)batch; + return 0; + } +}; + +/** + * Data provider for one input and one integer label. 
+ */ +class SimpleDataProviderBase : public DataProvider { + protected: + /// sample feature dimension + int64_t sampleDim_; + /// the number of samples + int64_t bufferCapacity_; + int64_t sampleNumInBuf_; + /// next item to read in buffer + int64_t nextItemIndex_; + /// some user defined info for validation + bool withInfo_; + + /// data buffer: bufferCapacity_ * nDataDim_ + CpuMatrixPtr hInputDataBuf_; + + /// label buffer:bufferCapacity_ * 1 + CpuIVectorPtr hInputLabelBuf_; + + /// info buffer:bufferCapacity_ * 1 + CpuIVectorPtr hInputInfoBuf_; + + ThreadLocal dataBatch_; + ThreadLocal labelBatch_; + ThreadLocal infoBatch_; + + RWLock lock_; + + public: + SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo); + ~SimpleDataProviderBase() {} + + void shuffle(); + + virtual void reset(); + + virtual int64_t getSize(); + + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); + + /// return the number of samples in the buffer + int64_t fillBuffer(); + + protected: + /** + * @brief Fill at most size samples into data and label. + * + * Each input is stored in contiguous memory locations in data. + * + * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for + * the input of the n-th sample. + * + * label[n] is the label for the n-th sample. + */ + virtual int64_t fillBufferImp(real* data, + int* label, + int* info, + int64_t size) = 0; +}; + +class SimpleDataProvider : public SimpleDataProviderBase { + public: + SimpleDataProvider(const DataConfig& config, bool useGpu); + ~SimpleDataProvider(); + virtual void reset(); + + protected: + void loadData(const std::string& fileName); + void loadDataFile(const std::string& fileName); + virtual int64_t fillBufferImp(real* data, + int* label, + int* info, + int64_t size); + + protected: + size_t currentSampleIndex_; + std::vector labels_; + std::vector data_; +}; + +} // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/legacy/gserver/dataproviders/DataProviderGroup.h similarity index 100% rename from paddle/gserver/dataproviders/DataProviderGroup.h rename to paddle/legacy/gserver/dataproviders/DataProviderGroup.h diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e5fc6d8a88fe2c03cc74b4a38e999d11d676dfdf --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MultiDataProvider.h" +#include +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +using namespace std; + +MultiDataProvider::MultiDataProvider(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) + : DataProvider(config, useGpu) { + bool atLeastOneMainDataFlag = false; + totalDataRatio_ = 0; + LOG(INFO) << "MultiDataProvider: sub data provider size: " + << config.sub_data_configs_size(); + LOG(INFO) << "MultiDataProvider: for_test: " << config.for_test(); + isTestMode_ = config.for_test(); + for (int i = 0; i < config.sub_data_configs_size(); i++) { + LOG(INFO) << "dataRatio of sub(" << i + << ") is: " << config.sub_data_configs(i).data_ratio(); + totalDataRatio_ += config.sub_data_configs(i).data_ratio(); + if (config.sub_data_configs(i).is_main_data()) { + LOG(INFO) << "main data is [" << i << "]"; + atLeastOneMainDataFlag = true; + } + } + CHECK(atLeastOneMainDataFlag) << "all sub dataproviders in MultiData do not" + << " have is_main_data flag"; + LOG(INFO) << "totalDataRatio_=" << totalDataRatio_; + DataConfig subConfig; + int subDataProviderCount = config.sub_data_configs_size(); + if (isTestMode()) { + LOG(INFO) << "construct MultiDataProvider in test mode"; + } else { + LOG(INFO) << "construct MultiDataProvider in train mode"; + } + subDataProviders_.resize(subDataProviderCount); + for (int i = 0; i < subDataProviderCount; i++) { + subConfig = config.sub_data_configs(i); + if (subConfig.async_load_data()) { + LOG(INFO) << "can not use async_load_data in sub dataprovider of " + "MultiDataProvider"; + subConfig.set_async_load_data(false); + } + subDataProviders_[i] = std::unique_ptr( + DataProvider::create(subConfig, modelConfig, useGpu_)); + } +} + +void MultiDataProvider::reset() { + for (auto& elem : subDataProviders_) { + elem->reset(); + } + DataProvider::reset(); +} + +void MultiDataProvider::shuffle() { + for (auto& elem : subDataProviders_) { + elem->shuffle(); + } +} + +int64_t MultiDataProvider::getNextBatchInternal(int64_t size, + DataBatch* batch) { + batch->clear(); + for (size_t i = 0; i < subDataProviders_.size(); ++i) { + // calc size according to data ratio + int64_t subSize = + (int64_t)(1.0 * size * config_.sub_data_configs(i).data_ratio() / + totalDataRatio_); + DataBatch subBatch; + int64_t realSize = + subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch); + if (realSize == 0) { + // current subDataProvider has no data + if (!isTestMode()) { + // in train mode + if (config_.sub_data_configs(i).is_main_data()) { + // is main data provider. 
then return 0 + batch->clear(); + return 0; + } else { + // not main data provider, reset current subDataProvider and try again + subDataProviders_[i]->reset(); + subBatch.clear(); + realSize = + subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch); + CHECK_GT(realSize, 0); + } + } else { + // in test mode, make an empty argument + Argument emptyArgu; + std::vector argus; + argus.push_back(emptyArgu); + batch->appendArguments(argus, 0, -1); + continue; + } + } + batch->appendArguments(subBatch.getStreams(), subBatch.getSize(), i); + } + return batch->getSize(); +} + +REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider); + +} // namespace paddle diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/legacy/gserver/dataproviders/MultiDataProvider.h similarity index 100% rename from paddle/gserver/dataproviders/MultiDataProvider.h rename to paddle/legacy/gserver/dataproviders/MultiDataProvider.h diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/legacy/gserver/dataproviders/ProtoReader.h similarity index 100% rename from paddle/gserver/dataproviders/ProtoReader.h rename to paddle/legacy/gserver/dataproviders/ProtoReader.h diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0827bd39d4cc78ef5658d437b6502f2e60e90b4c --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp @@ -0,0 +1,498 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
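Aside: MultiDataProvider::getNextBatchInternal above asks each sub provider for a share of the requested batch proportional to its data_ratio, using plain integer truncation. A small sketch of that split; redistributing the truncated remainder is left out, as it is in the code above.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Split a requested batch size across sub providers in proportion to their ratios,
// mirroring subSize = size * data_ratio(i) / totalDataRatio above.
std::vector<int64_t> splitBatchByRatio(int64_t size,
                                       const std::vector<double>& ratios) {
  double total = 0.0;
  for (double r : ratios) total += r;
  std::vector<int64_t> sub(ratios.size());
  for (std::size_t i = 0; i < ratios.size(); ++i)
    sub[i] = static_cast<int64_t>(1.0 * size * ratios[i] / total);
  return sub;
}
```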
*/ + +#include "PyDataProvider.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +#ifndef PADDLE_NO_PYTHON +REGISTER_DATA_PROVIDER(py, PyDataProvider); +#endif + +PyDataProvider::PyDataProvider(const DataConfig& config, + bool useGpu, + bool loadDataAll) + : DataProvider(config, useGpu), batchSize_(0) { + PyGuard guard; + pyModuleName_ = config_.load_data_module(); + pyClassName_ = config_.load_data_object(); + if (config_.load_data_args() != "") { + pyUserArgs_["load_data_args"] = config_.load_data_args(); + } + + if (loadDataAll) { + std::vector fileList; + if (!config_.files().empty()) { + loadFileList(config_.files(), fileList); + } + loadData(fileList); + } +} + +void PyDataProvider::loadData(const std::vector& fileList) { + VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_; + classInstance_ = + createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_); + CHECK(classInstance_) << "Create class instance failed."; + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("getHeader"), NULL)); + CHECK_PY(obj) << "Call function getHeader failed."; + std::string headerInfo = + std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); + parseHeaderData(headerInfo); + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); +} + +void PyDataProvider::parseHeaderData(const std::string& headerData) { + char* pHeader = const_cast(headerData.c_str()); + char* pHeaderEnd = pHeader + headerData.size(); + slotNum_ = readT(pHeader, pHeaderEnd); + unsigned int useSequenceFlag = readT(pHeader, pHeaderEnd); + isIID_ = useSequenceFlag != 1; + slots_.clear(); + slots_.reserve(slotNum_); + for (size_t i = 0; i < slotNum_; ++i) { + unsigned int slotType = readT(pHeader, pHeaderEnd); + unsigned int slotDim = readT(pHeader, pHeaderEnd); + slots_.emplace_back(); + slots_.back().dim = slotDim; + slots_.back().type = static_cast(slotType); + } +} + +void PyDataProvider::resetSlots() { + for (auto& slot : slots_) { + slot.indexData.clear(); + slot.denseData.clear(); + slot.sparseNonValueData.clear(); + slot.sparseFloatValueData.clear(); + slot.indices.clear(); + slot.sequenceStartPositions.clear(); + slot.sampleSequenceIdVec.clear(); + slot.subSequenceStartPositions.clear(); + slot.strData.clear(); + } +} + +void PyDataProvider::fillDenseSlot(ProtoSlot& slot, + char*& data, + const char* dataEnd) { + unsigned int dim = slot.dim; + slot.sampleNum = readT(data, dataEnd); + slot.denseData.resize(slot.sampleNum * dim); +#ifdef PADDLE_TYPE_DOUBLE + CHECK_LE(data + sizeof(real) * dim * slot.sampleNum, dataEnd) + << "std::copy data is out of range"; + // PyDataProvider always provide data in float + float* dat = reinterpret_cast(data); + std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin()); +#else + memcpyWithCheck(slot.denseData.data(), + data, + sizeof(real) * dim * slot.sampleNum, + dataEnd); +#endif + // PyDataProvider always provide data in float + data += sizeof(float) * dim * slot.sampleNum; +} + +void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, + char*& data, + const char* dataEnd) { + slot.sampleNum = readT(data, dataEnd); + unsigned int* indexPtr = (unsigned int*)data; + CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) + << "Vector assign value is out of range"; + slot.indices.assign(indexPtr, indexPtr + slot.sampleNum); + data += sizeof(unsigned int) * slot.sampleNum; + unsigned int length = 0; + length = 
readT(data, dataEnd); + slot.indices.push_back(length); + slot.sparseNonValueData.resize(length); + memcpyWithCheck(slot.sparseNonValueData.data(), + data, + sizeof(unsigned int) * length, + dataEnd); + data += sizeof(unsigned int) * length; +} + +void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, + char*& data, + const char* dataEnd) { + slot.sampleNum = readT(data, dataEnd); + unsigned int* indexPtr = (unsigned int*)data; + CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) + << "Vector assign value is out of range"; + slot.indices.assign(indexPtr, indexPtr + slot.sampleNum); + data += sizeof(unsigned int) * slot.sampleNum; + unsigned int length = 0; + length = readT(data, dataEnd); + unsigned int* colPtr = reinterpret_cast(data); + CHECK_LE(data + sizeof(unsigned int) * length, dataEnd) + << "Data is out of range"; + data += sizeof(unsigned int) * length; + size_t colLen = readT(data, dataEnd); + CHECK_EQ(colLen, length); + float* valuePtr = reinterpret_cast(data); + CHECK_LE(data + sizeof(real) * length, dataEnd) << "Data is out of range"; + data += sizeof(real) * length; + slot.indices.push_back(length); + slot.sparseFloatValueData.resize(length); + for (unsigned int ii = 0; ii < length; ++ii) { + slot.sparseFloatValueData[ii].col = colPtr[ii]; + slot.sparseFloatValueData[ii].value = valuePtr[ii]; + } +} + +void PyDataProvider::fillIndexSlot(ProtoSlot& slot, + char*& data, + const char* dataEnd) { + slot.sampleNum = readT(data, dataEnd); + CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) + << "Vector assign is out of range"; + slot.indexData.assign(reinterpret_cast(data), + reinterpret_cast(data) + slot.sampleNum); + data += sizeof(unsigned int) * slot.sampleNum; +} + +void PyDataProvider::fillStringSlot(ProtoSlot& slot, + char*& data, + const char* dataEnd) { + slot.sampleNum = readT(data, dataEnd); + for (unsigned int i = 0; i < slot.sampleNum; ++i) { + size_t len = readT(data, dataEnd); + auto str_begin = data; + data += len; + CHECK_LE(data, dataEnd) << "Data is out of range"; + slot.strData.emplace_back(str_begin, len); + } +} + +void PyDataProvider::fillSlotsByStr(const std::string& samples) { + char* data = const_cast(samples.c_str()); + char* dataEnd = data + samples.size(); + batchSize_ = readT(data, dataEnd); + if (0 == batchSize_) { + return; + } + + for (size_t j = 0; j < slotNum_; ++j) { + auto& slot = slots_[j]; + CHECK(SlotDef::INDEX >= slot.type || SlotDef::STRING == slot.type) + << " Slot type:" << slot.type << " is out of range."; + CHECK_GE(slot.type, SlotDef::VECTOR_DENSE) << " Slot type:" << slot.type + << " is out of range."; + switch (slot.type) { + case SlotDef::VECTOR_DENSE: + fillDenseSlot(slot, data, dataEnd); + break; + case SlotDef::VECTOR_SPARSE_NON_VALUE: + fillSparseNonValueSlot(slot, data, dataEnd); + break; + case SlotDef::VECTOR_SPARSE_VALUE: + fillSparseValueSlot(slot, data, dataEnd); + break; + case SlotDef::INDEX: + fillIndexSlot(slot, data, dataEnd); + break; + case SlotDef::VAR_MDIM_DENSE: + LOG(FATAL) << "Not implemented"; + break; + case SlotDef::VAR_MDIM_INDEX: + LOG(FATAL) << "Not implemented"; + break; + case SlotDef::STRING: + fillStringSlot(slot, data, dataEnd); + break; + } + } + // read sequenceStartPositions + for (size_t j = 0; j < slotNum_; ++j) { + auto& slot = slots_[j]; + if (!iidData()) { + unsigned int sequenceNum = readT(data, dataEnd); + slot.sequenceNum = sequenceNum; + for (size_t i = 0; i < sequenceNum; ++i) { + slot.sequenceStartPositions.push_back( + readT(data, dataEnd)); + } + for 
(size_t i = 0; i < sequenceNum; ++i) { + size_t begin = slot.sequenceStartPositions[i]; + size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1] + : slot.sampleNum; + for (size_t ii = begin; ii < end; ++ii) { + slot.sampleSequenceIdVec.push_back(ii); + } + } + } else { + for (size_t i = 0; i < slot.sampleNum; ++i) { + slot.sampleSequenceIdVec.push_back(i); + } + } + } + // read subSequenceStartPositions, not all slots have this infomation. + for (size_t j = 0; j < slotNum_; ++j) { + auto& slot = slots_[j]; + if (!iidData() && data != dataEnd) { + unsigned int subSequenceNum = readT(data, dataEnd); + slot.subSequenceNum = subSequenceNum; + for (size_t i = 0; i < subSequenceNum; ++i) { + slot.subSequenceStartPositions.push_back( + readT(data, dataEnd)); + } + } + } +} + +void PyDataProvider::reset() { + { // Invoke PyDataProvider Reset + PyGuard guard; + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("reset"), NULL)); + CHECK_PY(obj) << "Call function reset failed."; + } + + if (!skipShuffle_) { + // Invoke PyDataProvider Shuffle + shuffle(); + } + DataProvider::reset(); +} + +void PyDataProvider::shuffle() { + // py shuffle + PyGuard guard; + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("shuffle"), NULL)); + CHECK_PY(obj) << "Call function shuffle failed."; +} + +void PyDataProvider::handleDenseSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments) { + unsigned int dim = slot.dim; + Matrix::resizeOrCreate(cpuArguments[slotIndex].value, + slot.sampleNum, + dim, + false, // trans = false + false); // useGpu = false + real* buf = cpuArguments[slotIndex].value->getData(); + for (size_t i = 0; i < slot.sampleNum; ++i) { + memcpyWithCheck(buf + i * dim, + slot.denseData.data() + slot.sampleSequenceIdVec[i] * dim, + sizeof(real) * dim, + slot.denseData.data() + slot.denseData.size()); + } +} + +void PyDataProvider::handleSparseNonValueSlot( + ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { + unsigned int dim = slot.dim; + if (!(cpuArguments[slotIndex].value)) { + cpuArguments[slotIndex].value = + Matrix::createSparseMatrix(slot.sampleNum, + dim, + slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, + NO_VALUE, + SPARSE_CSR, + false, + useGpu_); + } + auto mat = cpuArguments[slotIndex].value; + mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR); + if (std::dynamic_pointer_cast(mat)) { + std::dynamic_pointer_cast(mat)->copyFrom( + slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseNonValueData.data(), + HPPL_STREAM_1); + } else if (std::dynamic_pointer_cast(mat)) { + std::dynamic_pointer_cast(mat)->copyFrom( + slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseNonValueData.data()); + } else { + LOG(FATAL) << "Not Supported"; + } +} + +void PyDataProvider::handleSparseValueSlot( + ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { + unsigned int dim = slot.dim; + if (!(cpuArguments[slotIndex].value)) { + cpuArguments[slotIndex].value = + Matrix::createSparseMatrix(slot.sampleNum, + dim, + slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, + FLOAT_VALUE, + SPARSE_CSR, + false, + useGpu_); + } + auto mat = cpuArguments[slotIndex].value; + mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR); + if (std::dynamic_pointer_cast(mat)) { + std::dynamic_pointer_cast(mat)->copyFrom( + slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseFloatValueData.data(), + HPPL_STREAM_DEFAULT); + } else if (std::dynamic_pointer_cast(mat)) 
{ + std::dynamic_pointer_cast(mat)->copyFrom( + slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseFloatValueData.data()); + } else { + LOG(FATAL) << "Not Supported"; + } +} + +void PyDataProvider::handleIndexSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments) { + IVector::resizeOrCreate(cpuArguments[slotIndex].ids, + slot.sampleNum, + /*useGpu_*/ false); + int* buf = cpuArguments[slotIndex].ids->getData(); + for (size_t i = 0; i < slot.sampleNum; ++i) { + buf[i] = slot.indexData[slot.sampleSequenceIdVec[i]]; + } +} + +void PyDataProvider::handleStringSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments) { + if (cpuArguments[slotIndex].strs) { + cpuArguments[slotIndex].strs->resize(slot.sampleNum); + } else { + cpuArguments[slotIndex].strs = + std::make_shared>(slot.sampleNum); + } + for (size_t i = 0; i < slot.sampleNum; ++i) { + (*cpuArguments[slotIndex].strs)[i] = + slot.strData[slot.sampleSequenceIdVec[i]]; + } +} + +int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { + PyGuard guard; + PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), + const_cast("getNextBatch"), + const_cast("i"), + size)); + CHECK_PY(obj) << "Call function getNextBatch failed."; + const std::string& samples = + std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); + resetSlots(); + fillSlotsByStr(samples); + size = batchSize_; + if (size <= 0) return 0; + + DataBatch& cpuBatch = *cpuBatch_; + std::vector& cpuArguments = cpuBatch.getStreams(); + cpuBatch.setSize(size); + cpuArguments.resize(slotNum_); + + if (!iidData()) { + for (size_t j = 0; j < slotNum_; ++j) { + auto& slot = slots_[j]; + ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions, + slot.sequenceNum + 1, + /* useGpu= */ false); + int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false); + std::copy(slot.sequenceStartPositions.begin(), + slot.sequenceStartPositions.end(), + buf); + buf[slot.sequenceStartPositions.size()] = slot.sampleNum; + + if (slot.subSequenceStartPositions.size()) { + ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions, + slot.subSequenceNum + 1, + /* useGpu= */ false); + int* buf = + cpuArguments[j].subSequenceStartPositions->getMutableData(false); + std::copy(slot.subSequenceStartPositions.begin(), + slot.subSequenceStartPositions.end(), + buf); + buf[slot.subSequenceNum] = slot.sampleNum; + // check subSequenceStartPositions and sequenceStartPositions + cpuArguments[j].checkSubset(); + } + } + } + + for (size_t slotIndex = 0; slotIndex < slotNum_; ++slotIndex) { + auto& slot = slots_[slotIndex]; + SlotDef::SlotType slotType = slot.type; + switch (slotType) { + case SlotDef::VECTOR_DENSE: + handleDenseSlot(slot, slotIndex, cpuArguments); + break; + case SlotDef::VECTOR_SPARSE_NON_VALUE: + handleSparseNonValueSlot(slot, slotIndex, cpuArguments); + break; + case SlotDef::VECTOR_SPARSE_VALUE: + handleSparseValueSlot(slot, slotIndex, cpuArguments); + break; + case SlotDef::INDEX: + handleIndexSlot(slot, slotIndex, cpuArguments); + break; + case SlotDef::VAR_MDIM_DENSE: + LOG(FATAL) << "Not implemented"; + break; + case SlotDef::VAR_MDIM_INDEX: + LOG(FATAL) << "Not implemented"; + break; + case SlotDef::STRING: + handleStringSlot(slot, slotIndex, cpuArguments); + break; + } + } + + if (useGpu_) { + std::vector& cpuArguments = cpuBatch.getStreams(); + DataBatch& gpuBatch = *gpuBatch_; + std::vector& gpuArguments = gpuBatch.getStreams(); + gpuArguments.resize(cpuArguments.size()); + 
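[Note] getNextBatchInternal above writes sequenceStartPositions into a buffer of length sequenceNum + 1 and sets the trailing element to sampleNum (and likewise subSequenceNum + 1 entries for sub-sequences), so every sequence's range can be read as [starts[i], starts[i+1]) with no special case for the last one. A minimal standalone sketch of that convention, using a plain std::vector and made-up offsets rather than the ICpuGpuVector/Argument types used here:

#include <cstdio>
#include <vector>

int main() {
  // One start offset per sequence, plus a trailing sentinel equal to the
  // total number of samples in the slot, exactly as the code above stores it.
  const int sampleNum = 7;
  std::vector<int> starts = {0, 3, 5};  // three sequences begin at 0, 3, 5
  starts.push_back(sampleNum);          // sentinel: starts.back() == sampleNum

  // Every sequence i now spans [starts[i], starts[i + 1]), including the last.
  for (size_t i = 0; i + 1 < starts.size(); ++i) {
    std::printf("sequence %zu: samples [%d, %d)\n", i, starts[i], starts[i + 1]);
  }
  return 0;
}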
gpuBatch.setSize(size); + for (size_t i = 0; i < slotNum_; ++i) { + SlotDef::SlotType slotType = slots_[i].type; + if (SlotDef::VECTOR_SPARSE_VALUE == slotType || + SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) { + gpuArguments[i] = cpuArguments[i]; + gpuArguments[i].sequenceStartPositions = + cpuArguments[i].sequenceStartPositions; + + if (slots_[i].subSequenceStartPositions.size()) { + gpuArguments[i].subSequenceStartPositions = + cpuArguments[i].subSequenceStartPositions; + } + } else { + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); + } + } + hl_stream_synchronize(HPPL_STREAM_1); + *batch = gpuBatch; + } else { + *batch = cpuBatch; + } + + return batch->getSize(); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.h b/paddle/legacy/gserver/dataproviders/PyDataProvider.h new file mode 100644 index 0000000000000000000000000000000000000000..4b8bea04a1670c60d5a801ca950f59116ba50195 --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/PyDataProvider.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "DataFormat.pb.h" +#include "DataProvider.h" + +#include + +namespace paddle { + +class PyDataProvider : public DataProvider { + public: + PyDataProvider(const DataConfig& config, + bool useGpu, + bool loadDataAll = true); + + virtual void reset(); + + // Note this size includes the sequences which are skipped because they + // are longer than the batch size + virtual int64_t getSize() { + LOG(FATAL) << "Not implement yet"; + return -1; + } + virtual void shuffle(); + + virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); + + protected: + struct ProtoSlot; + // return false if each each sample is one sequence, i.e., independent + // of other samples. 
+ inline bool iidData() const { return isIID_; } + + void parseHeaderData(const std::string& headerData); + void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd); + void fillSparseNonValueSlot(ProtoSlot& slot, + char*& data, + const char* dataEnd); + void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd); + void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd); + void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd); + void fillSlotsByStr(const std::string& samples); + void handleDenseSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments); + void handleSparseNonValueSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments); + void handleSparseValueSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments); + void handleIndexSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments); + void handleStringSlot(ProtoSlot& slot, + size_t slotIndex, + std::vector& cpuArguments); + void resetSlots(); + void loadData(const std::vector& fileList); + + protected: + struct ProtoSlot { + SlotDef::SlotType type; + int dim; + unsigned int sampleNum; + unsigned int sequenceNum; + unsigned int subSequenceNum; + // Store the data of index type slot + std::vector indexData; + // Store the data of dense type slot + std::vector denseData; + // Store the data of sparseNonValue type slot + std::vector sparseNonValueData; + // Store the data of sparseValue type slot + std::vector sparseFloatValueData; + // Used to store the index of each sample in slot values + std::vector indices; + // The starting position of each sequence in samples + // The last element should be the number of samples + // If empty, each sample is one sequence. + std::vector sequenceStartPositions; + // The index id of sequences in slot + std::vector sampleSequenceIdVec; + // The starting position of each subsequence in samples + // The last element should be the number of subsequence + // If empty, each sequence of sample has no subsequence. + std::vector subSequenceStartPositions; + // Store the data of string type slot + std::vector strData; + }; + std::vector slots_; + + PyObjectPtr classInstance_; + unsigned int batchSize_; + unsigned int slotNum_; + // if use sequence, isIID_ equals false, otherwise it is true. + bool isIID_; + // The name of python module name + std::string pyModuleName_; + // The name of python class name + std::string pyClassName_; + // User args set in config + std::map pyUserArgs_; + + ThreadLocalD cpuBatch_; + ThreadLocalD gpuBatch_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e931e40611e27caa43675c3567972384a4d9026 --- /dev/null +++ b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp @@ -0,0 +1,1031 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_NO_PYTHON + +#include +#include +#include +#include +#include +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include + +#include "DataProvider.h" + +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +namespace unittest { + +static std::unique_ptr> + OnPoolFilled; + +namespace pydp2 { + +void setOnPoolFilledHook(const std::function& callback) { + OnPoolFilled.reset(new std::function()); + *OnPoolFilled = callback; +} + +void clearOnPoolFilledHook() { OnPoolFilled.reset(); } + +} // namespace pydp2 +} // namespace unittest + +/** + * Slot type + */ +enum SlotType { + ST_DENSE = 0, + ST_NON_SPARSE_VALUE = 1, + ST_SPARSE_VALUE = 2, + ST_INDEX = 3 +}; + +/** + * Sequence type + */ +enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ }; + +/** + * Cache Type. + */ +enum CacheType { + NO_CACHE = 0, // Each pass will load data from PyDataProvider2. + CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2, + // then cache all data in memory. Load data from + // memory in rest passes. +}; + +struct SlotHeader { // Slot Header will parse from python object's slots field. + size_t dim; + SlotType slotType; + SeqType seqType; +}; + +inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) { + os << "Dim = " << header.dim << " Type = " << header.slotType + << " SeqType = " << header.seqType; + return os; +} + +/** + * FieldScanner Interface. + * + * It will read python object, and fill to argument's each slot. + * There are two steps, prepare and fill. Scanner will alloc memory during + * prepare step, fill data into argument during fill step. + */ +class IFieldScanner { + public: + DISABLE_COPY(IFieldScanner); + /** + * Ctor. + * @param headerPtr slot header that scanner belong to. + */ + explicit IFieldScanner(SlotHeader* headerPtr) : headerPtr_(headerPtr) {} + virtual ~IFieldScanner() {} + + /** + * Start prepare step. + */ + virtual void startPrepare(Argument& argument) {} + + /** + * Prepare step. + * + * @note the obj could be a timestep of sample or whole sample. It depends + * what scanner it is. + */ + virtual void prepare(Argument& argument, PyObject* obj) {} + + /** + * Finish Prepare step. + */ + virtual void finishPrepare(Argument& argument) {} + + /** + * Start fill step. + */ + virtual void startFill(Argument& argument) {} + + /** + * Fill step. + * + * @note the obj could be a timestep of sample or whole sample. It depends + * what scanner it is. + */ + virtual void fill(Argument& argument, PyObject* obj) {} + + /** + * Finish fill step. + */ + virtual void finishFill(Argument& argument) {} + + /** + * Factory method. Create a scanner by header. The final scanner may be + * combine many scanners. + * + * @note Fatal if header is not support. + */ + static IFieldScanner* create(SlotHeader* header); + + protected: + SlotHeader* headerPtr_; +}; + +/** + * Py Data Provider Cache Interface. + */ +class IPyDataProviderCache { + public: + virtual ~IPyDataProviderCache() {} + + /** + * invoke when DataProvider::reset() + * @return true if read data from python. + */ + virtual bool reset() = 0; + + /** + * invoke when these data are used by DataProvider, and need to clear. + * @param [inout] data used data. + * + * @note The implemented class must clear these data array. 
Or if you want to + * delete the PyObjectPtr later, you should make sure the paddle process only + * have one active thread calling python code (use PyGuard otherwise). + */ + virtual void drop(std::deque* data) = 0; + + /** + * Return whole data in cache. + */ + virtual std::deque* load() = 0; + + /** + * Factory method. Convert CacheType to IPyDataProviderCache* + */ + static IPyDataProviderCache* create(CacheType ct); +}; + +/** + * PyDataProvider2. + * + * For usage, please refer python module 'paddle.trainer.PyDataProvider2' + * + * Here, we start a thread to read data. It is totally asynchronous for reading + * data. And it support cache strategies. + */ +class PyDataProvider2 : public DataProvider { + public: + /** + * Ctor + */ + PyDataProvider2(const DataConfig& config, + const ModelConfig& modelConfig, + bool useGpu) + : DataProvider(config, useGpu), callingContextCreated_(2) { + if (PyArray_API == NULL) import_array(); + auto& args = config.load_data_args(); + PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); + if (!args.empty()) { + kwargs = callPythonFuncRetPyObj( + "paddle.trainer.PyDataProvider2", "deserialize_args", {args}); + } + + py::DictHelper kwargsDict(kwargs); + kwargsDict.setBool("is_train", !config.for_test()); + std::vector inputs; + inputs.reserve(modelConfig.input_layer_names().size()); + std::copy(modelConfig.input_layer_names().begin(), + modelConfig.input_layer_names().end(), + std::back_inserter(inputs)); + kwargsDict.setStringList("input_order", inputs); + + // kwargs is keyword arguemts to create object. + this->createPyDataObj(config.load_data_module(), + config.load_data_object(), + config.files(), + std::move(kwargs)); + DBG << "Instance " << instance_.get() << " loaded."; + this->readPyFields(config.for_test()); + DBG << "Py Field Done"; + } + + /** + * Dtor + * @note will stop loading thread when destructing + */ + virtual ~PyDataProvider2() { resetImpl(false); } + + private: + void createPyDataObj(const std::string& model, + const std::string& className, + const std::string& fileListName, + PyObjectPtr&& kwargs // NOLINT + ) { + LOG(INFO) << "loading dataprovider " << model << "::" << className; + + PyObjectPtr module = py::import(model); + PyObjectPtr moduleDict(PyModule_GetDict(module.get())); + CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; + PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str())); + CHECK_PY(cls) << "load class " << className.c_str() << "error"; + + // If there are multiple python instance share same module, the PyObjectPtr + // only for instance will make python reference-count error. + // + // So here, we increase reference count manually. + Py_XINCREF(module.get()); + Py_XINCREF(moduleDict.get()); + Py_XINCREF(cls.get()); + + PyObjectPtr fileListInPy = loadPyFileLists(fileListName); + PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get()); + { + PyGuard guard; + instance_.reset(PyObject_Call(cls.get(), zeroTuple_.get(), kwargs.get())); + } + CHECK_PY(instance_) << "Cannot Create instance"; + } + + void readPyFields(bool testing) { + py::ObjectHelper self(this->instance_); + bool ok; + + this->skipShuffle_ = + !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/); + if (!ok) { + this->skipShuffle_ = testing; // shuffle when is training, skip shuffle + // when is testing. 
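[Note] readPyFields above treats most provider attributes as optional: should_shuffle defaults to "shuffle while training, skip while testing", pool_size falls back to -1UL (effectively unlimited), and min_pool_size is clamped so it never exceeds pool_size. A small sketch of that defaulting logic, with an ordinary map standing in for the Python object's attributes and a hypothetical getIntAttr helper (not the real py::ObjectHelper API):

#include <algorithm>
#include <cstdio>
#include <map>
#include <string>

// Look up an integer attribute; report through *ok whether it was present.
static size_t getIntAttr(const std::map<std::string, size_t>& attrs,
                         const std::string& name, bool* ok) {
  auto it = attrs.find(name);
  *ok = (it != attrs.end());
  return *ok ? it->second : 0;
}

int main() {
  std::map<std::string, size_t> attrs = {{"pool_size", 1000}};  // min_pool_size omitted
  bool ok = false;

  size_t poolSize = getIntAttr(attrs, "pool_size", &ok);
  if (!ok) poolSize = static_cast<size_t>(-1);     // unlimited when unset

  size_t minPoolSize = getIntAttr(attrs, "min_pool_size", &ok);
  if (!ok) minPoolSize = static_cast<size_t>(-1);  // unlimited when unset
  minPoolSize = std::min(poolSize, minPoolSize);   // never wait for more than the pool holds

  std::printf("pool_size=%zu min_pool_size=%zu\n", poolSize, minPoolSize);
  return 0;
}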
+ } + DBG << "Provider Skip Shuffle " << this->skipShuffle_; + + this->poolSize_ = self.getIntAttr("pool_size", &ok); + if (!ok) { + this->poolSize_ = -1UL; + } + this->minPoolSize_ = self.getIntAttr("min_pool_size", &ok); + if (!ok) { + this->minPoolSize_ = -1UL; + } + this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_); + + this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size"); + + calcBatchSize_.reset(self.getAttr("calc_batch_size")); + if (this->calcBatchSize_ && !py::isCallable(this->calcBatchSize_)) { + this->calcBatchSize_.reset(); + } + + generator_.reset(self.getAttr("generator")); + CHECK(py::isCallable(generator_)); + + // Reading slots. + PyObjectPtr slotsPtr(self.getAttr("slots")); + py::SequenceHelper slots(slotsPtr); + headers_.reserve(slots.size()); + for (size_t i = 0; i < slots.size(); ++i) { + headers_.emplace_back(); + auto& header = headers_.back(); + PyObject* hdPtr = slots[i]; + CHECK(hdPtr != nullptr); + Py_XINCREF(hdPtr); + PyObjectPtr headerPtrWrap(hdPtr); + py::ObjectHelper hd(headerPtrWrap); + header.dim = hd.getIntAttrWithError("dim"); + header.seqType = (SeqType)hd.getIntAttrWithError("seq_type"); + header.slotType = (SlotType)hd.getIntAttrWithError("type"); + } + + DBG << "Data header size " << headers_.size(); + for (auto& header : headers_) { + DBG << header; + } + cache_.reset(IPyDataProviderCache::create( + (CacheType)self.getIntAttrWithError("cache"))); + } + + PyObjectPtr loadPyFileLists(const std::string& fileListName) { + loadFileList(fileListName, fileLists_); + PyObject* lst = PyList_New(fileLists_.size()); + for (size_t i = 0; i < fileLists_.size(); ++i) { + PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str())); + } + return PyObjectPtr(lst); + } + + void loadThread() { + DBG << "Creating context"; + for (auto& filename : fileLists_) { + PyGuard g; + py::CallableHelper generator(this->generator_); + generator.setArgsSize(2); + generator.getArgs().set(0, instance_); + generator.getArgs().set(1, PyString_FromString(filename.c_str()), true); + callingContexts_.emplace_back(generator()); + CHECK_PY(callingContexts_.back()) << "Generator error."; + CHECK(PyIter_Check(callingContexts_.back())); + } + DBG << "Create context done"; + callingContextCreated_.wait(); + + PositionRandom p(skipShuffle_); + + while (!exit_ && !callingContexts_.empty()) { + PyObject* data = nullptr; + + { // Read data. 
+ size_t cid = p(callingContexts_.size()); + bool atEnd; + data = py::iterNext(callingContexts_[cid], &atEnd); + if (atEnd || data == nullptr) { + if (cid != 0) { + std::swap(callingContexts_[cid], callingContexts_[0]); + cid = 0; + } + + PyObjectPtr front; + { + std::unique_lock l(mtx_); + front = pop_get_front(callingContexts_); + } + { + PyGuard g; + front.reset(); + } + this->pullCV_.notify_all(); + continue; + } + } + + size_t additionalBatchSize = 1; + if (calcBatchSize_) { + PyGuard guard; + py::CallableHelper calcBatchSize(this->calcBatchSize_); + calcBatchSize.setArgsSize(1); + calcBatchSize.getArgs().set(0, data); + PyObjectPtr bs(calcBatchSize()); + CHECK_PY(bs); + bool ok; + additionalBatchSize = py::castInt(bs.get(), &ok); + CHECK(ok) << "CalcBatchSize must return int or long"; + } + + if (this->loadThread_) { // wait poolActualSize < poolSize; + std::unique_lock l(mtx_); + pushCV_.wait(l, [this] { return this->poolActualSize_ < poolSize_; }); + } + + { + std::lock_guard guard(mtx_); + poolActualSize_ += additionalBatchSize; + dataPool_.emplace_back(data); + } + pullCV_.notify_all(); + } + DBG << "load thread end"; + } + + inline void resetImpl(bool startNewThread) { + DBG << "Reseting " << startNewThread; + exit_.store(true); + if (loadThread_) { // is loading. + loadThread_->join(); + loadThread_.reset(); + } + { + PyGuard g; + callingContexts_.clear(); + this->pullCV_.notify_one(); + } + + std::lock_guard guard(mutexForReset_); + { + PyGuard g; + dataPool_.clear(); + } + poolActualSize_ = 0; + + if (startNewThread && cache_->reset()) { + DBG << "Start new thread."; + loadThread_.reset(new std::thread([this] { + exit_ = false; + loadThread(); + })); + callingContextCreated_.wait(); + } + DBG << "Reset done"; + exit_ = false; + } + + private: + std::unique_ptr loadThread_; + std::atomic exit_; + std::deque callingContexts_; + std::deque dataPool_; + size_t poolActualSize_; + std::condition_variable pushCV_; + std::condition_variable pullCV_; + std::mutex mtx_; + + std::mutex mutexForReset_; + + ThreadBarrier callingContextCreated_; + std::unique_ptr cache_; + + PyObjectPtr instance_; + size_t poolSize_; + size_t minPoolSize_; + bool canOverBatchSize_; + PyObjectPtr calcBatchSize_; + PyObjectPtr generator_; + std::vector fileLists_; + std::vector headers_; + static PyObjectPtr zeroTuple_; + + class PositionRandom { + public: + inline explicit PositionRandom(bool skipRand) + : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {} + + inline size_t operator()(size_t len) { + if (!skipRand_) { + if (!dist_ || dist_->b() != len - 1) { + dist_.reset(new std::uniform_int_distribution(0, len - 1)); + } + return (*dist_)(eng_); + } else { + return 0; + } + } + + private: + std::default_random_engine& eng_; + std::unique_ptr> dist_; + bool skipRand_; + }; + + // DataProvider interface + public: + /** + * Resetting the PyDataProvider. May start reading thread here. + */ + virtual void reset() { + resetImpl(true); + DataProvider::reset(); + } + + /** + * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random + * select data from datapool. + */ + void shuffle() {} + + /** + * Not limited size. + */ + int64_t getSize() { return -1; } + + /** + * Loading a batch of data. + */ + int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) { + std::lock_guard guard(mutexForReset_); + REGISTER_TIMER("PyDP2.getNextBatchInternal") + CHECK_GE(size_, 0); + size_t size = (size_t)size_; + if (loadThread_) { // loading from thread should wait for data pool ready. 
+ // but, loading from cache, cache object should ensure + // data pool ready. + std::unique_lock l(mtx_); + pullCV_.wait(l, [this, &size] { + return this->poolActualSize_ >= std::max(size, this->minPoolSize_) || + callingContexts_.empty(); + }); + + if (unittest::OnPoolFilled) { + (*unittest::OnPoolFilled)(this->poolActualSize_); + } + } + std::deque data; + size_t bsize = 0; + std::deque* poolPtr = nullptr; + + if (this->loadThread_) { // loading from thread. + poolPtr = &this->dataPool_; + } else { // loading from cache. + poolPtr = this->cache_->load(); + } + if (exit_) { + // PyDataProvider is destructing. + return 0; + } + CHECK(poolPtr != nullptr); + + std::deque& pool = *poolPtr; + + while (bsize < size && !pool.empty()) { + { + // move data from pool to data + std::lock_guard guard(mtx_); + if (skipShuffle_) { + size_t i = 0; + CHECK(pool[i] != nullptr); + data.emplace_back(std::move(pool[i])); + pool.pop_front(); + } else { // when shuffle, use swap to drop only last pool element. + size_t i = ThreadLocalRand::rand() % pool.size(); + CHECK(pool[i] != nullptr); + if (i != 0) { + std::swap(pool[i], pool.front()); + } + data.emplace_back(std::move(pool.front())); + pool.pop_front(); + } + + if (calcBatchSize_) { // custom calc batch size. + PyGuard guard; + Py_INCREF(data.back().get()); + py::CallableHelper calcBatchSize(calcBatchSize_); + calcBatchSize.setArgsSize(1); + calcBatchSize.getArgs().set(0, data.back()); + PyObjectPtr customBatchSize(calcBatchSize()); + bool ok; + size_t tmp = py::castInt(customBatchSize.get(), &ok); + CHECK(ok) << "calc_batch_size must return int"; + + if (bsize + tmp > size && !canOverBatchSize_) { + // Put data back. + pool.push_front(std::move(data.back())); + data.pop_back(); + break; + } else { + bsize += tmp; + } + } else { + bsize += 1; + } + } + } + + if (this->loadThread_) { + { + std::lock_guard g(mtx_); + poolActualSize_ -= bsize; + } + this->pushCV_.notify_all(); + } + + if (bsize == 0) { // end of pass. In data pool, cannot get any data. 
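[Note] The batch assembly loop above drains the data pool either in order (when shuffling is skipped) or by swapping a randomly chosen element to the front before popping, and it stops once the per-item sizes reported by calc_batch_size would exceed the requested batch size, unless can_over_batch_size allows overshooting. A simplified, self-contained sketch of that selection policy; ints stand in for the pooled PyObject pointers, the size function is fixed at 1, and the mutex/refcounting of the real code is omitted:

#include <cstdio>
#include <deque>
#include <random>
#include <utility>
#include <vector>

int main() {
  std::deque<int> pool = {10, 11, 12, 13, 14};  // stand-ins for pooled samples
  std::vector<int> batch;
  const size_t requested = 3;  // target batch size
  const bool canOverBatchSize = false;
  const bool skipShuffle = false;
  std::mt19937 rng(42);

  size_t bsize = 0;
  while (bsize < requested && !pool.empty()) {
    if (!skipShuffle) {
      // Pick a random element, swap it to the front, then pop the front.
      std::uniform_int_distribution<size_t> dist(0, pool.size() - 1);
      size_t i = dist(rng);
      if (i != 0) std::swap(pool[i], pool.front());
    }
    int item = pool.front();
    pool.pop_front();

    size_t itemSize = 1;  // a custom calc_batch_size hook would report this
    if (bsize + itemSize > requested && !canOverBatchSize) {
      pool.push_front(item);  // put it back and stop, as the code above does
      break;
    }
    batch.push_back(item);
    bsize += itemSize;
  }

  std::printf("assembled batch of %zu item(s)\n", batch.size());
  return 0;
}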
+ return 0; + } + + DataBatch cpuBatch; + cpuBatch.setSize(bsize); + auto& inArgs = cpuBatch.getStreams(); + inArgs.resize(headers_.size()); + std::vector> scanners; + scanners.reserve(headers_.size()); + for (auto& header : headers_) { + scanners.emplace_back(IFieldScanner::create(&header)); + } + DBG << "Scanner created."; + for (size_t i = 0; i < headers_.size(); ++i) { + scanners[i]->startPrepare(inArgs[i]); + } + for (auto& d : data) { + py::SequenceHelper s(d); + for (size_t i = 0; i < headers_.size(); ++i) { + scanners[i]->prepare(inArgs[i], s[i]); + } + } + for (size_t i = 0; i < headers_.size(); ++i) { + scanners[i]->finishPrepare(inArgs[i]); + } + for (size_t i = 0; i < headers_.size(); ++i) { + scanners[i]->startFill(inArgs[i]); + } + for (auto& d : data) { + py::SequenceHelper s(d); + for (size_t i = 0; i < headers_.size(); ++i) { + scanners[i]->fill(inArgs[i], s[i]); + } + } + + for (size_t i = 0; i < headers_.size(); ++i) { + scanners[i]->finishFill(inArgs[i]); + } + + { + PyGuard g; + cache_->drop(&data); + } + + DBG << "Reading CPU Batch Done."; + + if (useGpu_) { + std::vector& cpuArguments = cpuBatch.getStreams(); + DataBatch& gpuBatch = *batch; + std::vector& gpuArguments = gpuBatch.getStreams(); + gpuArguments.resize(cpuArguments.size()); + gpuBatch.setSize(bsize); + for (size_t i = 0; i < headers_.size(); ++i) { + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); + } + hl_stream_synchronize(HPPL_STREAM_1); + } else { + *batch = cpuBatch; + } + return bsize; + } +}; + +PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); + +REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); + +/** + * Scanner for dense slot. + */ +class DenseScanner : public IFieldScanner { + public: + explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {} + + /** + * Prepare. + * @param argument target argument + * @param obj each timestep of a sample. + */ + virtual void prepare(Argument& argument, PyObject* obj) { ++height_; } + + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreate( + argument.value, height_, headerPtr_->dim, false, false); + height_ = 0; + } + + /** + * Fill argument from obj. + * @param argument + * @param obj + */ + virtual void fill(Argument& argument, PyObject* obj) { + real* dat = argument.value->getData() + height_ * headerPtr_->dim; + if (PyArray_Check(obj)) { + auto dtype = PyArray_DTYPE((PyArrayObject*)obj); + if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { + real* data = (real*)PyArray_DATA((PyArrayObject*)obj); + auto sz = PyArray_SIZE((PyArrayObject*)obj); + std::copy(data, data + sz, dat); + } else { + LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array"; + } + } else { + py::SequenceHelper s(obj); + // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. + for (size_t i = 0; i < headerPtr_->dim; ++i) { + dat[i] = (real)s.getDouble(i); + } + } + ++height_; + } + + private: + size_t height_; +}; + +/** + * Scanner for index slot + */ +class IndexScanner : public IFieldScanner { + public: + explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {} + + /** + * Prepare memory space. + * + * @note obj is a single timestep of sample + */ + virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; } + + virtual void finishPrepare(Argument& argument) { + IVector::resizeOrCreate(argument.ids, cnt_, false); + cnt_ = 0; + } + + /** + * Fill one index to argument. 
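[Note] DenseScanner above follows the two-pass protocol shared by all scanners: prepare() only counts incoming rows, finishPrepare() sizes the destination once, and fill() then writes each row at a running offset. A condensed sketch of that protocol against a plain std::vector instead of paddle's Matrix/Argument types (the float rows are made up for illustration):

#include <algorithm>
#include <cstdio>
#include <vector>

class TwoPassDenseScanner {
 public:
  explicit TwoPassDenseScanner(size_t dim) : dim_(dim) {}

  // Pass 1: just count how many rows will arrive.
  void prepare(const std::vector<float>&) { ++rows_; }
  // Allocate the whole destination once, then reset the cursor for pass 2.
  void finishPrepare() { dest_.assign(rows_ * dim_, 0.0f); rows_ = 0; }
  // Pass 2: copy each row into its slot, advancing the cursor.
  void fill(const std::vector<float>& row) {
    std::copy(row.begin(), row.end(), dest_.begin() + rows_ * dim_);
    ++rows_;
  }
  const std::vector<float>& data() const { return dest_; }

 private:
  size_t dim_;
  size_t rows_ = 0;
  std::vector<float> dest_;
};

int main() {
  std::vector<std::vector<float>> samples = {{1, 2, 3}, {4, 5, 6}};
  TwoPassDenseScanner scanner(3);
  for (auto& s : samples) scanner.prepare(s);  // pass 1: size everything
  scanner.finishPrepare();
  for (auto& s : samples) scanner.fill(s);     // pass 2: copy data in
  std::printf("filled %zu values\n", scanner.data().size());
  return 0;
}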
+ */ + virtual void fill(Argument& argument, PyObject* obj) { + bool ok; + argument.ids->getData()[cnt_++] = py::castInt(obj, &ok); + CHECK(ok) << "Cannot cast int " << py::repr(obj); + } + + private: + size_t cnt_; +}; + +class SparseNonValueScanner : public IFieldScanner { + public: + explicit SparseNonValueScanner(SlotHeader* ptr) + : IFieldScanner(ptr), nnz_(0), height_(0) {} + + /** + * Prepare memory space + * @note obj is a timestep of one sample. + */ + virtual void prepare(Argument& argument, PyObject* obj) { + ++height_; + nnz_ += py::SequenceHelper(obj).size(); + } + + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreateSparseMatrix( + argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE); + } + + virtual void startFill(Argument& argument) { + auto smat = (CpuSparseMatrix*)(argument.value.get()); + smat->getRows()[0] = 0; + nnz_ = 0; + height_ = 1; + } + + /** + * Fill one sparse vector to argument. + * @note obj is a timestep of one sample. + */ + virtual void fill(Argument& argument, PyObject* obj) { + py::SequenceHelper s(obj); + auto sz = s.size(); + auto smat = (CpuSparseMatrix*)(argument.value.get()); + int* row = smat->getRows(); + int* col = smat->getCols(); + real* dat = smat->getData(); + row[height_] = row[height_ - 1] + (int)sz; + + for (decltype(sz) i = 0; i < sz; ++i) { + setData(col + nnz_, dat + nnz_, s[i]); + ++nnz_; + } + ++height_; + } + + protected: + /** + * Set a single sparse index and value. + * @param [out] col sparse index + * @param [out] dat sparse value + * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong. + * For sparse_value is a Tuple (int, float). + */ + virtual void setData(int* col, real* dat, PyObject* obj) { + bool ok; + *col = py::castInt(obj, &ok); + CHECK(ok); + } + + size_t nnz_; + size_t height_; +}; + +class SparseValueScanner : public SparseNonValueScanner { + public: + explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {} + + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreateSparseMatrix( + argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE); + } + + protected: + virtual void setData(int* col, real* dat, PyObject* obj) { + py::SequenceHelper s(obj); + SparseNonValueScanner::setData(col, dat, s[0]); + *dat = (real)s.getDouble(1); + } +}; + +/** + * Sequence Scanner. Scanner for sequence or sub-sequence. + */ +class SequenceScanner : public IFieldScanner { + public: + /** + * Ctor + * @param innerScanner inner scanner for each timestep or sub-sequence. + * @param getSeqStartPos A callback, (Argument) => ICpuGpuVectorPtr. + * return a sequence start position or a sub-sequence + * start position. + */ + SequenceScanner( + std::unique_ptr&& innerScanner, + const std::function& getSeqStartPos) + : IFieldScanner(nullptr), + inner_(std::move(innerScanner)), + cnt_(0), + getSeqStartPos_(getSeqStartPos) {} + + /** + * Start prepare. Invoke inner->startPrepare too. + */ + virtual void startPrepare(Argument& argument) { + inner_->startPrepare(argument); + } + + /** + * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each + * element of sequence obj. + */ + virtual void prepare(Argument& argument, PyObject* obj) { + py::SequenceHelper s(obj); + ++cnt_; + for (size_t i = 0; i < s.size(); ++i) { + inner_->prepare(argument, s[i]); + } + } + + /** + * Finish prepare. invoke inner_->finishPrepare too. 
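[Note] SparseNonValueScanner above sizes the sparse matrix during the prepare pass (summing nnz and rows) and then extends the CSR row-pointer array one timestep at a time in fill(): row[height] = row[height - 1] + number of entries in this row, with column ids (and, in SparseValueScanner, values) appended at the running nnz cursor. A stripped-down sketch of that row-by-row construction on plain vectors, with made-up input and no paddle CpuSparseMatrix involved:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Each element is one row given as (column, value) pairs, as a
  // sparse_value slot would yield them.
  std::vector<std::vector<std::pair<int, float>>> rows = {
      {{2, 0.5f}, {7, 1.5f}}, {{1, 2.0f}}, {{0, 3.0f}, {4, 0.25f}, {9, 1.0f}}};

  std::vector<int> rowPtr = {0};  // CSR row pointers, starting from 0
  std::vector<int> cols;
  std::vector<float> vals;

  for (const auto& row : rows) {
    // Extend the row pointer: previous offset plus this row's entry count.
    rowPtr.push_back(rowPtr.back() + static_cast<int>(row.size()));
    for (const auto& cv : row) {  // append columns/values at the nnz cursor
      cols.push_back(cv.first);
      vals.push_back(cv.second);
    }
  }

  std::printf("rows=%zu nnz=%zu last rowPtr=%d\n",
              rows.size(), cols.size(), rowPtr.back());
  return 0;
}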
+ */ + virtual void finishPrepare(Argument& argument) { + ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false); + inner_->finishPrepare(argument); + } + + /** + * Start fill. invoke inner->startFill too. + */ + virtual void startFill(Argument& argument) { + getSeqStartPos_(argument)->getMutableData(false)[0] = 0; + cnt_ = 1; + inner_->startFill(argument); + } + + /** + * Fill. Obj is a tuple or list. invoke inner->fill for each element of + * sequence obj. And set seqStartPos at same time. The seqStartPos will be + * calculated by getSeqStartPos callback passed in ctor. + */ + virtual void fill(Argument& argument, PyObject* obj) { + getSeqStartPos_(argument)->getMutableData(false)[cnt_] = + getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] + + (int)getSize(obj); + py::SequenceHelper s(obj); + ++cnt_; + for (size_t i = 0; i < s.size(); ++i) { + inner_->fill(argument, s[i]); + } + } + + /** + * Finish fill. will invoke inner->finishFill too. + */ + virtual void finishFill(Argument& argument) { inner_->finishFill(argument); } + + protected: + size_t getSize(PyObject* obj) { + py::SequenceHelper s(obj); + auto sc = dynamic_cast(inner_.get()); + if (sc) { + size_t sum = 0; + for (size_t i = 0; i < s.size(); ++i) { + sum += sc->getSize(s[i]); + } + return sum; + } else { + return s.size(); + } + } + + private: + std::unique_ptr inner_; + size_t cnt_; + std::function getSeqStartPos_; +}; + +IFieldScanner* IFieldScanner::create(SlotHeader* header) { + IFieldScanner* retv = nullptr; + switch (header->slotType) { + case ST_DENSE: + retv = new DenseScanner(header); + break; + case ST_INDEX: + retv = new IndexScanner(header); + break; + case ST_NON_SPARSE_VALUE: + retv = new SparseNonValueScanner(header); + break; + case ST_SPARSE_VALUE: + retv = new SparseValueScanner(header); + break; + default: + LOG(FATAL) << "Not implemented " << header->slotType; + } + + switch (header->seqType) { + case SQT_NONE: + break; + case SQT_SUBSEQ: + retv = new SequenceScanner(std::unique_ptr(retv), + [](Argument& arg) -> ICpuGpuVectorPtr& { + return arg.subSequenceStartPositions; + }); + // fall through, not break; + case SQT_SEQ: + retv = new SequenceScanner(std::unique_ptr(retv), + [](Argument& arg) -> ICpuGpuVectorPtr& { + return arg.sequenceStartPositions; + }); + break; + default: + LOG(FATAL) << "Not implemented"; + } + + return retv; +} + +/** + * No Cache Strategy. Will destruct old data immediately and load data from + * python every pass. + */ +class NoCacheStrategy : public IPyDataProviderCache { + public: + virtual bool reset() { return true; } + + virtual void drop(std::deque* data) { data->clear(); } + + virtual std::deque* load() { return nullptr; } +}; + +/** + * Cache One Pass In Memory strategy. + * + * In first pass, will load data from python and store them in memory. + * The rest passes, will load data from memory. 
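[Note] SequenceScanner above is a decorator: it records one start offset per (sub)sequence and forwards every element to the wrapped scanner, and IFieldScanner::create stacks it twice for SQT_SUBSEQ (hence the deliberate fall-through) so both sub-sequence and sequence start positions get produced. A toy sketch of the same composition idea, flattening nested vectors while recording start offsets; plain STL types only, not the real Argument plumbing:

#include <cstdio>
#include <vector>

// Flatten one level of nesting, appending one start offset per inner
// sequence; `starts` ends with a trailing sentinel equal to the total length.
static std::vector<int> flatten(const std::vector<std::vector<int>>& seqs,
                                std::vector<int>* starts) {
  std::vector<int> flat;
  starts->assign(1, 0);
  for (const auto& seq : seqs) {
    flat.insert(flat.end(), seq.begin(), seq.end());
    starts->push_back(static_cast<int>(flat.size()));  // running offset
  }
  return flat;
}

int main() {
  // Two sequences of timesteps; a SUBSEQ slot would nest one level deeper
  // and apply the same wrapping a second time.
  std::vector<std::vector<int>> sample = {{1, 2, 3}, {4, 5}};
  std::vector<int> seqStarts;
  std::vector<int> flat = flatten(sample, &seqStarts);

  std::printf("flattened %zu timesteps; starts:", flat.size());
  for (int s : seqStarts) std::printf(" %d", s);
  std::printf("\n");
  return 0;
}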
+ */ +class CacheOnePassInMemory : public IPyDataProviderCache { + public: + CacheOnePassInMemory() + : objPool_(new std::deque()), + droppedPool_(new std::deque()) {} + + virtual bool reset() { + if (objPool_->empty() && droppedPool_->empty()) { + return true; + } else if (objPool_->empty()) { + std::swap(objPool_, droppedPool_); + return false; + } else { + LOG(FATAL) << "Unexpected branch"; + } + } + + virtual void drop(std::deque* data) { + size_t orgSize = droppedPool_->size(); + droppedPool_->resize(orgSize + data->size()); + for (size_t i = 0; i < data->size(); ++i) { + std::swap((*droppedPool_)[orgSize + i], (*data)[i]); + } + data->clear(); + } + + virtual std::deque* load() { return objPool_.get(); } + + private: + std::unique_ptr> objPool_; + std::unique_ptr> droppedPool_; +}; + +IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) { + switch (ct) { + case NO_CACHE: + return new NoCacheStrategy(); + case CACHE_PASS_IN_MEM: + return new CacheOnePassInMemory(); + default: + LOG(FATAL) << "Not implemented"; + } +} +} // namespace paddle + +#endif diff --git a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c145adda5e04fb4a35df480fd3d0cf93ad453e0d --- /dev/null +++ b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp @@ -0,0 +1,320 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
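[Note] CacheOnePassInMemory above keeps two deques: drop() moves consumed objects into droppedPool_, and reset() swaps the pools back so the second and later passes replay the cached data without calling into Python again (returning false so no new loader thread is started). A compact sketch of that swap-on-reset behaviour, with ints in place of PyObjectPtr and the error branch simplified:

#include <cstdio>
#include <deque>
#include <memory>
#include <utility>

class OnePassCache {
 public:
  OnePassCache()
      : pool_(new std::deque<int>()), dropped_(new std::deque<int>()) {}

  // true  -> caller should read fresh data (first pass),
  // false -> replay what was cached during the first pass.
  bool reset() {
    if (pool_->empty() && dropped_->empty()) return true;  // nothing cached yet
    if (pool_->empty()) std::swap(pool_, dropped_);        // replay cached data
    return false;  // (the real code LOG(FATAL)s on other states)
  }
  void drop(std::deque<int>* used) {  // stash consumed items for the next pass
    for (int v : *used) dropped_->push_back(v);
    used->clear();
  }
  std::deque<int>* load() { return pool_.get(); }

 private:
  std::unique_ptr<std::deque<int>> pool_;
  std::unique_ptr<std::deque<int>> dropped_;
};

int main() {
  OnePassCache cache;
  std::printf("first pass reads python: %s\n", cache.reset() ? "yes" : "no");
  std::deque<int> used = {1, 2, 3};
  cache.drop(&used);  // pass 1 consumed three items
  std::printf("second pass reads python: %s, cached=%zu\n",
              cache.reset() ? "yes" : "no", cache.load()->size());
  return 0;
}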
*/ + +#include "Evaluator.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/utils/StringUtil.h" + +namespace paddle { + +/** + * calculate sequence-to-sequence edit distance + */ +class CTCErrorEvaluator : public Evaluator { + private: + MatrixPtr outActivations_; + int numTimes_, numClasses_, numSequences_, blank_; + real deletions_, insertions_, substitutions_; + int seqClassficationError_; + mutable std::unordered_map evalResults_; + + std::vector path2String(const std::vector& path) { + std::vector str; + str.clear(); + int prevLabel = -1; + for (std::vector::const_iterator label = path.begin(); + label != path.end(); + label++) { + if (*label != blank_ && + (str.empty() || *label != str.back() || prevLabel == blank_)) { + str.push_back(*label); + } + prevLabel = *label; + } + return str; + } + + std::vector bestLabelSeq() { + std::vector path; + path.clear(); + real* acts = outActivations_->getData(); + for (int i = 0; i < numTimes_; ++i) { + path.push_back(std::max_element(acts + i * numClasses_, + acts + (i + 1) * numClasses_) - + (acts + i * numClasses_)); + } + return path2String(path); + } + + /* "sp, dp, ip" is the weighting parameter of "substitution, deletion, + * insertion" + * in edit-distance error */ + real stringAlignment(std::vector& gtStr, + std::vector& recogStr, + bool backtrace = true, + real sp = 1.0, + real dp = 1.0, + real ip = 1.0) { + std::vector> matrix; + int substitutions, deletions, insertions; + real distance; + int n = gtStr.size(); + int m = recogStr.size(); + + if (n == 0) { + substitutions = 0; + deletions = 0; + insertions = m; + distance = m; + } else if (m == 0) { + substitutions = 0; + deletions = n; + insertions = 0; + distance = n; + } else { + substitutions = 0; + deletions = 0; + insertions = 0; + distance = 0; + // initialize the matrix + matrix.resize(n + 1); + for (int i = 0; i < n + 1; ++i) { + matrix[i].resize(m + 1); + for (int j = 0; j < m + 1; ++j) { + matrix[i][j] = 0; + } + } + for (int i = 0; i < n + 1; ++i) { + matrix[i][0] = i; + } + for (int j = 0; j < m + 1; ++j) { + matrix[0][j] = j; + } + + // calculate the insertions, substitutions and deletions + for (int i = 1; i < n + 1; ++i) { + int s_i = gtStr[i - 1]; + for (int j = 1; j < m + 1; ++j) { + int t_j = recogStr[j - 1]; + int cost = (s_i == t_j) ? 
0 : 1; + const int above = matrix[i - 1][j]; + const int left = matrix[i][j - 1]; + const int diag = matrix[i - 1][j - 1]; + const int cell = std::min(above + 1, std::min(left + 1, diag + cost)); + matrix[i][j] = cell; + } + } + + if (backtrace) { + size_t i = n; + size_t j = m; + substitutions = 0; + deletions = 0; + insertions = 0; + + while (i != 0 && j != 0) { + if (matrix[i][j] == matrix[i - 1][j - 1]) { + --i; + --j; + } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) { + ++substitutions; + --i; + --j; + } else if (matrix[i][j] == matrix[i - 1][j] + 1) { + ++deletions; + --i; + } else { + ++insertions; + --j; + } + } + while (i != 0) { + ++deletions; + --i; + } + while (j != 0) { + ++insertions; + --j; + } + int diff = substitutions + deletions + insertions; + if (diff != matrix[n][m]) { + LOG(ERROR) << "Found path with distance " << diff + << " but Levenshtein distance is " << matrix[n][m]; + } + + distance = (sp * substitutions) + (dp * deletions) + (ip * insertions); + } else { + distance = (real)matrix[n][m]; + } + } + real maxLen = std::max(m, n); + deletions_ += deletions / maxLen; + insertions_ += insertions / maxLen; + substitutions_ += substitutions / maxLen; + + if (distance != 0) { + seqClassficationError_ += 1; + } + + return distance / maxLen; + } + + real editDistance( + real* output, int numTimes, int numClasses, int* labels, int labelsLen) { + numTimes_ = numTimes; + numClasses_ = numClasses; + blank_ = numClasses_ - 1; + outActivations_ = Matrix::create(output, numTimes, numClasses); + std::vector recogStr, gtStr; + recogStr = bestLabelSeq(); + for (int i = 0; i < labelsLen; ++i) { + gtStr.push_back(labels[i]); + } + + return stringAlignment(gtStr, recogStr); + } + + void storeLocalValues() const { + evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0; + evalResults_["deletion_error"] = + numSequences_ ? deletions_ / numSequences_ : 0; + evalResults_["insertion_error"] = + numSequences_ ? insertions_ / numSequences_ : 0; + evalResults_["substitution_error"] = + numSequences_ ? 
substitutions_ / numSequences_ : 0; + evalResults_["sequence_error"] = + (real)seqClassficationError_ / numSequences_; + } + + public: + CTCErrorEvaluator() + : numTimes_(0), + numClasses_(0), + numSequences_(0), + blank_(0), + deletions_(0), + insertions_(0), + substitutions_(0), + seqClassficationError_(0) {} + + virtual real evalImp(std::vector& arguments) { + CHECK_EQ(arguments.size(), (size_t)2); + Argument output, label; + output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT); + label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + CHECK(label.sequenceStartPositions); + CHECK(label.ids); + size_t numSequences = label.sequenceStartPositions->getSize() - 1; + const int* labelStarts = label.sequenceStartPositions->getData(false); + const int* outputStarts = output.sequenceStartPositions->getData(false); + real totalErr = 0; + for (size_t i = 0; i < numSequences; ++i) { + real err = 0; + err = editDistance( + output.value->getData() + output.value->getWidth() * outputStarts[i], + outputStarts[i + 1] - outputStarts[i], + output.value->getWidth(), + label.ids->getData() + labelStarts[i], + labelStarts[i + 1] - labelStarts[i]); + + totalErr += err; + } + + return totalErr; + } + + virtual void eval(const NeuralNetwork& nn) { + Evaluator::eval(nn); + std::vector arguments; + arguments.reserve(config_.input_layers_size()); + for (const std::string& name : config_.input_layers()) { + arguments.push_back(nn.getLayer(name)->getOutput()); + } + } + + virtual void updateSamplesNum(const std::vector& arguments) { + numSequences_ += arguments[1].getNumSequences(); + } + + virtual void start() { + Evaluator::start(); + numSequences_ = 0; + blank_ = 0; + deletions_ = 0; + insertions_ = 0; + substitutions_ = 0; + seqClassficationError_ = 0; + } + + virtual void printStats(std::ostream& os) const { + storeLocalValues(); + os << config_.name() << " error = " << evalResults_["error"]; + os << " deletions error = " << evalResults_["deletion_error"]; + os << " insertions error = " << evalResults_["insertion_error"]; + os << " substitution error = " << evalResults_["substitution_error"]; + os << " sequence error = " << evalResults_["sequence_error"]; + } + + virtual void distributeEval(ParameterClient2* client) { + double buf[6] = {totalScore_, + (double)deletions_, + (double)insertions_, + (double)substitutions_, + (double)seqClassficationError_, + (double)numSequences_}; + client->reduce(buf, buf, 6, FLAGS_trainer_id, 0); + totalScore_ = buf[0]; + deletions_ = (real)buf[1]; + insertions_ = (real)buf[2]; + substitutions_ = (real)buf[3]; + seqClassficationError_ = (int)buf[4]; + numSequences_ = (int)buf[5]; + } + + void getNames(std::vector* names) { + storeLocalValues(); + names->reserve(names->size() + evalResults_.size()); + for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) { + names->push_back(config_.name() + "." 
+ it->first); + } + } + + real getValue(const std::string& name, Error* err) const { + storeLocalValues(); + + std::vector buffers; + paddle::str::split(name, '.', &buffers); + auto it = evalResults_.find(buffers[buffers.size() - 1]); + + if (it == evalResults_.end()) { + *err = Error("Evaluator does not have the key %s", name.c_str()); + return 0.0f; + } + + return it->second; + } + + std::string getType(const std::string& name, Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "ctc_edit_distance"; + } +}; + +REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator); + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0ff3f2fa8cf06c13ef327aa7ae2511bfc0d028be --- /dev/null +++ b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp @@ -0,0 +1,296 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/StringUtil.h" + +#include "Evaluator.h" + +namespace paddle { + +/** + * Chunk evaluator is used to evaluate segment labelling accuracy for a + * sequence. It calculates the chunk detection F1 score. + * + * A chunk is correctly detected if its beginning, end and type are correct. + * Other chunk type is ignored. + * For each label in the label sequence, we have + * + * @code + * tagType = label % numTagType + * chunkType = label / numTagType + * otherChunkType = numChunkTypes + * @endcode + * + * The total number of different labels is numTagType*numChunkTypes+1 + * We support 4 labelling scheme + * The tag type for each of the scheme is shown as follows: + * + * @code + * Scheme Begin Inside End Single + * plain 0 - - - + * IOB 0 1 - - + * IOE - 0 1 - + * IOBES 0 1 2 3 + * @endcode + * + * 'plain' means the whole chunk must contain exactly the same chunk label. 
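[Note] The label encoding described above packs a tag and a chunk type into one integer: tag = label % numTagTypes, type = label / numTagTypes, with label == numTagTypes * numChunkTypes reserved for the "other" (outside) label. A small sketch decoding IOB-style labels, assuming numTagTypes = 2 (tag 0 = begin, tag 1 = inside) and a hypothetical three-type tag set:

#include <cstdio>

int main() {
  const int numTagTypes = 2;    // IOB: 0 = B, 1 = I
  const int numChunkTypes = 3;  // e.g. three entity types
  const int otherLabel = numTagTypes * numChunkTypes;  // the "outside" label

  const int labels[] = {0, 1, 6, 4, 5};  // hypothetical label sequence
  for (int label : labels) {
    if (label == otherLabel) {
      std::printf("label %d -> O (outside any chunk)\n", label);
      continue;
    }
    int tag = label % numTagTypes;   // begin vs inside
    int type = label / numTagTypes;  // which chunk type
    std::printf("label %d -> tag %s, chunk type %d\n",
                label, tag == 0 ? "B" : "I", type);
  }
  return 0;
}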
+ */ +class ChunkEvaluator : public Evaluator { + int otherChunkType_; + int numChunkTypes_; // number of chunk types besides other chunk type + int numTagTypes_; + int tagBegin_; + int tagInside_; + int tagEnd_; + int tagSingle_; + + int64_t numLabelSegments_; + int64_t numOutputSegments_; + int64_t numCorrect_; + + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + std::vector labelSegments_; + std::vector outputSegments_; + std::set excludedChunkTypes_; + mutable std::unordered_map values_; + + public: + virtual void init(const EvaluatorConfig& config) { + Evaluator::init(config); + if (config.chunk_scheme() == "IOB") { + numTagTypes_ = 2; + tagBegin_ = 0; + tagInside_ = 1; + tagEnd_ = -1; + tagSingle_ = -1; + } else if (config.chunk_scheme() == "IOE") { + numTagTypes_ = 2; + tagBegin_ = -1; + tagInside_ = 0; + tagEnd_ = 1; + tagSingle_ = -1; + } else if (config.chunk_scheme() == "IOBES") { + numTagTypes_ = 4; + tagBegin_ = 0; + tagInside_ = 1; + tagEnd_ = 2; + tagSingle_ = 3; + } else if (config.chunk_scheme() == "plain") { + numTagTypes_ = 1; + tagBegin_ = -1; + tagInside_ = -1; + tagEnd_ = -1; + tagSingle_ = -1; + } else { + LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme(); + } + CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config"; + otherChunkType_ = numChunkTypes_ = config.num_chunk_types(); + + // the chunks of types in excludedChunkTypes_ will not be counted + auto& tmp = config.excluded_chunk_types(); + excludedChunkTypes_.insert(tmp.begin(), tmp.end()); + } + + virtual void start() { + Evaluator::start(); + numLabelSegments_ = 0; + numOutputSegments_ = 0; + numCorrect_ = 0; + } + + virtual void printStats(std::ostream& os) const { + storeLocalValues(); + os << config_.name() << "=" << values_["F1-score"] + << " true_chunks=" << numLabelSegments_ + << " result_chunks=" << numOutputSegments_ + << " correct_chunks=" << numCorrect_; + } + + virtual void distributeEval(ParameterClient2* client) { + int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_}; + client->reduce(buf, buf, 3, FLAGS_trainer_id, 0); + numLabelSegments_ = buf[0]; + numOutputSegments_ = buf[1]; + numCorrect_ = buf[2]; + } + + virtual real evalImp(std::vector& arguments) { + CHECK_EQ(arguments.size(), (size_t)2); + IVectorPtr& output = arguments[0].ids; + IVectorPtr& label = arguments[1].ids; + CHECK(!output->useGpu() && !label->useGpu()) << "Not supported"; + auto sequenceStartPositions = + arguments[1].sequenceStartPositions->getVector(false); + CHECK_EQ(output->getSize(), label->getSize()); + CHECK(sequenceStartPositions); + size_t numSequences = sequenceStartPositions->getSize() - 1; + const int* starts = sequenceStartPositions->getData(); + for (size_t i = 0; i < numSequences; ++i) { + eval1(output->getData() + starts[i], + label->getData() + starts[i], + starts[i + 1] - starts[i]); + } + return 0; + } + + void eval1(int* output, int* label, int length) { + getSegments(output, length, outputSegments_); + getSegments(label, length, labelSegments_); + size_t i = 0, j = 0; + while (i < outputSegments_.size() && j < labelSegments_.size()) { + if (outputSegments_[i] == labelSegments_[j] && + excludedChunkTypes_.count(outputSegments_[i].type) != 1) { + ++numCorrect_; + } + if (outputSegments_[i].end < labelSegments_[j].end) { + ++i; + } else if (outputSegments_[i].end > labelSegments_[j].end) { + ++j; + } else { + ++i; + ++j; + } + } + for 
(auto& segment : labelSegments_) { + if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_; + } + for (auto& segment : outputSegments_) { + if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_; + } + } + + void getSegments(int* label, int length, std::vector& segments) { + segments.clear(); + segments.reserve(length); + int chunkStart = 0; + bool inChunk = false; + int tag = -1; + int type = otherChunkType_; + for (int i = 0; i < length; ++i) { + int prevTag = tag; + int prevType = type; + CHECK_LE(label[i], numChunkTypes_ * numTagTypes_); + tag = label[i] % numTagTypes_; + type = label[i] / numTagTypes_; + if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) { + Segment segment{ + chunkStart, // begin + i - 1, // end + prevType, + }; + segments.push_back(segment); + inChunk = false; + } + if (isChunkBegin(prevTag, prevType, tag, type)) { + chunkStart = i; + inChunk = true; + } + } + if (inChunk) { + Segment segment{ + chunkStart, // begin + length - 1, // end + type, + }; + segments.push_back(segment); + } + } + + // whether (prevTag, prevType) is the end of a chunk + bool isChunkEnd(int prevTag, int prevType, int tag, int type) { + if (prevType == otherChunkType_) return false; + if (type == otherChunkType_) return true; + if (type != prevType) return true; + if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_; + if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_; + if (prevTag == tagEnd_) return true; + if (prevTag == tagSingle_) return true; + return false; + } + + // whether (tag, type) is the beginning of a chunk + bool isChunkBegin(int prevTag, int prevType, int tag, int type) { + if (prevType == otherChunkType_) return type != otherChunkType_; + if (type == otherChunkType_) return false; + if (type != prevType) return true; + if (tag == tagBegin_) return true; + if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_; + if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_; + if (tag == tagSingle_) return true; + return false; + } + + // three metrics: precision, recall and F1-score + void getNames(std::vector* names) { + storeLocalValues(); + names->reserve(names->size() + values_.size()); + for (auto it = values_.begin(); it != values_.end(); ++it) { + names->push_back(config_.name() + "." + it->first); + } + } + + // get value by field name + real getValue(const std::string& name, Error* err) const { + storeLocalValues(); + std::vector buffers; + paddle::str::split(name, '.', &buffers); + auto it = values_.find(buffers.back()); + if (it == values_.end()) { // not found + *err = Error("No such key %s", name.c_str()); + return 0.0f; + } + + return it->second; + } + + // get type of evaluator + std::string getType(const std::string& name, Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "chunk"; + } + + private: + void storeLocalValues() const { + CHECK_GE(numOutputSegments_, 0); + CHECK_GE(numLabelSegments_, 0); + double precision = + !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_; + double recall = + !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_; + values_["precision"] = precision; + values_["recall"] = recall; + values_["F1-score"] = + !numCorrect_ ? 
0 : 2 * precision * recall / (precision + recall); + } +}; + +REGISTER_EVALUATOR(chunk, ChunkEvaluator); + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..57657241f8c1517f674670d34cb984b85996bfc7 --- /dev/null +++ b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp @@ -0,0 +1,308 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Evaluator.h" +#include "paddle/legacy/gserver/layers/DetectionUtil.h" + +using std::map; +using std::vector; +using std::pair; +using std::make_pair; + +namespace paddle { + +/** + * @brief detection map Evaluator + * + * The config file api is detection_map_evaluator. + */ +class DetectionMAPEvaluator : public Evaluator { + public: + DetectionMAPEvaluator() + : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {} + + virtual void start() { + Evaluator::start(); + allTruePos_.clear(); + allFalsePos_.clear(); + numPos_.clear(); + } + + virtual real evalImp(std::vector& arguments) { + overlapThreshold_ = config_.overlap_threshold(); + backgroundId_ = config_.background_id(); + evaluateDifficult_ = config_.evaluate_difficult(); + apType_ = config_.ap_type(); + + MatrixPtr detectTmpValue = arguments[0].value; + Matrix::resizeOrCreate(cpuOutput_, + detectTmpValue->getHeight(), + detectTmpValue->getWidth(), + false, + false); + + MatrixPtr labelTmpValue = arguments[1].value; + Matrix::resizeOrCreate(cpuLabel_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + cpuOutput_->copyFrom(*detectTmpValue); + cpuLabel_->copyFrom(*labelTmpValue); + + Argument label = arguments[1]; + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t batchSize = label.getNumSequences(); + + vector>> allGTBBoxes; + vector>>> allDetectBBoxes; + + for (size_t n = 0; n < batchSize; ++n) { + map> bboxes; + for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) { + vector bbox; + getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox); + int c = cpuLabel_->getData()[i * 6]; + bboxes[c].push_back(bbox[0]); + } + allGTBBoxes.push_back(bboxes); + } + + size_t n = 0; + const real* cpuOutputData = cpuOutput_->getData(); + for (size_t imgId = 0; imgId < batchSize; ++imgId) { + map>> bboxes; + size_t curImgId = static_cast((cpuOutputData + n * 7)[0]); + while (curImgId == imgId && n < cpuOutput_->getHeight()) { + vector label; + vector score; + vector bbox; + getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox); + bboxes[label[0]].push_back(make_pair(score[0], bbox[0])); + ++n; + curImgId = static_cast((cpuOutputData + n * 7)[0]); + } + allDetectBBoxes.push_back(bboxes); + } + + for (size_t n = 0; n < batchSize; ++n) { + for (map>::iterator it = + allGTBBoxes[n].begin(); + it != allGTBBoxes[n].end(); + ++it) { + size_t count = 0; + if (evaluateDifficult_) { + count = 
it->second.size(); + } else { + for (size_t i = 0; i < it->second.size(); ++i) + if (!(it->second[i].isDifficult)) ++count; + } + if (numPos_.find(it->first) == numPos_.end() && count != 0) { + numPos_[it->first] = count; + } else { + numPos_[it->first] += count; + } + } + } + + // calcTFPos + calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes); + + return 0; + } + + virtual void printStats(std::ostream& os) const { + real mAP = calcMAP(); + os << "Detection mAP=" << mAP; + } + + virtual void distributeEval(ParameterClient2* client) { + LOG(FATAL) << "Distribute detection evaluation not implemented."; + } + + protected: + void calcTFPos(const size_t batchSize, + const vector>>& allGTBBoxes, + const vector>>>& + allDetectBBoxes) { + for (size_t n = 0; n < allDetectBBoxes.size(); ++n) { + if (allGTBBoxes[n].size() == 0) { + for (map>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + for (size_t i = 0; i < it->second.size(); ++i) { + allTruePos_[label].push_back(make_pair(it->second[i].first, 0)); + allFalsePos_[label].push_back(make_pair(it->second[i].first, 1)); + } + } + } else { + for (map>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + vector> predBBoxes = it->second; + if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) { + for (size_t i = 0; i < predBBoxes.size(); ++i) { + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1)); + } + } else { + vector gtBBoxes = + allGTBBoxes[n].find(label)->second; + vector visited(gtBBoxes.size(), false); + // Sort detections in descend order based on scores + std::sort(predBBoxes.begin(), + predBBoxes.end(), + sortScorePairDescend); + for (size_t i = 0; i < predBBoxes.size(); ++i) { + real maxOverlap = -1.0; + size_t maxIdx = 0; + for (size_t j = 0; j < gtBBoxes.size(); ++j) { + real overlap = + jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]); + if (overlap > maxOverlap) { + maxOverlap = overlap; + maxIdx = j; + } + } + if (maxOverlap > overlapThreshold_) { + if (evaluateDifficult_ || + (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) { + if (!visited[maxIdx]) { + allTruePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 0)); + visited[maxIdx] = true; + } else { + allTruePos_[label].push_back( + make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } else { + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } + } + } + } + } + + real calcMAP() const { + real mAP = 0.0; + size_t count = 0; + for (map::const_iterator it = numPos_.begin(); + it != numPos_.end(); + ++it) { + size_t label = it->first; + size_t labelNumPos = it->second; + if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end()) + continue; + vector> labelTruePos = allTruePos_.find(label)->second; + vector> labelFalsePos = + allFalsePos_.find(label)->second; + // Compute average precision. + vector tpCumSum; + getAccumulation(labelTruePos, &tpCumSum); + vector fpCumSum; + getAccumulation(labelFalsePos, &fpCumSum); + std::vector precision, recall; + size_t num = tpCumSum.size(); + // Compute Precision. 
+ for (size_t i = 0; i < num; ++i) { + CHECK_LE(tpCumSum[i], labelNumPos); + precision.push_back(static_cast(tpCumSum[i]) / + static_cast(tpCumSum[i] + fpCumSum[i])); + recall.push_back(static_cast(tpCumSum[i]) / labelNumPos); + } + // VOC2007 style + if (apType_ == "11point") { + vector maxPrecisions(11, 0.0); + int startIdx = num - 1; + for (int j = 10; j >= 0; --j) + for (int i = startIdx; i >= 0; --i) { + if (recall[i] < j / 10.) { + startIdx = i; + if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j]; + break; + } else { + if (maxPrecisions[j] < precision[i]) + maxPrecisions[j] = precision[i]; + } + } + for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11; + ++count; + } else if (apType_ == "Integral") { + // Nature integral + real averagePrecisions = 0.; + real prevRecall = 0.; + for (size_t i = 0; i < num; ++i) { + if (fabs(recall[i] - prevRecall) > 1e-6) + averagePrecisions += precision[i] * fabs(recall[i] - prevRecall); + prevRecall = recall[i]; + } + mAP += averagePrecisions; + ++count; + } else { + LOG(FATAL) << "Unkown ap version: " << apType_; + } + } + if (count != 0) mAP /= count; + return mAP * 100; + } + + void getAccumulation(vector> inPairs, + vector* accuVec) const { + std::stable_sort( + inPairs.begin(), inPairs.end(), sortScorePairDescend); + accuVec->clear(); + size_t sum = 0; + for (size_t i = 0; i < inPairs.size(); ++i) { + sum += inPairs[i].second; + accuVec->push_back(sum); + } + } + + std::string getTypeImpl() const { return "detection_map"; } + + real getValueImpl() const { return calcMAP(); } + + private: + real overlapThreshold_; // overlap threshold when determining whether matched + bool evaluateDifficult_; // whether evaluate difficult ground truth + size_t backgroundId_; // class index of background + std::string apType_; // how to calculate mAP (Integral or 11point) + + MatrixPtr cpuOutput_; + MatrixPtr cpuLabel_; + + map numPos_; // counts of true objects each classification + map>> + allTruePos_; // true positive prediction + map>> + allFalsePos_; // false positive prediction +}; + +REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator); + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/Evaluator.cpp b/paddle/legacy/gserver/evaluators/Evaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a956f40d02e39ac57ca745988491c2b54741dca3 --- /dev/null +++ b/paddle/legacy/gserver/evaluators/Evaluator.cpp @@ -0,0 +1,1361 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/gserver/evaluators/Evaluator.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/StringUtil.h" + +DECLARE_int32(trainer_id); + +namespace paddle { + +void Evaluator::eval(const NeuralNetwork& nn) { + std::vector arguments; + arguments.reserve(config_.input_layers_size()); + for (const std::string& name : config_.input_layers()) { + arguments.push_back(nn.getLayer(name)->getOutput()); + } + SetDevice device(arguments[0].deviceId); + real score = evalImp(arguments); + totalScore_ += score; + updateSamplesNum(arguments); +} +/** + * @brief classification error Evaluator + * + * The config file api is classification_error_evaluator. + */ +class ClassificationErrorEvaluator : public Evaluator { + public: + /* + ClassificationErrorEvaluator() : totalScore2_(0) {} + + virtual void start() { + Evaluator::start(); + totalScore2_ = 0; + } */ + + virtual void updateSamplesNum(const std::vector& arguments) { + if (3 == arguments.size()) { + numSamples_ += arguments[2].value->getSum(); + } else { + numSamples_ += arguments[0].getBatchSize(); + } + } + + MatrixPtr calcError(std::vector& arguments) { + CHECK_GE(arguments.size(), (size_t)2); + CHECK_LE(arguments.size(), (size_t)3); + MatrixPtr& output = arguments[0].value; + IVectorPtr& label = arguments[1].ids; + MatrixPtr& multiBinaryLabel = arguments[1].value; // For multi binary label + bool supportWeight = (3 == arguments.size()) ? true : false; + MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; + if (nullptr == output || + (nullptr == label && nullptr == multiBinaryLabel) || + (supportWeight && nullptr == weight)) { + return 0; + } + + if (label != nullptr) { + CHECK_EQ(label->getSize(), output->getHeight()); + } else { + CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight()); + CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth()); + } + if (supportWeight) { + CHECK_EQ(output->getHeight(), weight->getHeight()); + CHECK_EQ((size_t)1, weight->getWidth()); + } + + const MatrixPtr errorMat = Matrix::create(output->getHeight(), + 1, + /* trans= */ false, + useGpu(arguments[0].deviceId)); + + errorMat->zeroMem(); + + if (label != nullptr) { + errorMat->classificationError(*output, *label, config_.top_k()); + } else if (dynamic_cast(multiBinaryLabel.get()) || + dynamic_cast(multiBinaryLabel.get())) { + errorMat->classificationErrorMulti( + *output, *multiBinaryLabel, config_.classification_threshold()); + } else { + errorMat->binaryClassificationError( + 0, *output, *multiBinaryLabel, config_.classification_threshold()); + } + + if (supportWeight) { + errorMat->dotMul(*errorMat, *weight); + } + return errorMat; + } + + void printStats(std::ostream& os) const { + if (config_.top_k() == 1) { + os << config_.name() << "=" + << (numSamples_ ? totalScore_ / numSamples_ : 0); + } else { + os << " top_" << config_.top_k() + << "_error=" << (numSamples_ ? 
totalScore_ / numSamples_ : 0); + } + } + + virtual real evalImp(std::vector& arguments) { + MatrixPtr errorMat = calcError(arguments); + return errorMat->getSum(); + } + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + // Evaluator interface + protected: + std::string getTypeImpl() const { return "classification_error"; } +}; + +/** + * @brief sequence classification error Evaluator + * @note sequence level classification error stats, + * if any frame in one sequence has error, the sequence is error + */ +class SequenceClassificationErrorEvaluator + : public ClassificationErrorEvaluator { + public: + virtual void updateSamplesNum(const std::vector& arguments) { + numSamples_ += arguments[0].getNumSequences(); + } + + virtual real evalImp(std::vector& arguments) { + auto sequenceStartPositions = + arguments[0].sequenceStartPositions->getVector(false); + CHECK(sequenceStartPositions != nullptr); + const int* starts = sequenceStartPositions->getData(); + + MatrixPtr errorMat = calcError(arguments); + + int errCounter = 0; + CpuVector errorVec(0, nullptr); + for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) { + errorVec.subVecFrom( + errorMat->getData(), starts[i], starts[i + 1] - starts[i]); + if (errorVec.getSum() > 0) { + errCounter += 1; + } + } + + return static_cast(errCounter); + } + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + // Evaluator interface + protected: + std::string getTypeImpl() const { return "seq_classification_error"; } +}; +REGISTER_EVALUATOR(seq_classification_error, + SequenceClassificationErrorEvaluator); +/** + * @brief sum Evaluator + * Calculate the sum of output or label + * + * The config file api is sum_evaluator. + */ +class SumEvaluator : public Evaluator { + public: + SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {} + + virtual void updateSamplesNum(const std::vector& arguments) { + if (2 == arguments.size()) { + numSamples_ += arguments[1].value->getSum(); + } else { + numSamples_ += arguments[0].getBatchSize(); + } + } + + virtual real evalImp(std::vector& arguments) { + REGISTER_TIMER("SumEvaluator"); + CHECK_GE(arguments.size(), (size_t)1); + CHECK_LE(arguments.size(), (size_t)2); + bool supportWeight = (2 == arguments.size()) ? true : false; + if (supportWeight) { + if (nullptr == arguments[1].value) { + return 0; + } + CHECK_EQ(arguments[1].value->getWidth(), (size_t)1); + } + + // The sum of output + if (arguments[0].value) { + if (supportWeight) { + CHECK_EQ(arguments[0].value->getHeight(), + arguments[1].value->getHeight()); + MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(), + arguments[0].value->getWidth(), + /* trans= */ false, + arguments[0].value->useGpu()); + tmpMat->copyFrom(*arguments[0].value); + tmpMat->rowScale(0, *tmpMat, *arguments[1].value); + return tmpMat->getSum(); + } else { + return arguments[0].value->getSum(); + } + // The sum of label + } else if (arguments[0].ids) { + size_t insNum = arguments[0].ids->getSize(); + IVectorPtr label = arguments[0].ids; + MatrixPtr weight = supportWeight ? 
arguments[1].value : nullptr; + if (dynamic_cast(label.get())) { + IVector::resizeOrCreate(cpuLabel_, insNum, false); + cpuLabel_->copyFrom(*arguments[0].ids); + + if (supportWeight) { + CHECK_EQ(insNum, arguments[1].value->getHeight()); + Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); + cpuWeight_->copyFrom(*arguments[1].value); + } + + label = cpuLabel_; + weight = cpuWeight_; + } + + if (supportWeight) { + real score = 0.0; + int* labelD = label->getData(); + real* weightD = weight->getData(); + for (size_t i = 0; i < insNum; ++i) { + score += (labelD[i] * weightD[i]); + } + return score; + } else { + return label->getSum(); + } + } else { + return 0; + } + } + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + private: + IVectorPtr cpuLabel_; + MatrixPtr cpuWeight_; + + // Evaluator interface + protected: + std::string getTypeImpl() const { return "sum"; } +}; +/** + * @brief column sum Evaluator + * @note column sum for the colIdx-th column * + * - colIdx = 0: the 0-th column. + * - colIdx > 0: the colIdx-th column. + * - colIdx < 0: the last colIdx-th column. + * + * The config file api is column_sum_evaluator. + * + */ +class ColumnSumEvaluator : public Evaluator { + public: + explicit ColumnSumEvaluator(int32_t colIdx) + : colIdx_(colIdx), colNum_(0), sum_(nullptr) {} + + virtual void start() { + Evaluator::start(); + if (nullptr != sum_) { + sum_->zeroMem(); + } + } + + virtual void updateSamplesNum(const std::vector& arguments) { + if (2 == arguments.size()) { + numSamples_ += arguments[1].value->getSum(); + } else { + numSamples_ += arguments[0].getBatchSize(); + } + } + + virtual real evalImp(std::vector& arguments) { + REGISTER_TIMER("ColumnSumEvaluator"); + CHECK_GE(arguments.size(), (size_t)1); + CHECK_LE(arguments.size(), (size_t)2); + bool supportWeight = (2 == arguments.size()) ? 
true : false; + if (nullptr == arguments[0].value || + (supportWeight && nullptr == arguments[1].value)) { + return 0; + } + + size_t insNum = arguments[0].value->getHeight(); + size_t colNum = arguments[0].value->getWidth(); + if (nullptr == sum_) { + sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false); + colNum_ = colNum; + sum_->zeroMem(); + } else { + CHECK_EQ(colNum, sum_->getWidth()); + } + + if (supportWeight) { + CHECK_EQ(insNum, arguments[1].value->getHeight()); + CHECK_EQ((size_t)1, arguments[1].value->getWidth()); + MatrixPtr tmpMat = Matrix::create(insNum, colNum); + if (arguments[0].value->useGpu()) { + tmpMat->copyFrom(*arguments[0].value); + } + if (!arguments[1].value->useGpu()) { + if (!arguments[0].value->useGpu()) { + tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value); + } else { + tmpMat->rowScale(0, *tmpMat, *arguments[1].value); + } + } else { + MatrixPtr tmp2 = Matrix::create(insNum, 1); + tmp2->copyFrom(*arguments[1].value); + if (!arguments[0].value->useGpu()) { + tmpMat->rowScale(0, *arguments[0].value, *tmp2); + } else { + tmpMat->rowScale(0, *tmpMat, *tmp2); + } + } + sum_->accumulateColSum(*tmpMat); + } else { + if (!arguments[0].value->useGpu()) { + sum_->accumulateColSum(*arguments[0].value); + } else { + MatrixPtr tmpMat = Matrix::create(insNum, colNum); + tmpMat->copyFrom(*arguments[0].value); + sum_->accumulateColSum(*tmpMat); + } + } + return 0; + } + + virtual void printStats(std::ostream& os) const { + CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0) + << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", " + << colNum_ << ")"; + size_t colIdx = 0; + if (colIdx_ >= 0) { + colIdx = colIdx_; + } else { + colIdx = colNum_ + colIdx_; + } + os << config_.name() << "=" + << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0); + } + + void distributeEval(ParameterClient2* client) { + client->reduce( + sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0); + client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0); + } + + private: + int32_t colIdx_; + size_t colNum_; + MatrixPtr sum_; /* cpu matrix */ + + // Evaluator interface + protected: + std::string getTypeImpl() const { + if (colIdx_ == -1) + return "last-column-sum"; + else + return "column-sum"; + } +}; + +void AucEvaluator::start() { + Evaluator::start(); + memset(statPos_, 0, sizeof(statPos_)); + memset(statNeg_, 0, sizeof(statNeg_)); +} + +real AucEvaluator::evalImp(std::vector& arguments) { + REGISTER_TIMER("AucEvaluator"); + CHECK_GE(arguments.size(), (size_t)2); + CHECK_LE(arguments.size(), (size_t)3); + MatrixPtr output = arguments[0].value; + IVectorPtr label = arguments[1].ids; + MatrixPtr labelval = arguments[1].value; + bool supportWeight = (3 == arguments.size()) ? true : false; + MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; + + if (nullptr == output || (supportWeight && nullptr == weight)) { + return 0; + } + size_t insNum = output->getHeight(); + size_t outputDim = output->getWidth(); + // Copy label from value to a vector. 
+ if (nullptr == label && nullptr != labelval) { + // label width is 1 + CHECK_EQ(1U, labelval->getWidth()); + VectorPtr vec = + Vector::create(labelval->getData(), insNum, output->useGpu()); + label = vec->castToInt(); + } + + CHECK_EQ(insNum, label->getSize()); + if (supportWeight) { + CHECK_EQ(insNum, weight->getHeight()); + CHECK_EQ((size_t)1, weight->getWidth()); + } + + CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0) + << "column index [" << colIdx_ << "] out of range [-" << outputDim << ", " + << outputDim << ")"; + realColumnIdx_ = 0; + if (colIdx_ >= 0) { + realColumnIdx_ = colIdx_; + } else { + realColumnIdx_ = outputDim + colIdx_; + } + + if (dynamic_cast(output.get())) { + Matrix::resizeOrCreate(cpuOutput_, + insNum, + outputDim, + /* trans=*/false, + /* useGpu=*/false); + cpuOutput_->copyFrom(*output); + IVector::resizeOrCreate(cpuLabel_, insNum, false); + cpuLabel_->copyFrom(*label); + + if (supportWeight) { + Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); + cpuWeight_->copyFrom(*weight); + } + + output = cpuOutput_; + label = cpuLabel_; + weight = cpuWeight_; + } + + real* outputD = output->getData(); + int* labelD = label->getData(); + real* weightD = supportWeight ? weight->getData() : nullptr; + size_t pos = realColumnIdx_; + + for (size_t i = 0; i < insNum; ++i) { + real value = outputD[pos]; + uint32_t binIdx = static_cast(value * kBinNum_); + CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx + << "] out of range, predict value[" << value + << "]"; + real w = supportWeight ? weightD[i] : 1.0; + if (labelD[i] == kNegativeLabel_) { + statNeg_[binIdx] += w; + } else { + statPos_[binIdx] += w; + } + pos += outputDim; + } + return 0; +} + +void AucEvaluator::distributeEval(ParameterClient2* client) { + client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0); + client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0); +} + +double AucEvaluator::calcAuc() const { + double totPos = 0.0; + double totNeg = 0.0; + double totPosPrev = 0.0; + double totNegPrev = 0.0; + double auc = 0.0; + + int64_t idx = kBinNum_; + while (idx >= 0) { + totPosPrev = totPos; + totNegPrev = totNeg; + totPos += statPos_[idx]; + totNeg += statNeg_[idx]; + auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); + --idx; + } + + if (totPos > 0.0 && totNeg > 0.0) { + return auc / totPos / totNeg; + } else { + return 0.0; + } +} + +real AucEvaluator::getValueImpl() const { return calcAuc(); } + +std::string AucEvaluator::getTypeImpl() const { + if (colIdx_ == -1) { + return "last-column-auc"; + } else { + return "auc"; + } +} + +// class RankAucEvaluator +REGISTER_EVALUATOR(rankauc, RankAucEvaluator); + +void RankAucEvaluator::start() { Evaluator::start(); } +void RankAucEvaluator::updateSamplesNum( + const std::vector& arguments) { + numSamples_ += arguments[0].getNumSequences(); +} +real RankAucEvaluator::evalImp(std::vector& arguments) { + CHECK_GE(arguments.size(), 2U); + CHECK_LE(arguments.size(), 3U); + double batchAuc = 0.0; + output_ = arguments[0].value; + click_ = arguments[1].value; + size_t batchSize = output_->getHeight(); + CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!"; + + if (arguments.size() == 3U) { + pv_ = arguments[2].value; + } else { + Matrix::resizeOrCreate(pv_, batchSize, 1, false, false); + std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0); + } + + real* outputData = output_->getData(); + real* clickData = click_->getData(); + real* pvData = pv_->getData(); + + 
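+  // Each sequence is one ranked list (e.g. the results for a single query).
+  // The per-list AUCs are summed here; the average over lists comes from the
+  // totalScore_ / numSamples_ bookkeeping, since updateSamplesNum() counts
+  // sequences rather than rows.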
auto startPos = arguments[0].sequenceStartPositions->getVector(false); + const int* startPosData = startPos->getData(); + size_t batchNum = startPos->getSize() - 1; + for (size_t i = 0; i < batchNum; ++i) { + int beginPos = startPosData[i]; + int endPos = startPosData[i + 1]; + batchAuc += calcRankAuc(outputData + beginPos, + clickData + beginPos, + pvData + beginPos, + endPos - beginPos); + } + return batchAuc; +} + +double RankAucEvaluator::calcRankAuc(real* outputData, + real* clickData, + real* pvData, + size_t size) { + outputPair_.clear(); + for (size_t i = 0; i < size; ++i) { + outputPair_.push_back(std::make_pair(outputData[i], i)); + } + std::sort(outputPair_.begin(), + outputPair_.end(), + [](const std::pair& a, const std::pair& b) { + return a.first > b.first; + }); + double aucTmp = 0.0; + double clickSum = 0.0; + double oldClickSum = 0.0; + double noClick = 0.0; + double noClickSum = 0.0; + + double lastScore = outputPair_[0].first + 1.0; + for (size_t i = 0; i < size; ++i) { + if (lastScore != outputPair_[i].first) { + aucTmp += (clickSum + oldClickSum) * noClick / 2.0; + oldClickSum = clickSum; + noClick = 0.0; + lastScore = outputPair_[i].first; + } + size_t id = outputPair_[i].second; + noClick += pvData[id] - clickData[id]; + noClickSum += noClick; + clickSum += clickData[id]; + } + aucTmp += (clickSum + oldClickSum) * noClick / 2.0; + return (clickSum * noClickSum) == 0.0 ? 0.0 + : aucTmp / (clickSum * noClickSum); +} + +std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; } + +// class PrecisionRecallEvaluator +REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator); + +void PrecisionRecallEvaluator::start() { + Evaluator::start(); + statsInfo_.clear(); + values_.clear(); +} + +real PrecisionRecallEvaluator::evalImp(std::vector& arguments) { + REGISTER_TIMER("PrecisionRecallEvaluator"); + CHECK_GE(arguments.size(), (size_t)2); + CHECK_LE(arguments.size(), (size_t)3); + MatrixPtr output = arguments[0].value; + IVectorPtr label = arguments[1].ids; + MatrixPtr multiBinaryLabel = arguments[1].value; + bool supportWeight = (3 == arguments.size()) ? true : false; + MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; + if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) || + (supportWeight && nullptr == weight)) { + return 0; + } + + size_t insNum = output->getHeight(); + size_t outputDim = output->getWidth(); + if (label != nullptr) { + CHECK_EQ(insNum, label->getSize()); + } else { + CHECK_EQ(insNum, multiBinaryLabel->getHeight()); + CHECK_EQ(outputDim, multiBinaryLabel->getWidth()); + } + if (supportWeight) { + CHECK_EQ(insNum, weight->getHeight()); + CHECK_EQ((size_t)1, weight->getWidth()); + } + + if (statsInfo_.size() != outputDim) { + statsInfo_.clear(); + statsInfo_.resize(outputDim); + } + + isMultiBinaryLabel_ = (nullptr == label) ? 
true : false; + if (label != nullptr) { + if (dynamic_cast(output.get())) { + Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false); + cpuOutput_->copyFrom(*output); + IVector::resizeOrCreate(cpuLabel_, insNum, false); + cpuLabel_->copyFrom(*label); + if (supportWeight) { + Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false); + cpuWeight_->copyFrom(*weight); + } + + output = cpuOutput_; + label = cpuLabel_; + weight = cpuWeight_; + } + calcStatsInfo(output, label, weight); + } else { + // Not support GPU for multi binary labels + CHECK(dynamic_cast(multiBinaryLabel.get())); + calcStatsInfoMulti(output, multiBinaryLabel, weight); + } + return 0; +} + +void PrecisionRecallEvaluator::printStats(std::ostream& os) const { + PrintStatsInfo info; + bool containMacroMicroInfo = getStatsInfo(&info); + os << "positive_label=" << config_.positive_label() + << " precision=" << info.precision << " recall=" << info.recall + << " F1-score=" << info.f1; + if (containMacroMicroInfo) { + os << "macro-average-precision=" << info.macroAvgPrecision + << " macro-average-recall=" << info.macroAvgRecall + << " macro-average-F1-score=" << info.macroAvgF1Score; + if (!isMultiBinaryLabel_) { + // precision and recall are equal in this case + os << " micro-average-precision=" << info.microAvgPrecision; + } else { + os << " micro-average-precision=" << info.microAvgPrecision + << " micro-average-recall=" << info.microAvgRecall + << " micro-average-F1-score=" << info.microAvgF1Score; + } + } +} + +void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output, + const IVectorPtr& label, + const MatrixPtr& weight) { + size_t insNum = output->getHeight(); + size_t dim = output->getWidth(); + real* outputD = output->getData(); + int* labelD = label->getData(); + real* weightD = (weight != nullptr) ? weight->getData() : nullptr; + for (size_t i = 0; i < insNum; ++i) { + CHECK_GE(labelD[i], 0); + CHECK_LT((size_t)labelD[i], dim); + size_t maxIdx = 0; + real maxValue = outputD[i * dim]; + for (size_t j = 1; j < dim; ++j) { + size_t idx = i * dim + j; + if (maxValue < outputD[idx]) { + maxIdx = j; + maxValue = outputD[idx]; + } + } + + real w = (weightD != nullptr) ? weightD[i] : 1.0; + if (maxIdx == (size_t)labelD[i]) { + statsInfo_[maxIdx].TP += w; // true positive for labelD[i] + // true negative for all labels except for labelD[i] + for (size_t j = 0; j < dim; ++j) { + statsInfo_[j].TN += w; + } + statsInfo_[maxIdx].TN -= w; + } else { + statsInfo_[labelD[i]].FN += w; // false negative for labelD[i] + statsInfo_[maxIdx].FP += w; // false positive for maxIdx + // true negatives for all labels except for maxIdx and labelD[i] + for (size_t j = 0; j < dim; ++j) { + statsInfo_[j].TN += w; + } + statsInfo_[maxIdx].TN -= w; + statsInfo_[labelD[i]].TN -= w; + } + } +} + +void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output, + const MatrixPtr& label, + const MatrixPtr& weight) { + size_t insNum = output->getHeight(); + size_t dim = output->getWidth(); + real* outputD = output->getData(); + auto labelD = dynamic_cast(label.get()); + real* weightD = (weight != nullptr) ? weight->getData() : nullptr; + real threshold = config_.classification_threshold(); + for (size_t i = 0; i < insNum; ++i) { + for (size_t j = 0; j < dim; ++j) { + real w = (weightD != nullptr) ? 
weightD[i] : 1.0; + size_t idx = i * dim + j; + if (outputD[idx] < threshold) { + statsInfo_[j].TN += w; // true negative + } else { + statsInfo_[j].FP += w; // false positive + } + } + + const int* cols = labelD->getRowCols(i); + for (size_t j = 0; j < labelD->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + real w = (weightD != nullptr) ? weightD[i] : 1.0; + size_t idx = i * dim + cols[j]; + if (outputD[idx] < threshold) { + statsInfo_[cols[j]].FN += w; // false negative + statsInfo_[cols[j]].TN -= w; // true negative + } else { + statsInfo_[cols[j]].TP += w; // true positive + statsInfo_[cols[j]].FP -= w; // false positive + } + } + } +} + +void PrecisionRecallEvaluator::storeLocalValues() const { + if (this->values_.size() == 0) { + PrintStatsInfo info; + bool containMacroMicroInfo = getStatsInfo(&info); + values_["precision"] = info.precision; + values_["recal"] = info.recall; + values_["F1-score"] = info.f1; + if (containMacroMicroInfo) { + values_["macro-average-precision"] = info.macroAvgPrecision; + values_["macro-average-recall"] = info.macroAvgRecall; + values_["macro-average-F1-score"] = info.macroAvgF1Score; + if (!isMultiBinaryLabel_) { + // precision and recall are equal in this case + values_["micro-average-precision"] = info.microAvgPrecision; + } else { + values_["micro-average-precision"] = info.microAvgPrecision; + values_["micro-average-recall"] = info.microAvgRecall; + values_["micro-average-F1-score"] = info.microAvgF1Score; + } + } + } +} + +void PrecisionRecallEvaluator::getNames(std::vector* names) { + this->storeLocalValues(); + names->reserve(this->values_.size()); + for (auto it = this->values_.begin(); it != this->values_.end(); ++it) { + names->push_back(this->config_.name() + "." + it->first); + } +} + +real PrecisionRecallEvaluator::getValue(const std::string& name, + Error* err) const { + this->storeLocalValues(); + std::vector buffers; + paddle::str::split(name, '.', &buffers); + auto it = this->values_.find(buffers[buffers.size() - 1]); + if (it == this->values_.end()) { // not found + *err = Error("No such key %s", name.c_str()); + return .0f; + } + + return it->second; +} + +std::string PrecisionRecallEvaluator::getType(const std::string& name, + Error* err) const { + this->getValue(name, err); + if (!err->isOK()) { + return ""; + } + return "precision_recall"; +} + +void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) { + size_t size = 4 * statsInfo_.size(); + double* buf = new double[size]; + for (size_t i = 0; i < statsInfo_.size(); ++i) { + buf[4 * i + 0] = statsInfo_[i].TP; + buf[4 * i + 1] = statsInfo_[i].TN; + buf[4 * i + 2] = statsInfo_[i].FP; + buf[4 * i + 3] = statsInfo_[i].FN; + } + client->reduce(buf, buf, size, FLAGS_trainer_id, 0); + for (size_t i = 0; i < statsInfo_.size(); ++i) { + statsInfo_[i].TP = buf[4 * i + 0]; + statsInfo_[i].TN = buf[4 * i + 1]; + statsInfo_[i].FP = buf[4 * i + 2]; + statsInfo_[i].FN = buf[4 * i + 3]; + } + delete[] buf; +} + +bool PrecisionRecallEvaluator::getStatsInfo( + PrecisionRecallEvaluator::PrintStatsInfo* info) const { + int label = config_.positive_label(); + if (label != -1) { + CHECK(label >= 0 && label < (int)statsInfo_.size()) + << "positive_label [" << label << "] should be in range [0, " + << statsInfo_.size() << ")"; + info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP); + info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN); + info->f1 = calcF1Score(info->precision, info->recall); + return false; + } + + // micro average method: 
precision = (TP1+TP2)/(TP1+FP1+TP2+FP2) + // macro average method: precision = (precision1+precision2)/2 + double microTotalTP = 0; + double microTotalFP = 0; + double microTotalFN = 0; + info->macroAvgPrecision = 0; + info->macroAvgRecall = 0; + size_t numLabels = statsInfo_.size(); + for (size_t i = 0; i < numLabels; ++i) { + microTotalTP += statsInfo_[i].TP; + microTotalFP += statsInfo_[i].FP; + microTotalFN += statsInfo_[i].FN; + info->macroAvgPrecision += + calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP); + info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN); + } + info->macroAvgPrecision /= numLabels; + info->macroAvgRecall /= numLabels; + info->macroAvgF1Score = + calcF1Score(info->macroAvgPrecision, info->macroAvgRecall); + + info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP); + info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN); + info->microAvgF1Score = + calcF1Score(info->microAvgPrecision, info->microAvgRecall); + return true; +} + +REGISTER_EVALUATOR(pnpair, PnpairEvaluator); +void PnpairEvaluator::start() { + Evaluator::start(); + memset(pairArray_, 0, sizeof(pairArray_)); + predictArray_.clear(); +} + +real PnpairEvaluator::evalImp(std::vector& arguments) { + CHECK_GE(arguments.size(), 3UL); + CHECK_LE(arguments.size(), 4UL); + MatrixPtr output = arguments[0].value; + IVectorPtr label = arguments[1].ids; + IVectorPtr info = arguments[2].ids; + bool supportWeight = (4 == arguments.size()) ? true : false; + MatrixPtr weight = supportWeight ? arguments[3].value : nullptr; + if (nullptr == output || nullptr == label || + (supportWeight && nullptr == weight)) { + return 0; + } + size_t height = output->getHeight(); + size_t width = output->getWidth(); + CHECK_EQ(height, label->getSize()); + CHECK_EQ(height, info->getSize()); + if (supportWeight) { + CHECK_EQ(height, weight->getHeight()); + CHECK_EQ((size_t)1, weight->getWidth()); + } + + if (dynamic_cast(output.get())) { + Matrix::resizeOrCreate(cpuOutput_, height, width, false, false); + IVector::resizeOrCreate(cpuLabel_, height, false); + IVector::resizeOrCreate(cpuInfo_, height, false); + cpuOutput_->copyFrom(*output); + cpuLabel_->copyFrom(*label); + cpuInfo_->copyFrom(*info); + + output = cpuOutput_; + label = cpuLabel_; + info = cpuInfo_; + + if (supportWeight) { + Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false); + cpuWeight_->copyFrom(*weight); + weight = cpuWeight_; + } + } + + real* outputs = output->getData(); + int* labels = label->getData(); + int* infos = info->getData(); + real* weights = supportWeight ? weight->getData() : nullptr; + for (size_t i = 0; i < output->getHeight(); i++) { + real y1 = outputs[i * width + (width - 1)]; + real w = supportWeight ? 
weights[i] : 1.0; + predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w)); + } + return 0; +} + +void PnpairEvaluator::stat(size_t start, + size_t end, + PredictionResult* answers, + double& pos, + double& neg, + double& spe) { + for (size_t i = start; i < end; i++) { + for (size_t j = i + 1; j < end; j++) { + CHECK_EQ(answers[i].queryid, answers[j].queryid); + // The pair weight is the mean of the two samples' weight + double weight = (answers[i].weight + answers[j].weight) / 2.0; + if (answers[i].label != answers[j].label) { + if ((answers[i].out > answers[j].out && + answers[i].label > answers[j].label) || + (answers[i].out < answers[j].out && + answers[i].label < answers[j].label)) { + pos += weight; + } else if ((answers[i].out > answers[j].out && + answers[i].label < answers[j].label) || + (answers[i].out < answers[j].out && + answers[i].label > answers[j].label)) { + neg += weight; + } else { + spe += weight; + } + } + } + } +} + +void PnpairEvaluator::calc(std::vector& predictArray) { + std::sort(predictArray.begin(), + predictArray.end(), + [](const PredictionResult& x, const PredictionResult& y) { + return x.queryid < y.queryid; + }); + + double pos = 0; + double neg = 0; + double special = 0; + auto start = predictArray.begin(); + while (start != predictArray.end()) { + auto end = std::find_if( + start + 1, predictArray.end(), [=](const PredictionResult& x) { + return x.queryid != start->queryid; + }); + CHECK(end != start); + stat(start - predictArray.begin(), + end - predictArray.begin(), + predictArray.data(), + pos, + neg, + special); + + start = end; + } + + pairArray_[0] += pos; + pairArray_[1] += neg; + + LOG(INFO) << " calc total pos pair: " << pos + << " calc total neg pair: " << neg + << " calc total special pair: " << special; +} + +std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; } + +ClassRegistrar Evaluator::registrar_; +Evaluator* Evaluator::create(const EvaluatorConfig& config) { + Evaluator* evaluator = registrar_.createByType(config.type()); + evaluator->init(config); + return evaluator; +} + +REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator); +REGISTER_EVALUATOR(sum, SumEvaluator); +static InitFunction __reg_type_auc_sum__([]() { + Evaluator::registrar_.registerClass( + "last-column-sum", [] { return new ColumnSumEvaluator(-1); }); + Evaluator::registrar_.registerClass("last-column-auc", + [] { return new AucEvaluator(-1); }); +}); + +/** + * @brief print value of each layer. + * + * The config file api is value_printer_evaluator. + */ +class ValuePrinter : public NotGetableEvaluator { + public: + virtual void eval(const NeuralNetwork& nn) { + for (const std::string& name : config_.input_layers()) { + nn.getLayer(name)->getOutput().printValueString(LOG(INFO), + "layer=" + name + " "); + } + } + + virtual void updateSamplesNum(const std::vector& arguments) {} + + virtual real evalImp(std::vector& arguments) { return 0; } +}; +REGISTER_EVALUATOR(value_printer, ValuePrinter); + +/** + * @brief print gradient of each layer. + * + * The config file api is gradient_printer_evaluator. 
+ */ +class GradientPrinter : public NotGetableEvaluator { + public: + virtual void eval(const NeuralNetwork& nn) { + for (const std::string& name : config_.input_layers()) { + const Argument& argu = nn.getLayer(name)->getOutput(); + if (argu.grad) { + std::ostringstream os; + argu.grad->print(os); + LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str(); + } + } + } + + virtual void updateSamplesNum(const std::vector& arguments) {} + + virtual real evalImp(std::vector& arguments) { return 0; } +}; +REGISTER_EVALUATOR(gradient_printer, GradientPrinter); +/** + * @brief print row max id vctor of each layer + * + * The config file api is maxid_printer_evaluator. + */ +class MaxIdPrinter : public NotGetableEvaluator { + private: + IVectorPtr maxIds_; + MatrixPtr maxValues_; + + public: + MaxIdPrinter() {} + + virtual void eval(const NeuralNetwork& nn) { + for (const std::string& name : config_.input_layers()) { + const Argument& argu = nn.getLayer(name)->getOutput(); + if (argu.value) { + size_t height = argu.value->getHeight(); + size_t width = config_.num_results(); + IVector::resizeOrCreate(maxIds_, height * width, false); + Matrix::resizeOrCreate(maxValues_, height, width, false); + argu.value->rowMax(*maxIds_, *maxValues_); + std::ostringstream os; + int* ids = maxIds_->getData(); + real* values = maxValues_->getData(); + for (size_t i = 0; i < height; ++i) { + for (size_t j = 0; j < width; ++j) { + size_t pos = i * width + j; + os << ids[pos] << " : " << values[pos] << ", "; + } + os << std::endl; + } + LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str(); + } + } + } + + virtual void updateSamplesNum(const std::vector& arguments) {} + + virtual real evalImp(std::vector& arguments) { return 0; } +}; +REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter); +/** + * @brief print sequence max frames of each layer + * + * The config file api is maxframe_printer_evaluator. + */ +class MaxFramePrinter : public NotGetableEvaluator { + private: + IVectorPtr maxIds_; + MatrixPtr maxValues_; + MatrixPtr value_; + + public: + MaxFramePrinter() { + value_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false); + } + + virtual void eval(const NeuralNetwork& nn) { + for (const std::string& name : config_.input_layers()) { + const Argument& argu = nn.getLayer(name)->getOutput(); + + CHECK_EQ(argu.value->getWidth(), 1LU); + size_t numSequences = argu.getNumSequences(); + const int* starts = argu.sequenceStartPositions->getData(false); + + std::ostringstream os; + for (size_t i = 0; i < numSequences; ++i) { + size_t offset = starts[i]; + size_t size = starts[i + 1] - starts[i]; + value_->setData(argu.value->getData() + offset, 1LU, size); + + size_t height = 1LU; + size_t width = std::min((size_t)config_.num_results(), size); + IVector::resizeOrCreate(maxIds_, height * width, false); + Matrix::resizeOrCreate(maxValues_, height, width, false); + + value_->rowMax(*maxIds_, *maxValues_); + + int* ids = maxIds_->getData(); + real* values = maxValues_->getData(); + for (size_t j = 0; j < width; ++j) { + os << ids[j] << " : " << values[j] << ", "; + } + os << "total " << size << " frames" << std::endl; + } + LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str(); + } + } + + virtual void updateSamplesNum(const std::vector& arguments) {} + + virtual real evalImp(std::vector& arguments) { return 0; } +}; +REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter); + +/** + * @brief print text according to index matrix and a dictionary. 
+ * + * There can be multiple input to this layer: + * - If there is only one input, the input must be a matrix containing + * the sequence of indices; + * - If there are more than one input, the first input should be ids, + * and are interpreted as sample ids. + * + * The output format will be: + * + * - sequence without sub-sequence, and there is probability. + * + * @code + * id \t prob space_seperated_tokens_from_dictionary_according_to_seq + * @endcode + * + * - sequence without sub-sequence, and there is not probability. + * + * @code + * id \t space_seperated_tokens_from_dictionary_according_to_seq + * @endcode + * + * - sequence with sub-sequence, and there is not probability. + * + * @code + * id \t space_seperated_tokens_from_dictionary_according_to_sub_seq + * \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq + * ... + * @endcode + * + * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup + * with maxid (when generating) as an input. + * + * The config file api is seqtext_printer_evaluator. + * + */ +class SequenceTextPrinter : public NotGetableEvaluator { + private: + /// dict_file, which contains a list of tokens + std::vector dict_; + /// result_file, which is the output file + std::ofstream os_; + /// True/False, to indicate whether to use space to separate output tokens. + /// Default is True. No space is added if set to False. + bool delimited_; + /// store the cpu version of argument.ids + std::vector cpuIds_; + /// store the probability associated with each sequence + std::vector cpuIn_; + + public: + SequenceTextPrinter() {} + + virtual void init(const EvaluatorConfig& config) { + Evaluator::init(config); + if (!config.dict_file().empty()) { + loadFileList(config.dict_file(), dict_); + } + + os_.open(config.result_file(), std::ofstream::trunc); + CHECK(os_.is_open()) << "Failed to open file " << config.result_file(); + delimited_ = config.delimited(); + } + + virtual void updateSamplesNum(const std::vector& arguments) {} + + virtual real evalImp(std::vector& arguments) { + CHECK_GE(arguments.size(), 1LU); + bool hasId = arguments.size() > 1; + size_t numSequences = arguments[0].getNumSequences(); + if (hasId) { + CHECK_EQ(arguments[0].ids->getSize(), numSequences) + << "first input must be sample id."; + } + for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) { + CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences); + } + + auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) { + if (src && src->useGpu()) { + IVector::resizeOrCreate(dest, src->getSize(), false); + dest->copyFrom(*src); + } else { + dest = src; + } + }; + + auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) { + if (src && src->useGpu()) { + Matrix::resizeOrCreate( + dest, src->getHeight(), src->getWidth(), false, false); + dest->copyFrom(*src); + } else { + dest = src; + } + }; + + cpuIds_.resize(arguments.size()); + cpuIn_.resize(arguments.size()); + for (size_t i = 0; i < arguments.size(); ++i) { + resizeVector(cpuIds_[i], arguments[i].ids); + resizeMatrix(cpuIn_[i], arguments[i].in); + } + + int* sampleIds = nullptr; + if (hasId) { + sampleIds = cpuIds_[0]->getData(); + } + + for (size_t i = 0; i < numSequences; ++i) { + os_ << (hasId ? sampleIds[i] : i); + for (size_t j = hasId ? 
1 : 0; j < arguments.size(); ++j) { + int* output = cpuIds_[j]->getData(); + const int* starts = arguments[j].sequenceStartPositions->getData(false); + + auto seqPrint = [&](int start, int end) { + os_ << "\t"; + for (int k = start; k < end; k++) { + int id = output[k]; + os_ << (delimited_ ? " " : ""); + if (!dict_.empty()) { + CHECK_LT((size_t)id, dict_.size()); + os_ << dict_[id]; + } else { + os_ << id; + } + } + }; + + if (arguments[j].hasSubseq()) { + // print sequence with sub-sequence + const int* subStarts = + arguments[j].subSequenceStartPositions->getData(false); + int subSeqId_start = 0; + int subSeqId_end = 0; + for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1; + ++k) { + if (starts[i] == subStarts[k]) subSeqId_start = k; + if (starts[i + 1] == subStarts[k]) subSeqId_end = k; + } + for (int k = subSeqId_start; k < subSeqId_end; k++) { + seqPrint(subStarts[k], subStarts[k + 1]); + os_ << std::endl; + } + + } else { + // print sequence without sub-sequence + if (arguments[j].in) { // beam print + real* probs = cpuIn_[j]->rowBuf(i); + os_ << std::endl; + int start = starts[i]; + int seqEnd = starts[i + 1]; + for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) { + if (start == seqEnd) { + break; + } + int end = start + output[start] + 2; + CHECK_LE(end, seqEnd); + CHECK_EQ(output[end - 1], -1); + os_ << k << "\t" << probs[k]; + seqPrint(start + 1, end - 1); + os_ << std::endl; + start = end; + } + } else { + seqPrint(starts[i], starts[i + 1]); + } + } + } + os_ << std::endl; + } + return 0; + } +}; +REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter); +/** + * @brief print classification error. + * + * The config file api is classification_error_printer_evaluator. + */ +class ClassificationErrorPrinter : public ClassificationErrorEvaluator { + public: + virtual void updateSamplesNum(const std::vector& arguments) {} + + virtual real evalImp(std::vector& arguments) { + MatrixPtr errorMat = calcError(arguments); + + std::ostringstream os; + errorMat->print(os); + LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n" + << os.str(); + + if (auto startPos = arguments[0].sequenceStartPositions) { + std::ostringstream os; + startPos->getVector(false)->print(os, startPos->getSize()); + LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n" + << os.str(); + } + return 0; + } +}; +REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter); + +std::string DummyEvaluator::getTypeImpl() const { return "dummy"; } + +} // namespace paddle diff --git a/paddle/legacy/gserver/evaluators/Evaluator.h b/paddle/legacy/gserver/evaluators/Evaluator.h new file mode 100644 index 0000000000000000000000000000000000000000..b3462819b1244e9f2d1a463cb44e7c550406c000 --- /dev/null +++ b/paddle/legacy/gserver/evaluators/Evaluator.h @@ -0,0 +1,510 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/legacy/pserver/ParameterClient2.h" +#include "paddle/legacy/utils/ClassRegistrar.h" +#include "paddle/legacy/utils/Error.h" + +namespace paddle { + +class NeuralNetwork; +/** + * @def REGISTER_EVALUATOR + * @brief Macro for registering evaluator class + */ + +#define REGISTER_EVALUATOR(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + Evaluator::registrar_.registerClass<__class_name>(#__type_name); \ + }) +/** + * @brief Base class for Evaluator + * Evaluating the performance of a model is very important. + * It indicates how successful the scores(predictions) of a datasets + * has been by a trained model. + */ +class Evaluator { + public: + static Evaluator* create(const EvaluatorConfig& config); + + Evaluator() : numSamples_(0), totalScore_(0) {} + + virtual ~Evaluator() {} + + virtual void init(const EvaluatorConfig& config) { config_ = config; } + + /** + * @brief start to evaluate some data + */ + virtual void start() { + numSamples_ = 0; + totalScore_ = 0; + } + + /** + * @brief Process a batch of data. + */ + virtual void eval(const NeuralNetwork& nn); + + /** + * @brief Process a batch of data. + * @return the score for the batch if it make sense to sum the score across + * batches. + * @note Otherwise evaluator should return 0 and override finish() and + * printStats() to do the right calculation. + */ + virtual real evalImp(std::vector& arguments) = 0; + + /** + * @brief Update the number of processed samples + */ + virtual void updateSamplesNum(const std::vector& arguments) { + numSamples_ += arguments[0].getBatchSize(); + } + + /// finish() should be called before distributeEval + virtual void distributeEval(ParameterClient2* client) { + LOG(FATAL) << "Not implemeted"; + } + + void mergeResultsOfAllClients(ParameterClient2* client) { + double data[2] = {totalScore_, numSamples_}; + client->reduce(data, data, 2, FLAGS_trainer_id, 0); + totalScore_ = data[0]; + numSamples_ = data[1]; + } + + /** + * @brief finish the evaluation. + */ + virtual void finish() {} + + /** + * @brief print the statistics of evaluate result + * @note finish() should be called before printStats + */ + virtual void printStats(std::ostream& os) const { + os << config_.name() << "=" + << (numSamples_ ? totalScore_ / numSamples_ : 0); + } + + friend std::ostream& operator<<(std::ostream& os, + const Evaluator& evaluator) { + evaluator.printStats(os); + return os; + } + + friend std::ostream&& operator<<(std::ostream&& os, // NOLINT + const Evaluator& evaluator) { + evaluator.printStats(os); + return std::move(os); + } + + static ClassRegistrar registrar_; + + /** + * @brief getNames will return all field names of current evaluator. + * + * The format of name is `evaluator_name.evaluator_fields`. If the evaluator + * has multiple field, the name could be `evaluator_name.field1`. For example + * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get + * names will return `precision_recall_evaluator.precision`, + * `precision_recall_evaluator.recal`, etc. + * + * Also, if current Evaluator is a combined evaluator. getNames will return + * all names of all evaluators inside the combined evaluator. + * + * @param names [out]: the field names of current evaluator. + * @note Never clear the names parameter inside getNames. 
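+   * A minimal usage sketch (illustrative only; the actual names depend on the
+   * configured evaluators):
+   *
+   * @code
+   * std::vector<std::string> names;
+   * evaluator->getNames(&names);
+   * for (const auto& n : names) {
+   *   Error err;
+   *   real value = evaluator->getValue(n, &err);
+   *   if (err.isOK()) LOG(INFO) << n << "=" << value;
+   * }
+   * @endcode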
+ */ + virtual void getNames(std::vector* names) { + names->push_back(config_.name()); + } + + /** + * @brief getValue will return the current evaluate value of one field. + * + * @param name: The field name of current evaluator. + * @param err [out]: The error state. + * + * @return The evaluate value(metric). + */ + virtual real getValue(const std::string& name, Error* err) const { + if (name != config_.name()) { + *err = Error("no such name of evaluator %s", name.c_str()); + return .0f; + } + return this->getValueImpl(); + } + + /** + * @brief getType will return the evaluator type by field name. + * + * Evaluate Type is the current type of evaluator in string. Such as 'auc', + * 'precision_recall'. In combined evaluator, different name may get different + * evaluate type because it could be evaluated by different evaluator inside. + * + * @param name: The field name of current Evaluator. + * @param err: The error state. nullptr means don't care. + * @return the evaluator type string. + */ + virtual std::string getType(const std::string& name, Error* err) const { + if (name != config_.name()) { + *err = Error("no such name of evaluator %s", name.c_str()); + return std::string(); + } + return this->getTypeImpl(); + } + + protected: + /** + * @brief getValueImpl The simplest way to define getValue result. If this + * evaluator doesn't contain multiple fields, and do not throw any error, just + * implemented this method to get the evaluate result(metric). + * @return Evaluate result(metric). + */ + virtual real getValueImpl() const { + return numSamples_ != .0 ? totalScore_ / numSamples_ : .0; + } + + /** + * @brief getTypeImpl The simplest way to define getType result. If this + * evaluator doesn't combine many evaluators, the get type should only return + * itself type. + * @return Evaluator type. + */ + virtual std::string getTypeImpl() const { return "base"; } + + protected: + EvaluatorConfig config_; + double numSamples_; + double totalScore_; +}; + +/** + * @brief The NotGetableEvaluator class is the base class of evaluator that + * cannot get value in runtime. The most NotGetableEvaluator is Printer + * Evaluator, which is only used to debug network configuration. + */ +class NotGetableEvaluator : public Evaluator { + // Evaluator interface + public: + void getNames(std::vector* names) {} + + real getValue(const std::string& name, Error* err) const { + *err = Error("Not implemented"); + return .0f; + } + + std::string getType(const std::string& name, Error* err) const { + *err = Error("Not implemented"); + return ""; + } +}; + +class DummyEvaluator : public Evaluator { + public: + DummyEvaluator() {} + virtual void init(const EvaluatorConfig&) {} + virtual void start() {} + virtual void eval(const NeuralNetwork&) {} + virtual real evalImp(std::vector& arguments) { + (void)arguments; + return -1; + } + virtual void finish() {} + virtual void printStats(std::ostream&) const {} + + // Evaluator interface + protected: + std::string getTypeImpl() const; +}; +/** + * @brief evaluate AUC using colIdx-th column as prediction. + * The AUC(Area Under the Curve) is a common evaluation metric + * for binary classification problems. It computes the area under + * the receiver operating characteristic(ROC) curve. + * + * @note colIdx-th column + * + * - colIdx = 0: the 0-th column. + * - colIdx > 0: the colIdx-th column. + * - colIdx < 0: the last colIdx-th column. + * + * The config file api is auc_evaluator. 
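+ *
+ * Internally, predictions are bucketed into kBinNum_ + 1 bins by value and the
+ * ROC area is accumulated with the trapezoid rule while scanning the bins from
+ * high score to low score. A self-contained sketch of that computation
+ * (illustrative only, not part of this class's API):
+ *
+ * @code
+ * double aucFromHistograms(const std::vector<double>& pos,
+ *                          const std::vector<double>& neg) {
+ *   double tp = 0, fp = 0, tpPrev = 0, fpPrev = 0, area = 0;
+ *   for (int i = static_cast<int>(pos.size()) - 1; i >= 0; --i) {
+ *     tpPrev = tp;
+ *     fpPrev = fp;
+ *     tp += pos[i];   // positives with scores in this bin or higher
+ *     fp += neg[i];   // negatives with scores in this bin or higher
+ *     area += (fp - fpPrev) * (tp + tpPrev) / 2.0;  // trapezoid slice
+ *   }
+ *   return (tp > 0.0 && fp > 0.0) ? area / (tp * fp) : 0.0;
+ * }
+ * @endcode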
+ * + */ +class AucEvaluator : public Evaluator { + public: + AucEvaluator(int32_t colIdx) + : colIdx_(colIdx), + realColumnIdx_(0), + cpuOutput_(nullptr), + cpuLabel_(nullptr), + cpuWeight_(nullptr) {} + + virtual void start(); + + virtual real evalImp(std::vector& arguments); + + virtual void printStats(std::ostream& os) const { + os << config_.name() << "=" << calcAuc(); + } + + virtual void distributeEval(ParameterClient2* client); + + private: + static const uint32_t kBinNum_ = (1 << 24) - 1; + static const int kNegativeLabel_ = 0; + double statPos_[kBinNum_ + 1]; + double statNeg_[kBinNum_ + 1]; + int32_t colIdx_; + uint32_t realColumnIdx_; + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; + MatrixPtr cpuWeight_; + + AucEvaluator() {} + + inline static double trapezoidArea(double X1, + double X2, + double Y1, + double Y2) { + return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; + } + + double calcAuc() const; + + // Evaluator interface + protected: + real getValueImpl() const; + std::string getTypeImpl() const; +}; + +/** + * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles + * under the same query), and averages them. Each list should be organized + * as a sequence. The inputs of this evaluator is [output, click, pv]. If pv + * is not provided, it will be set to 1. The types of click and pv are + * dense value. + */ +class RankAucEvaluator : public Evaluator { + public: + // evaluate ranking AUC + virtual void start(); + + virtual void updateSamplesNum(const std::vector& arguments); + + virtual real evalImp(std::vector& arguments); + + virtual void distributeEval(ParameterClient2* client) { + mergeResultsOfAllClients(client); + } + + private: + MatrixPtr output_; + MatrixPtr click_; + MatrixPtr pv_; + std::vector> outputPair_; + + double calcRankAuc(real* outputData, + real* clickData, + real* pvData, + size_t size); + + // Evaluator interface + protected: + std::string getTypeImpl() const; +}; + +/** + * @brief precision, recall and f1 score Evaluator + * \f[ + * precision = \frac{tp}{tp+tn} \\ + * recall=\frac{tp}{tp+fn} \\ + * f1=2*\frac{precsion*recall}{precision+recall} + * \f] + * + * The config file api is precision_recall_evaluator. 
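+ *
+ * As an illustrative example: with TP=8, FP=2 and FN=4 for the positive label,
+ * precision = 8/(8+2) = 0.8, recall = 8/(8+4) = 2/3, and
+ * F1 = 2*0.8*(2/3)/(0.8+2/3), which is about 0.727.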
+ */ +class PrecisionRecallEvaluator : public Evaluator { + public: + // Evaluate precision, recall and F1 score + PrecisionRecallEvaluator() + : isMultiBinaryLabel_(false), + cpuOutput_(nullptr), + cpuLabel_(nullptr), + cpuWeight_(nullptr) {} + + virtual void start(); + + virtual real evalImp(std::vector& arguments); + + virtual void printStats(std::ostream& os) const; + + virtual void distributeEval(ParameterClient2* client); + + void getNames(std::vector* names); + + real getValue(const std::string& name, Error* err) const; + + std::string getType(const std::string& name, Error* err) const; + + struct StatsInfo { + /// numbers of true positives + double TP; + /// numbers of true negatives + double TN; + /// numbers of false positives + double FP; + /// numbers of false negatives + double FN; + + StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {} + }; + + private: + bool isMultiBinaryLabel_; + std::vector statsInfo_; + + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; + MatrixPtr cpuWeight_; + + struct PrintStatsInfo { + double precision; + double recall; + double f1; + double macroAvgPrecision; + double macroAvgRecall; + double macroAvgF1Score; + double microAvgPrecision; + double microAvgRecall; + double microAvgF1Score; + }; + + bool getStatsInfo(PrintStatsInfo* info) const; + + void calcStatsInfo(const MatrixPtr& output, + const IVectorPtr& label, + const MatrixPtr& weight); + + void calcStatsInfoMulti(const MatrixPtr& output, + const MatrixPtr& label, + const MatrixPtr& weight); + + inline static double calcPrecision(double TP, double FP) { + if (TP > 0.0 || FP > 0.0) { + return TP / (TP + FP); + } else { + return 1.0; + } + } + + inline static double calcRecall(double TP, double FN) { + if (TP > 0.0 || FN > 0.0) { + return TP / (TP + FN); + } else { + return 1.0; + } + } + + inline static double calcF1Score(double precision, double recall) { + if (precision > 0.0 || recall > 0.0) { + return 2 * precision * recall / (precision + recall); + } else { + return 0; + } + } + + mutable std::unordered_map values_; + + void storeLocalValues() const; +}; + +/* + * @brief positive-negative pair rate Evaluator + * + * The config file api is pnpair_evaluator. 
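+ *
+ * Within each query id, every pair of samples with different labels is
+ * counted: the pair is positive when the sample with the larger label also has
+ * the larger output, negative when it has the smaller output (ties are tracked
+ * separately), and the reported value is pos/neg. As an illustrative example,
+ * a single query with (output, label) of (0.9, 1), (0.7, 0) and (0.2, 1)
+ * produces one positive and one negative pair, so pos/neg = 1.0.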
+ */ +class PnpairEvaluator : public Evaluator { + public: + PnpairEvaluator() + : cpuOutput_(nullptr), + cpuLabel_(nullptr), + cpuInfo_(nullptr), + cpuWeight_(nullptr) {} + + virtual void start(); + virtual real evalImp(std::vector& arguments); + + struct PredictionResult { + PredictionResult(real __out, int __label, int __queryid, real __weight) + : out(__out), label(__label), queryid(__queryid), weight(__weight) {} + real out; + int label; + int queryid; + real weight; + }; + std::vector predictArray_; + void printPredictResults() { + std::ofstream fs(FLAGS_predict_file); + CHECK(fs) << "Fail to open " << FLAGS_predict_file; + for (auto& res : predictArray_) { + fs << res.out << " " << res.label << " " << res.queryid << std::endl; + } + } + + void stat(size_t start, + size_t end, + PredictionResult* answers, + double& pos, + double& neg, + double& spe); + void calc(std::vector& predictArray); + + virtual void finish() { calc(predictArray_); } + + virtual void printStats(std::ostream& os) const { + os << " pos/neg=" << this->getValueImpl(); + } + + virtual void distributeEval(ParameterClient2* client) { + client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0); + LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0] + << " calc total neg pair: " << pairArray_[1]; + } + + private: + static const uint32_t kPairArrayNum_ = 2; + double pairArray_[kPairArrayNum_]; + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; + IVectorPtr cpuInfo_; + MatrixPtr cpuWeight_; + + // Evaluator interface + protected: + real getValueImpl() const { + return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]); + } + std::string getTypeImpl() const; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c4034d8bba59dbae0a1059b96ac2b6f18c5971b --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "GradientMachine.h" + +#include +#include "paddle/legacy/utils/Logging.h" + +#include "NeuralNetwork.h" +#include "hl_gpu.h" + +#ifndef PADDLE_MOBILE_INFERENCE +#include "GradientMachineMode.h" +#include "MultiGradientMachine.h" +#include "MultiNetwork.h" +#include "ParallelNeuralNetwork.h" +#endif + +namespace paddle { + +GradientMachine* GradientMachine::create( + const ModelConfig& config, + int mode, + const std::vector& parameterTypes) { +#ifndef PADDLE_MOBILE_INFERENCE + if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) { + return gm; + } + if (FLAGS_trainer_count > 1) { + return new MultiGradientMachine(config, FLAGS_use_gpu); + } +#endif + if (FLAGS_trainer_count == 1) { // single +#ifndef PADDLE_MOBILE_INFERENCE + NeuralNetwork* nn; + if (config.type() == "multi_nn") { + /* multi submodel calculate, thread(s) will be initialized inside */ + nn = new MultiNetwork("root"); + } else if (FLAGS_parallel_nn) { + /* multi threads calculate */ + nn = new ParallelNeuralNetwork(); + } else { + /* single thread calculate */ + nn = NeuralNetwork::create(config); + } +#else + NeuralNetwork* nn = NeuralNetwork::create(config); +#endif + ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) { + para->enableType(PARAMETER_VALUE); + }; + nn->init( + config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes); + return nn; + } + LOG(FATAL) << "Unknown model type: " << config.type(); + return nullptr; +} + +void GradientMachine::saveParameters(const std::string& dir) const { + LOG(INFO) << "Saving parameters to " << dir; + + for (auto& para : parameters_) { + std::string filename = dir + "/" + para->getName(); + if (para->isFullSize()) { + para->save(filename); + } + } +} + +void GradientMachine::loadParameters(const std::string& dir) { + LOG(INFO) << "Loading parameters from " << dir; + + for (auto& para : parameters_) { + std::string filename = dir + "/" + para->getName(); + if (para->isFullSize()) { + para->load(filename); + } + } +} + +void GradientMachine::randParameters() { + LOG(INFO) << "Initing parameters.."; + + for (auto& para : parameters_) { + if (para->isFullSize()) { + para->randomize(); + } + } + LOG(INFO) << "Init parameters done."; +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.h b/paddle/legacy/gserver/gradientmachines/GradientMachine.h new file mode 100644 index 0000000000000000000000000000000000000000..d4f754a9f4dc3175f5000774c77a0e7334df7d85 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/GradientMachine.h @@ -0,0 +1,250 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "ModelConfig.pb.h" +#include "TrainerConfig.pb.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/layers/Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/ParameterUpdaterBase.h" +#include "paddle/legacy/utils/Thread.h" + +#ifndef PADDLE_MOBILE_INFERENCE +#include "paddle/legacy/gserver/evaluators/Evaluator.h" +#endif + +namespace paddle { +/** + * @brief A gradient machine is capable of calculating some outputs given + * some inputs and performing gradient calculation based on the + * derivative from the outputs. + * + * A gradient machine can be either a full neural network or part of a neural + * network. + * + * Usage for training: + * + * 1. Prepare inArgs. Put your input data into inArgs[i].value. + * + * 2. Call forward(inArgs, &outArgs) + * + * 3. Calculate gradient with respect to outArgs[i]->value + * and fill them into outArgs[i]->grad. + * This step can be skipped if your the outputs are from cost layers. + * + * 4. Call backward(). After backward, gradient of each parameter is + * accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT) + * + * 5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using + * gradients. + * + * 6. Clear gradients to zero. + * + * Usage for prediction: + * + * 1. Prepare inArgs. Put your input data into inArgs[i].value. + * + * 2. Call forward(inArgs, &outArgs) + * + * 3. Obtain the prediction result from outArgs[i] + */ + +typedef std::vector MachineState; + +class GradientMachine; + +typedef std::shared_ptr GradientMachinePtr; + +class GradientMachine { + public: + enum CreateMode { + kNormal = 0, + kSgdSparseCpuTraining = 3, + kTesting = 4, + kCustom = 10 + }; + + /** + * Create a gradient machine from ModelConfig + * Parameter will have parameterTypes + */ + static GradientMachine* create( + const ModelConfig& config, + int mode = kNormal, + const std::vector& parameterTypes = + std::vector{ + PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}); + + virtual ~GradientMachine() {} + + /** + * Prefetch row ids of sparse parameter. + */ + virtual void prefetch(const std::vector& inArgs) { (void)inArgs; } + + /** + * @brief Forward propagation. + * + * Calculate outputs (outArgs) based the inputs (inArgs) + * + * @note: if passType==PASS_TEST, then backward() should not be called + */ + virtual void forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) = 0; + + /** + * @brief Backward propagation. + * + * Calculate the gradient of inArgs and parameter. + * + * This function should only be called after a corresponding forward() call. + * The caller is responsible for filling the correct grad for the outArgs + * obtained using forward(). + * + * It may also change the grad field for the inArgs supplied at forward() + */ + virtual void backward(const UpdateCallback& callback = nullptr) = 0; + + /** + * Combine forward() and backward(). For multithread training, this + * may be faster. + * + * @note: passType PASS_TEST is not allowed for forwardBackward(). 
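The training recipe in the comment above maps onto the API declared in this header fairly directly. Below is a schematic single-step sketch, not code from this patch: prepareBatch and sgdUpdate are hypothetical placeholders standing in for a DataProvider and a ParameterUpdater, and the output is assumed to come from a cost layer so that step 3 (filling outArgs[i]->grad by hand) can be skipped.

// Schematic training step for a GradientMachinePtr `gm` created with
// GradientMachine::create(config). prepareBatch() and sgdUpdate() are
// hypothetical helpers, not part of Paddle.
void trainOneBatch(const GradientMachinePtr& gm, real learningRate) {
  std::vector<Argument> inArgs;
  std::vector<Argument> outArgs;
  prepareBatch(&inArgs);                      // step 1: fill inArgs[i].value

  gm->forward(inArgs, &outArgs, PASS_TRAIN);  // step 2: forward pass
  gm->backward();                             // step 4: accumulate gradients
                                              // (step 3 skipped: cost layer output)
  for (auto& para : gm->getParameters()) {    // steps 5 and 6
    sgdUpdate(para, learningRate);            // placeholder: update PARAMETER_VALUE
                                              // from PARAMETER_GRADIENT
    para->getBuf(PARAMETER_GRADIENT)->zeroMem();  // clear gradients
  }
}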
+ */ + virtual void forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback = nullptr) { + forward(inArgs, outArgs, passType); + backward(callback); + } + + virtual Argument getLayerOutput(const std::string& layerName) = 0; + + // see comment in Layer.h for the function with the same name + virtual void resetState() {} + + // set machine state + virtual void setState(const MachineState& machineState) {} + + // save machine state + virtual void getState(MachineState& machineState) {} + + virtual void onPassEnd() = 0; + +#ifndef PADDLE_MOBILE_INFERENCE + /** + * Create an evaluator which can be used for eval() + */ + virtual Evaluator* makeEvaluator() const = 0; + + /** + * evaluate using the given evaluator + */ + virtual void eval(Evaluator* evaluator) const = 0; +#endif + + std::vector& getParameters() { return parameters_; } + + std::vector& getNonStaticParameters() { + if (nonStaticParameters_.empty()) { + for (auto para : parameters_) { + if (!para->isStatic()) { + nonStaticParameters_.push_back(para); + } + } + } + return nonStaticParameters_; + } + + inline bool hasStaticParameters() { + return parameters_.size() != getNonStaticParameters().size(); + } + + /** + * @brief Used before formal training, start work-threads and set + * trainer Parameters; + * + * @note This function will only been implemented and used in a + * multithreaded environment. + */ + virtual void start() {} + + /** + * @brief check each work-thread whether is failed/error/finish, + * if not, return ture, and yes return false. + * + * @note This function will only been implemented and used in a + * multithreaded environment. + */ + virtual void finish() {} + + /** + * @brief set the training status a "finished" value, the sub_work_threads + * will option the change, and then exit. + * + * @note This function will only been implemented and used in a + * multithreaded environment. + */ + virtual bool trainIsOn() { return true; } + + /** + * @brief when all or some of the sub-workThreads are suspended to waiting + * controller's instructions, and after some processing done in the + * controller, it will call this function to wake up all the pending + * thread. + * + * @note This function will only been implemented and used in a + * multithreaded environment. + */ + virtual void restart() {} + + /// Set the gradient of the output from outside. + virtual void setOutputGrad(const std::vector& args) { + LOG(FATAL) << "Not implemented!"; + } + + void saveParameters(const std::string& dir) const; + + void loadParameters(const std::string& dir); + + void randParameters(); + + virtual void getStats(real& cost, int64_t& numProcessed) { + (void)cost; + (void)numProcessed; + } + + /** + * @brief Release the middle layer's output memory. + * + * @note This function is used for memory optimization in inference. 
+ */ + virtual void releaseOutput() {} + + protected: + virtual void onLoadParameter() {} + + std::vector parameters_; + std::vector nonStaticParameters_; +}; + +} // namespace paddle diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp similarity index 100% rename from paddle/gserver/gradientmachines/GradientMachineMode.cpp rename to paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h similarity index 100% rename from paddle/gserver/gradientmachines/GradientMachineMode.h rename to paddle/legacy/gserver/gradientmachines/GradientMachineMode.h diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ef0dfbfe2e5842918500a3b0706c1a55024ce46 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp @@ -0,0 +1,898 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MultiGradientMachine.h" + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/utils/Stat.h" + +#include "NeuralNetwork.h" +#include "ParallelNeuralNetwork.h" + +DEFINE_bool(allow_only_one_model_on_one_gpu, + true, + "If true, do not allow multiple models on one GPU device"); + +namespace paddle { + +// get types of the parameters which need to be merged after backward() +static void fillMergeTypes(PassType passType, + std::vector* mergeTypes) { + mergeTypes->clear(); + if (passType != PASS_TEST) { + mergeTypes->push_back(PARAMETER_GRADIENT); + } +} + +MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, + bool useGpu) + : useGpu_(useGpu), + trainerBarrier_(FLAGS_trainer_count), + allBarrier_(FLAGS_trainer_count + 1), + inArgsCopied_(false) { + isPassGrad_ = false; + numThreads_ = FLAGS_trainer_count; + if (useGpu) { + //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu, + //! the hl_get_device_count will get an error result. It seems should return + //! 0 when hppl is not compiled as gpu version. + numDevices_ = hl_get_device_count(); + } else { + numDevices_ = 0; + } + ParamInitCallback mainParamInitCb = [](int paramId, Parameter* para) { + // only create buf for CPU parameters + // GPU parameters will be created in each thread + if (para->useGpu()) return; + + if (para->isSparseRemoteUpdate()) { + para->enableType(PARAMETER_VALUE, + FLAGS_loadsave_parameters_in_pserver + ? 
Parameter::MAT_SPARSE_ROW_PREFETCH + : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); + para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); + } else if (para->isGradSparseUpdate()) { + para->enableType(PARAMETER_VALUE); + para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS); + SparseRowIdsCpuMatrix* mat = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + mat->setNumOfThreads(FLAGS_trainer_count); + } else if (para->isValueShared()) { + para->enableType(PARAMETER_VALUE, Parameter::MAT_VALUE_SHARED); + if (!para->isStatic()) { + para->enableType(PARAMETER_GRADIENT); + } + } else { + para->enableType(PARAMETER_VALUE); + if (!para->isStatic()) { + para->enableType(PARAMETER_GRADIENT); + } + } + }; + + NeuralNetwork* nn = NeuralNetwork::create(config); + nn->init(config, mainParamInitCb); + gradientMachine_.reset(nn); + parameters_ = gradientMachine_->getParameters(); + + numLogicalDevices_ = 0; + if (useGpu_) { + numLogicalDevices_ = 1; + + for (size_t pid = 0; pid < parameters_.size(); pid++) { + if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) { + numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1; + } + } + LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_ + << " numThreads=" << numThreads_ << " numDevices=" << numDevices_; + + if (numLogicalDevices_ * numThreads_ > numDevices_ && + FLAGS_allow_only_one_model_on_one_gpu) { + LOG(FATAL) << "trainer_count * num_devices_in_model " + << "(" << numThreads_ << "*" << numLogicalDevices_ << ")" + << "=" << numThreads_ * numLogicalDevices_ + << " exceeds number of GPU devices(" << numDevices_ << ")"; + } + numLogicalDevices_ = std::min(numLogicalDevices_, numDevices_); + + /* Enables direct access to memory allocations on a peer device */ + for (int i = 0; i < numThreads_; i++) { + for (int d = 0; d < numLogicalDevices_; ++d) { + enablePeerAccess(logicalDeviceId2RealDeviceId(d, i), + logicalDeviceId2RealDeviceId(d, i + 1)); + enablePeerAccess(logicalDeviceId2RealDeviceId(d, i), + logicalDeviceId2RealDeviceId(d, i - 1)); + } + } + } + + for (int i = 0; i < numThreads_; ++i) { + threads_.emplace_back(new TrainerThread(config, i, this)); + } + + bufferSizes_.resize(numLogicalDevices_, 0); + paraMainThread_.reserve(parameters_.size()); + int pid = 0; + for (auto& para : parameters_) { + if (para->isStatic() || !para->useGpu()) { + paraMainThread_.push_back(0); + } else { + int end = pid++ % numThreads_; + paraMainThread_.push_back(end); + int paraDeviceId = para->getDeviceId(); + if (paraDeviceId == -1) paraDeviceId = 0; + paraDeviceId = paraDeviceId % numLogicalDevices_; + if (para->getSize() > bufferSizes_[paraDeviceId]) { + bufferSizes_[paraDeviceId] = para->getSize(); + VLOG(1) << "bufferSize[" << paraDeviceId << "]" << para->getSize(); + } + } + } + + // TODO(xuwei06) Instead of using maximal buffer size, we may use a smaller + // fixed buffer size and use pipeline to dispatch parameter value and merge + // parameter gradient, which may be faster. 
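Two small conventions in the constructor above are easy to miss: non-static GPU parameters are assigned their main thread round-robin (pid++ % numThreads_), and peer access is enabled between each thread's device and the devices of its two ring neighbours through logicalDeviceId2RealDeviceId(), declared later in MultiGradientMachine.h. The standalone sketch below just replays those index computations with plain ints; gpuIdBase stands in for FLAGS_gpu_id, and the mod() helper is assumed to return a non-negative remainder.

#include <cstdio>

// Wrap-around modulo with the behaviour assumed for the mod() helper used in
// this file (always returns a non-negative remainder).
static int wrapMod(int a, int b) { return ((a % b) + b) % b; }

// Mirrors logicalDeviceId2RealDeviceId(logicalId, threadId).
static int toRealDevice(int logicalId, int threadId, int gpuIdBase,
                        int numLogicalDevices, int numDevices) {
  return wrapMod(logicalId + gpuIdBase + threadId * numLogicalDevices,
                 numDevices);
}

int main() {
  const int numThreads = 4, numLogicalDevices = 1, numDevices = 4, gpuIdBase = 0;

  // Round-robin main-thread assignment for non-static GPU parameters.
  for (int pid = 0; pid < 6; ++pid) {
    std::printf("GPU param %d -> main thread %d\n", pid, pid % numThreads);
  }

  // Each thread enables peer access between its own device and the devices of
  // its ring neighbours (threadId + 1 and threadId - 1).
  for (int i = 0; i < numThreads; ++i) {
    int self = toRealDevice(0, i, gpuIdBase, numLogicalDevices, numDevices);
    int next = toRealDevice(0, i + 1, gpuIdBase, numLogicalDevices, numDevices);
    int prev = toRealDevice(0, i - 1, gpuIdBase, numLogicalDevices, numDevices);
    std::printf("thread %d: device %d peers with devices %d and %d\n",
                i, self, next, prev);
  }
  return 0;
}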
+ + // combination of all trainers mainPara into GradientMachine parameters + hasNonstaticCpuParamters_ = false; + for (size_t pid = 0; pid < parameters_.size(); pid++) { + if (parameters_[pid]->useGpu()) { + parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid]; + } else if (!parameters_[pid]->isStatic()) { + hasNonstaticCpuParamters_ = true; + } + } + + gradBufs_.resize(numThreads_); + for (int i = 0; i < numThreads_; ++i) { + gradBufs_[i].resize(numLogicalDevices_); + for (int d = 0; d < numLogicalDevices_; ++d) { + gradBufs_[i][d].sem.post(); + } + } + + outArgStream_ = HPPL_STREAM_1; + + start(); +} + +void MultiGradientMachine::start() { + for (auto& thread : threads_) { + thread->start(); + } +} + +void MultiGradientMachine::finish() { + for (auto& thread : threads_) { + thread->stop(); + } +} + +std::vector*> +MultiGradientMachine::getSlaveParameters() { + std::vector*> vec; + vec.reserve(threads_.size()); + for (auto& thread : threads_) { + vec.push_back(&thread->getParameters()); + } + return vec; +} + +void MultiGradientMachine::notifyGradientTransfer(int paramId) { + gradQueue_.enqueue(paramId); +} + +void MultiGradientMachine::allocGradBufs() { + if (numLogicalDevices_ == 0) return; + if (gradBufs_[0][0].bufs.size() >= mergeTypes_.size()) return; + + for (int i = 0; i < numThreads_; i++) { + for (int d = 0; d < numLogicalDevices_; ++d) { + if (bufferSizes_[d] == 0) continue; + SetDevice device(logicalDeviceId2RealDeviceId(d, i)); + for (size_t j = 0; j < mergeTypes_.size(); j++) { + gradBufs_[i][d].bufs.push_back( + Vector::create(bufferSizes_[d], /* useGpu= */ true)); + } + } + } +} + +void MultiGradientMachine::prefetch(const std::vector& inArgs) { + // Each gradient machine in threads needs to do prefetch on its own + // part of inArgs. 
So we need to first divide inArgs to each thread + inArgs_ = inArgs; + startTask(TASK_COPY_IN_ARGS); + + for (auto& para : parameters_) { + if (para->isSparseRemoteUpdate()) { + auto mat = dynamic_cast( + para->getMat(PARAMETER_VALUE).get()); + mat->clearIndices(); + } + } + + waitForCopyInArgs(); + + // Because SparsePrefetchRowCpuMatrix can only be changed by ONE thread + // at one time, we need to do prefetch sequentially + for (auto& thread : threads_) { + thread->prefetch(); + } + + for (auto& para : parameters_) { + if (para->isSparseRemoteUpdate()) { + auto mat = dynamic_cast( + para->getMat(PARAMETER_VALUE).get()); + mat->setupIndices(); + auto matGrad = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + matGrad->reserveStore(); + } + } +} + +void MultiGradientMachine::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + forwardImp(inArgs, outArgs, passType, TASK_FORWARD); +} + +void MultiGradientMachine::forwardImp(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + TaskType taskType) { + updateThreadParameters(); + passType_ = passType; + + if (!inArgsCopied_) { + inArgs_ = inArgs; + inArgsCopied_ = false; + } + + fillMergeTypes(passType, &mergeTypes_); + allocGradBufs(); + startTask(taskType); + + getOutArgs(outArgs, passType); +} + +void MultiGradientMachine::backward(const UpdateCallback& callback) { + backwardCallback_ = callback; + startTask(TASK_BACKWARD); + backwardImp(callback); +} + +void MultiGradientMachine::forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { + backwardCallback_ = callback; + forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD); + backwardImp(callback); +} + +Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) { + std::vector args; + args.reserve(threads_.size()); + + for (auto& thread : threads_) { + args.push_back(thread->getGradientMachine()->getLayerOutput(layerName)); + } + outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_); + + return outLayerArgs_; +} + +void MultiGradientMachine::backwardImp(const UpdateCallback& callback) { + for (size_t i = 0; i < parameters_.size(); i++) { + if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue; + REGISTER_TIMER("controller_dequeue"); + gradQueue_.dequeue(); + } + if (hasNonstaticCpuParamters()) { + waitAfterMerge(); + if (backwardCallback_) { + for (auto& para : parameters_) { + if (!para->useGpu() && !para->isStatic()) { + backwardCallback_(para.get()); + } + } + } + } +} + +void MultiGradientMachine::updateThreadParameters() { + for (size_t pid = 0; pid < parameters_.size(); ++pid) { + if (!parameters_[pid]->useGpu()) continue; + if (!parameters_[pid]->isValueUpdated()) continue; + parameters_[pid]->clearValueUpdated(); + for (int i = 0; i < (int)threads_.size(); i++) { + threads_[i]->incUpdateCounter(); + } + // NotifyValueReady should happen after that all threads' incUpdateCounter() + // are called so that the counters are correct when notifyValueReady() + // is called. 
+ threads_[paraMainThread_[pid]]->notifyValueReady(pid); + } +} + +void MultiGradientMachine::onPassEnd() { + for (auto& thread : threads_) { + thread->onPassEnd(); + } +} + +Evaluator* MultiGradientMachine::makeEvaluator() const { + return threads_[0]->getGradientMachine()->makeEvaluator(); +} + +void MultiGradientMachine::eval(Evaluator* evaluator) const { + for (auto& thread : threads_) { + SetDevice device(thread->getDeviceId()); + if (thread->hasInputData()) { + thread->getGradientMachine()->eval(evaluator); + } + } +} + +void MultiGradientMachine::getOutArgs(std::vector* outArgs, + PassType passType) { + for (auto& thread : threads_) { + REGISTER_TIMER("waitOutArgs"); + thread->waitOutArgsReady(); + } + + outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size()); + + REGISTER_TIMER("copyOutArgs"); + for (size_t i = 0; i < outArgs_.size(); ++i) { + std::vector args; + args.reserve(threads_.size()); + for (auto& thread : threads_) { + // If the thread input is empty, then the output is empty. + auto tmp = thread->getOutArgs(); + if (tmp.size() > 0) { + args.push_back(tmp[i]); + } + } + outArgs_[i].concat(args, useGpu_, outArgStream_, passType); + } + + if (useGpu_) { + hl_stream_synchronize(outArgStream_); + } + + *outArgs = outArgs_; +} + +void MultiGradientMachine::setOutputGrad(const std::vector& args) { + CHECK_EQ(args.size(), outArgs_.size()); + for (size_t i = 0; i < args.size(); i++) { + outArgs_[i].grad = args[i].grad; + } +} + +void MultiGradientMachine::startTask(TaskType taskType) { + taskType_ = taskType; + for (auto& thread : threads_) { + thread->notifyTaskReady(); + } +} + +TrainerThread::TrainerThread(const ModelConfig& config, + int threadId, + MultiGradientMachine* multiMachine) + : multiMachine_(multiMachine), + config_(config), + threadId_(threadId), + inArgsCopied_(false) { + int numThreads = multiMachine->getNumThreads(); + + auto& mainParas = multiMachine->getParameters(); + + using std::placeholders::_1; + using std::placeholders::_2; + + partnerId_ = mod(threadId_ - 1, numThreads); + + deviceId_ = !multiMachine_->useGpu() + ? -1 + : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_); + SetDevice gpuDevice(deviceId_); + + NeuralNetwork* nn = nullptr; + if (!multiMachine->useGpu() || !FLAGS_parallel_nn) { + nn = NeuralNetwork::create(config); + } else { + nn = new ParallelNeuralNetwork(); + for (auto& paraConfig : *config_.mutable_parameters()) { + if (paraConfig.device() != -1) { + paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( + paraConfig.device(), threadId_)); + } + } + for (auto& layerConfig : *config_.mutable_layers()) { + if (layerConfig.device() != -1) { + layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( + layerConfig.device(), threadId_)); + } + } + } + // Only GPU do not share parameter values with main paramters. 
+ ParamInitCallback slaveParamInitCb = + std::bind(parameterInitNN, _1, _2, &mainParas); + nn->init(config_, slaveParamInitCb); + gradientMachine_.reset(nn); + parameters_ = gradientMachine_->getParameters(); + if (!FLAGS_parallel_nn) { + for (auto& para : parameters_) { + para->setDevice(deviceId_); + } + } + + backwardCallback_ = + std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1); + + gradStream_ = HPPL_STREAM_2; + valueStream_ = HPPL_STREAM_3; + stopping_ = true; + updateCounter_ = 0; + parameterUpdated_ = false; +} + +TrainerThread::~TrainerThread() { stop(); } + +void TrainerThread::start() { + if (!stopping_) return; + + stopping_ = false; + + gradientMachine_->start(); + + computeThread_.reset(new std::thread([this]() { computeThread(); })); + + if (multiMachine_->useGpu()) { + gradCollectThread_.reset( + new std::thread([this]() { gradCollectThread(); })); + + valueDispatchThread_.reset( + new std::thread([this]() { valueDispatchThread(); })); + + copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); })); + } +} + +void TrainerThread::stop() { + if (stopping_) return; + + stopping_ = true; + + if (computeThread_) { + taskReadySem_.post(); + computeThread_->join(); + } + if (gradCollectThread_) { + gradQueue_.enqueue(0); + gradCollectThread_->join(); + } + if (copyThread_) { + gradBufQueue_.enqueue(0); + copyThread_->join(); + } + if (valueDispatchThread_) { + valueReadyQueue_.enqueue(0); + valueDispatchThread_->join(); + } +} + +void TrainerThread::computeThread() { + VLOG(1) << "gradComputeThread " << threadId_; + + if (deviceId_ >= 0) { + hl_init(deviceId_); + } + + while (true) { + { + REGISTER_TIMER("taskSem_wait"); + taskReadySem_.wait(); + } + + if (stopping_) break; + + switch (multiMachine_->getTaskType()) { + case MultiGradientMachine::TASK_FORWARD_BACKWARD: + forward(); + backward(); + break; + case MultiGradientMachine::TASK_FORWARD: + forward(); + break; + case MultiGradientMachine::TASK_BACKWARD: + backward(); + break; + case MultiGradientMachine::TASK_COPY_IN_ARGS: + batchSize_ = copyInArgs(); + inArgsCopied_ = true; + multiMachine_->waitForCopyInArgs(); + break; + } + } + hl_fini(); +} + +void TrainerThread::prefetch() { + SetDevice setDevice(deviceId_); + gradientMachine_->prefetch(inArgs_); +} + +void TrainerThread::forward() { + if (!inArgsCopied_) { + REGISTER_TIMER("copyInArgs"); + batchSize_ = copyInArgs(); + } else { + inArgsCopied_ = false; + } + + if (multiMachine_->getPassType() != PASS_TEST) { + REGISTER_TIMER("clearGradient"); + // For main parameter, the user of MultiGpuSyncMachine is responsible + // for setting the gradient to zero + for (size_t i = 0; i < parameters_.size(); i++) { + if (parameters_[i]->useGpu()) { + if (multiMachine_->paraMainThread(i) != threadId_) { + SetDevice device(parameters_[i]->getDeviceId()); + parameters_[i]->clearGradient(); + } + } else { + parameters_[i]->clearGradient(); + } + } + } + + { + REGISTER_TIMER("wait_value"); + valueReadyCond_.wait([this]() { return !parameterUpdated_; }); + } + + { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); } + + { + REGISTER_TIMER("thread_forward"); + if (batchSize_ > 0) { + gradientMachine_->forward( + inArgs_, &outArgs_, multiMachine_->getPassType()); + } else { + outArgs_.clear(); + } + } + outArgsReadySem_.post(); +} + +void TrainerThread::backward() { + REGISTER_TIMER("thread_backward"); + if (multiMachine_->isPassGrad()) { + copyOutputGrad(); + } + if (batchSize_ > 0) { + gradientMachine_->backward(backwardCallback_); + } else 
{ + for (size_t i = parameters_.size(); i > 0; i--) { + backwardCallback(parameters_[i - 1].get()); + } + } + if (multiMachine_->hasNonstaticCpuParamters()) { + mergeCpuGradients(); + } +} + +void TrainerThread::backwardCallback(Parameter* para) { + // CPU parameters are merged in the end + if (!para->useGpu() || para->isStatic()) return; + + int paramId = para->getID(); + if (multiMachine_->getNumThreads() == 1) { + // no need to do merge if there is only one thread + doCallback(paramId); + } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1, + multiMachine_->getNumThreads())) { + notifyCopyGradToBuffer(paramId); + } else { + notifyGradientCollect(paramId); + } +} + +void TrainerThread::copyGradToBufferThread() { + VLOG(1) << "copyGradToBufferThread " << threadId_; + + if (deviceId_ >= 0) { + hl_init(deviceId_); + } + auto& partnerThread = multiMachine_->getThread(partnerId_); + auto& gradBufs = multiMachine_->getGradBuf(partnerId_); + + while (true) { + int pid = gradBufQueue_.dequeue(); + if (stopping_) break; + + int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId( + parameters_[pid]->getDeviceId(), threadId_); + + auto& gradBuf = gradBufs[pdeviceId]; + + { + REGISTER_TIMER("waitBufferReady"); + gradBuf.sem.wait(); + } + + { + REGISTER_TIMER("copyGradToBuffer"); + SetDevice setDevice(parameters_[pid]->getDeviceId()); + for (size_t i = 0; i < mergeTypes_.size(); ++i) { + gradBuf.bufs[i]->resize( + parameters_[pid]->getBuf(mergeTypes_[i])->getSize()); + gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]), + gradStream_); + } + hl_stream_synchronize(gradStream_); + } + partnerThread->notifyGradientCollect(pid); + } + hl_fini(); +} + +void TrainerThread::gradCollectThread() { + VLOG(1) << "gradCollectThread " << threadId_; + + if (deviceId_ >= 0) { + hl_init(deviceId_); + } + + std::vector gradReadyCount(parameters_.size(), 0); + + auto& gradBufs = multiMachine_->getGradBuf(threadId_); + + while (true) { + int pid = gradQueue_.dequeue(); + if (stopping_) break; + + if (++gradReadyCount[pid] < 2) continue; + gradReadyCount[pid] = 0; + int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId( + parameters_[pid]->getDeviceId(), threadId_); + + auto& gradBuf = gradBufs[pdeviceId]; + + { + REGISTER_TIMER("mergeGrad"); + for (size_t i = 0; i < mergeTypes_.size(); ++i) { + ParameterType type = mergeTypes_[i]; + const VectorPtr& localGrad = parameters_[pid]->getBuf(type); + SetDevice setDevice(parameters_[pid]->getDeviceId()); + localGrad->add(*gradBuf.bufs[i]); + } + } + + gradBuf.sem.post(); + + if (multiMachine_->paraMainThread(pid) == threadId_) { + doCallback(pid); + } else { + notifyCopyGradToBuffer(pid); + } + } + hl_fini(); +} + +void TrainerThread::doCallback(int pid) { + REGISTER_TIMER("callback"); + auto& gpuThreads = multiMachine_->getAllThreads(); + if (multiMachine_->getBackwardCallback()) { + // The callback supplied by the user of MultiGradientMachine may handle + // the parameter update using the gradient. 
+ multiMachine_->getBackwardCallback()(parameters_[pid].get()); + if (parameters_[pid]->isValueUpdated()) { + parameters_[pid]->clearValueUpdated(); + for (auto& thread : gpuThreads) { + thread->incUpdateCounter(); + } + notifyValueReady(pid); + } + } + multiMachine_->notifyGradientTransfer(pid); +} + +void TrainerThread::valueDispatchThread() { + VLOG(1) << "valueDispatchThread " << threadId_; + + if (deviceId_ >= 0) { + hl_init(deviceId_); + } + + auto& thread = multiMachine_->getThread(partnerId_); + + while (true) { + int pid; + { + REGISTER_TIMER("value_dequeue"); + pid = valueReadyQueue_.dequeue(); + } + if (stopping_) break; + + if (multiMachine_->paraMainThread(pid) == partnerId_) continue; + + { + REGISTER_TIMER("copyValue"); + SetDevice setDevice(parameters_[pid]->getDeviceId()); + thread->getValueBuf(pid)->copyFrom(*getValueBuf(pid), valueStream_); + hl_stream_synchronize(valueStream_); + } + + thread->notifyValueReady(pid); + } + hl_fini(); +} + +void TrainerThread::notifyValueReady(int paramId) { + if (--updateCounter_ == 0) { + valueReadyCond_.notify_all([this] { parameterUpdated_ = false; }); + } + + notifyValueDispatch(paramId); +} + +int TrainerThread::copyInArgs() { + const std::vector& fullInArgs = multiMachine_->getInArgs(); + int numThreads = multiMachine_->getAllThreads().size(); + int32_t numSequences = fullInArgs[0].getNumSequences(); + int32_t startSeq = numSequences * threadId_ / numThreads; + int32_t endSeq = numSequences * (threadId_ + 1) / numThreads; + int32_t copySize = endSeq - startSeq; + + /** + * For the first copy, need to allocate space here + */ + if (inArgs_.size() == 0) { + inArgs_.resize(fullInArgs.size()); + } + + if (copySize == 0) { + return 0; + } + + for (size_t i = 0; i < fullInArgs.size(); i++) { + inArgs_[i].resizeAndCopyFrom( + fullInArgs[i], + startSeq, + copySize, + FLAGS_parallel_nn ? false : multiMachine_->useGpu()); + } + return copySize; +} + +void TrainerThread::mergeCpuGradients() { + CHECK_EQ(mergeTypes_.size(), 1UL); + CHECK_EQ(mergeTypes_[0], PARAMETER_GRADIENT); + + { + REGISTER_TIMER("waitbeforeMerge"); + multiMachine_->waitBeforeMerge(); + } + std::vector*> slaveParameters = + multiMachine_->getSlaveParameters(); + + CHECK(slaveParameters.size()); + for (auto& para : multiMachine_->getNonStaticParameters()) { + if (para->useGpu()) continue; + if (para->isSparseRemoteUpdate()) { + REGISTER_TIMER("mergeRemoteGradSparse"); + mergeGradSparseRemote(para.get(), slaveParameters); + } else if (para->isGradSparseUpdate()) { + REGISTER_TIMER("mergeGradSparse"); + mergeGradSparse(para.get(), slaveParameters); + } else { + REGISTER_TIMER("mergeGradDense"); + mergeGradDense(para.get(), slaveParameters); + } + } + { + REGISTER_TIMER("waitbeforeMerge"); + multiMachine_->waitAfterMerge(); + } +} + +void TrainerThread::mergeGradSparse( + Parameter* para, + std::vector*>& slaveParameters) { + size_t pid = para->getID(); + SparseRowIdsCpuMatrix* mainMat = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + std::vector& ids = mainMat->getIds(threadId_); + + for (auto slaveParams : slaveParameters) { + SparseRowCpuMatrix* mat = dynamic_cast( + (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get()); + mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads()); + // we use a sample hash method(%) instead of range partition, + // because range partition has balance issue sometimes, + // when feature ids are not generated from hashcode. 
+ } + uniqueIds(ids); +} + +void TrainerThread::mergeGradSparseRemote( + Parameter* para, + std::vector*>& slaveParameters) { + size_t pid = para->getID(); + SparseRowCpuMatrix* mainMat = + dynamic_cast(para->getMat(PARAMETER_GRADIENT).get()); + + mainMat->checkIndices(); + mainMat->zeroMemThread(threadId_, multiMachine_->getNumThreads()); + + for (auto slaveParams : slaveParameters) { + SparseRowCpuMatrix* mat = dynamic_cast( + (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get()); + mat->addTo(*mainMat, threadId_, multiMachine_->getNumThreads()); + } +} + +void TrainerThread::mergeGradDense( + Parameter* para, + std::vector*>& slaveParameters) { + size_t pid = para->getID(); + auto interval = calcSplitArrayInterval(para->getSize(), + (size_t)threadId_, + multiMachine_->getNumThreads(), + 8LU /*for avx*/); + size_t startSeq = interval.first; + size_t copySize = interval.second - interval.first; + + // setup sub bufs + CpuVector destGrad(0, nullptr); + destGrad.subVecFrom(*para->getBuf(PARAMETER_GRADIENT), startSeq, copySize); + + // merge + CpuVector slaveGradSub(0, nullptr); + for (auto slaveParams : slaveParameters) { + slaveGradSub.subVecFrom( + *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize); + destGrad.add(slaveGradSub); + } +} + +void TrainerThread::copyOutputGrad() { + const std::vector& outputGradArgs = multiMachine_->outArgs_; + int numThreads = multiMachine_->getAllThreads().size(); + int32_t numSequences = outputGradArgs[0].getNumSequences(); + int32_t startSeq = numSequences * threadId_ / numThreads; + int32_t endSeq = numSequences * (threadId_ + 1) / numThreads; + int32_t copySize = endSeq - startSeq; + outArgs_.resize(outputGradArgs.size()); + for (size_t i = 0; i < outputGradArgs.size(); i++) { + outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], + startSeq, + copySize, + multiMachine_->useGpu(), + HPPL_STREAM_DEFAULT); + } + if (multiMachine_->useGpu()) { + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + } + gradientMachine_->setOutputGrad(outArgs_); +} +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h new file mode 100644 index 0000000000000000000000000000000000000000..674acd4124981face13b21aee02f031ea775ffec --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h @@ -0,0 +1,478 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "GradientMachine.h" + +#include "hl_gpu.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Queue.h" + +namespace paddle { + +class TrainerThread; + +typedef Queue PidQueue; +typedef std::unique_ptr TrainerThreadPtr; + +struct GradBuffer { + /// GradBuffer is used for gathering gradient for GPU parameters + int paramId; + + /// sem is used to notify that the local gradient merge of the current thread + /// finished for the current thread. 
+ Semaphore sem; + + // bufs[mergeIndex] + std::vector bufs; +}; + +/** + * A MultiGradientMachine is a synchronous GradientMachine which devides + * one data batch into several smaller batches and assign each one small batch + * to one computint thread for computation. After each thread finishes + * computation, it merges result (including output Argument and gradient during + * backward()). It basically is the same as single thread gradient machine, + * except that it uses multi-thread to do the computation. + * + * It handles GPU and Cpu parameters differently. In GPU, one computing thread + * generally corresponds to one GPU device. Thus, each thread keeps a separate + * copy of the parameter in its own device's memory. In CPU, we only need to + keep + * one copy of the parameters in the main memory. After, each computing thread + * computes its own parameter gradient, the update process needs to accumulate + * the parameter gradients from all the computing threads, and update the + * accumulated parameter gradient to the corresponding parameter value. + * + * Each GPU parameter is assigned to a thread called its main thread. For each + * parameter, the accumulation of its gradients and the update of its value + * happens in its main thread. The main thread first gather the parameter + * gradients from all the computing thread. Then, it performs parameter update. + * After a gradient is updated by the main thread, it is scattered to all the + * computing thread so that the parameters in all the computing threads are + * synchronized. The scatter and gather process are implemented by ring-style + * communication. Assume we have N computing threads, its thread ids will be + * 0, 1, ..., N-1. For each parameter, the id of the main thread is specified + in + * paraMainThread_[pid], where pid is the id of the parameter. Each thread i + only + * sends data to its partner thread (i - 1) % N. For example, for a parameter + * gradient that is computed in thread 4, and its main thread is 2. Its + * traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the + gradient + * buffer is added to the local gradient, and the local gradient is then copied + * to the gradient buffer of the next thread. At last, its main thread 2 will + * get the accumulated parameter gradient. For the same parameter, after its + * value is updated, the value's traveling process would be 2, 1, 0, N-1, ... + 3. + * At the end, all the computing threads would have the updated parameter + value. + * + * A computing thread (TrainerThread) uses 4 threads to do different jobs: + * + * 1. computeThread(): performing forward(), backward(), prefetch(). + * + * 2. valueDispatchThread(): copying parameter values to partner thread. + * + * 3. copyGradToBufferThread(): copying parameter gradient to partner thread. + * + * 4. gradCollectThread(): merging the gradient from step 3 with local gradient + * and call the callback supplied by the user to update parameter value. + * + * CPU parameter value has only one copy. And their gradients are merged at the + * end of backward(). + * + * * Handling of sparse update + * Currently, sparse update is only supported for CPU parameters. + + * Sparse updates refers to gradient caculation where the gradient is sparse. + For + * example, if the input argument to a 'fc' layer is sparse, the gradient of + the + * weight matrix of this layer will be sparse. It is usually more efficient to + * treat the gradient explicitly as sparse vector during the parameter update. 
+ + * There are two types of sparse updates called local sparse update and remote + * sparse update. + + * For both types of sparse updates, there is one copy of parameter value and + * gradient called main parameter value and gradient, and there is a copy of + * parameter value and gradient for each computing thread called slave + parameter + * value and gradient. The slave parameter values are always shared with the + * corresponding main parameter value. The slave parameter grad is a sparse row + * matrix. The sparse pattern for slave parameter grads are different, because + * the small batches for each computing thread might have different sparsity + * pattern. + + * 1. Local sparse update + * + * Main parameter value type is MAT_NORMAL. It is a dense matrix. + * + * Main parameter grad type is MAT_SPARSE_ROW_IDS (SparseRowIdsCpuMatrix) + * It is also a dense matrix, but the updated values are specified by IDS. + * + * Slave parameter value shares with main parameter value. + * + * Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW + * (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix. + * + * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will + * gather all the non-zero gradient. And After backward(), they will be + merged + * into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating + * which rows have nonzero gradient. + * + * 2. Remote sparse update + * + * Main parameter value type is MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE) + * (SparsePrefetchRowCpuMatrix). MAT_SPARSE_ROW_PREFETCH is a sparse matrix. + * MAT_SPARSE_ROW_PREFETCH_FULL_SIZE is a dense matrix. However, only the + * parameter values that are prefetched is up-to-date. + * + * Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix). + * And it shares sparse pattern with value by sharing indexDictHandle_, + which + * is an internal data structure used by SparseRowCpuMatrixto specify the + * sparsity pattern of Slave parameter value shares with main parameter + value. + * + * Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW + * (SparsePrefetchRowCpuMatrix). It is a sparse row matrix + * + * During prefetch(), all the layers will indicates which rows of each + * parameter are needed. Then the framework will retrieve those rows from + * parameter server. + * + * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will + * gather all the non-zero gradient. And After backward(), they will be + merged + * into main parameter grad (SparseRowCpuMatrix). And the framework will + send + * the merged gradient to parameter server. 
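One concrete detail behind the statement above that the machine divides one data batch into smaller per-thread batches: the split is by sequence index, using the same integer arithmetic as TrainerThread::copyInArgs() in MultiGradientMachine.cpp above. The self-contained snippet below only replays that arithmetic to show how a batch that does not divide evenly is distributed.

#include <cstdio>

// Thread t handles sequences [numSequences * t / T, numSequences * (t + 1) / T),
// the split rule used by TrainerThread::copyInArgs().
int main() {
  const int numSequences = 10;
  const int numThreads = 4;
  for (int t = 0; t < numThreads; ++t) {
    int startSeq = numSequences * t / numThreads;
    int endSeq = numSequences * (t + 1) / numThreads;
    std::printf("thread %d: sequences [%d, %d), batch size %d\n",
                t, startSeq, endSeq, endSeq - startSeq);
  }
  return 0;
}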
+ */ +class MultiGradientMachine : public GradientMachine { + public: + enum TaskType { + TASK_FORWARD_BACKWARD = 0, + TASK_FORWARD = 1, + TASK_BACKWARD = 2, + TASK_COPY_IN_ARGS = 3, + }; + + explicit MultiGradientMachine(const ModelConfig& config, bool useGpu); + + virtual void start(); + + virtual void finish(); + + virtual void prefetch(const std::vector& inArgs); + + virtual void forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType); + + virtual void backward(const UpdateCallback& callback = nullptr); + + void forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback); + + virtual Argument getLayerOutput(const std::string& layerName); + + virtual void onPassEnd(); + + virtual Evaluator* makeEvaluator() const; + + virtual void eval(Evaluator* evaluator) const; + + bool useGpu() const { return useGpu_; } + + /// @return whether to pass the gradients in outArgs_ to each threads. + bool isPassGrad() { return isPassGrad_; } + + /// @brief set whether to pass the gradient in outArgs_ to each threads. + void setPassGrad(bool isPass) { isPassGrad_ = isPass; } + + /// Set the gradients of the outputs. + /// The gradietns will be copied to each thread in the computing threads. + virtual void setOutputGrad(const std::vector& args); + + protected: + friend class TrainerThread; + + std::vector& getAllThreads() { return threads_; } + /// Calculate the real device id based on the logical device id and the + /// thread id. + int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const { + if (logicalId == -1) { + logicalId = 0; + } + return mod(logicalId + FLAGS_gpu_id + threadId * numLogicalDevices_, + numDevices_); + } + + /// Calculate the logical device id based on the real device id and the + /// thread id. + int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const { + if (realId == -1) { + return 0; + } else { + return mod(realId - FLAGS_gpu_id - threadId * numLogicalDevices_, + numDevices_); + } + } + + std::vector*> getSlaveParameters(); + + bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; } + + /// Called TrainerThread to wait before merging CPU parameter gradients. + void waitBeforeMerge() { trainerBarrier_.wait(); } + + /// called by MultiGradientMachine and TrainerThread to wait after merging + /// CPU parameter graidents. 
+ void waitAfterMerge() { allBarrier_.wait(); } + + /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs() + /// finishing + void waitForCopyInArgs() { allBarrier_.wait(); } + + TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; } + + std::vector& getGradBuf(int threadId) { + return gradBufs_[threadId]; + } + + PassType getPassType() const { return passType_; } + + /// Called by TrainerThread to notify MultiGradientMachine that the gradient + /// for paramId is ready + void notifyGradientTransfer(int paramId); + + const std::vector& getInArgs() { return inArgs_; } + + TaskType getTaskType() const { return taskType_; } + + const UpdateCallback& getBackwardCallback() const { + return backwardCallback_; + } + + int getNumDevices() const { return numDevices_; } + + int getNumLogicalDevices() const { return numLogicalDevices_; } + + int getNumThreads() const { return numThreads_; } + + int paraMainThread(int pid) const { return paraMainThread_[pid]; } + + protected: + virtual void forwardImp(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + TaskType taskType); + + virtual void backwardImp(const UpdateCallback& callback = NULL); + + /// update all parameters + void updateThreadParameters(); + + void startTask(TaskType taskType); + + void getOutArgs(std::vector* outArgs, PassType passType); + + void allocGradBufs(); + + protected: + bool useGpu_; + + bool hasNonstaticCpuParamters_; + + /// store main parameter only + std::unique_ptr gradientMachine_; + + std::vector threads_; + std::vector paraMainThread_; + std::vector> gradBufs_; // [threadId][deviceId] + std::vector bufferSizes_; + + PassType passType_; + TaskType taskType_; + PidQueue gradQueue_; + std::vector inArgs_; + std::vector outArgs_; + hl_stream_t outArgStream_; + + Argument outLayerArgs_; + + /// ParameterType which needs to be merged from each GPU + std::vector mergeTypes_; + int numDevices_; /* number of gpu devices */ + int numLogicalDevices_; // number of GPU used by one NN + int numThreads_; /* number of train threads */ + + UpdateCallback backwardCallback_; + + /// barrrier for threads_ + ThreadBarrier trainerBarrier_; + + /// barrier for both MultiGradientMachine and threds_ + ThreadBarrier allBarrier_; + + /// indicate whether inArgs is copied before forward() + bool inArgsCopied_; + + /// Whether to copy the gradient back from an external input. 
+ bool isPassGrad_; +}; + +class TrainerThread { + public: + TrainerThread(const ModelConfig& config, + int threadId, + MultiGradientMachine* multiMachine); + + ~TrainerThread(); + + void start(); + + void onPassEnd() { gradientMachine_->onPassEnd(); } + + void waitOutArgsReady() { outArgsReadySem_.wait(); } + + void notifyTaskReady() { taskReadySem_.post(); } + + int getDeviceId() const { return deviceId_; } + + GradientMachine* getGradientMachine() { return gradientMachine_.get(); } + + const std::vector& getParameters() { return parameters_; } + + void stop(); + + void notifyValueReady(int paramId); + + const VectorPtr& getValueBuf(int paramId) { + return parameters_[paramId]->getBuf(PARAMETER_VALUE); + } + + const std::vector& getOutArgs() { return outArgs_; } + + void incUpdateCounter(int n = 1) { + updateCounter_ += n; + parameterUpdated_ = true; + } + + void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); } + + void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); } + + void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); } + + void prefetch(); + + /// copy the output gradient from the main GradientMachine. + void copyOutputGrad(); + + /// Whether the thread has input data. + bool hasInputData() { return batchSize_ != 0; } + + protected: + void mergeCpuGradients(); + + void mergeGradSparse( + Parameter* para, + std::vector*>& slaveParameters); + + void mergeGradSparseRemote( + Parameter* para, + std::vector*>& slaveParameters); + + void mergeGradDense( + Parameter* para, + std::vector*>& slaveParameters); + + void computeThread(); + void valueDispatchThread(); + void copyGradToBufferThread(); + void gradCollectThread(); + + int copyInArgs(); + void forward(); + void backward(); + void backwardCallback(Parameter* para); + + /// call the actuall callback supplied by the caller of + /// GradientMachine::backward + void doCallback(int pid); + + protected: + MultiGradientMachine* multiMachine_; + ModelConfig config_; + /// whether the thread should stop + bool stopping_; + /// the threads form which to collect gradient + int partnerId_; + /// from 0 to threads-1 + int threadId_; + int deviceId_; + std::unique_ptr gradientMachine_; + std::vector parameters_; + + /// ParameterType which needs to be merged from each GPU + std::vector mergeTypes_; + + /// compute thread + std::unique_ptr computeThread_; + std::vector inArgs_; + std::vector outArgs_; + Semaphore taskReadySem_; + Semaphore outArgsReadySem_; + + /// copy thread + std::unique_ptr copyThread_; + /// queue of gradient needs to be copied to partner + PidQueue gradBufQueue_; + hl_stream_t gradStream_; + + /// grad merge thread + std::unique_ptr gradCollectThread_; + /// queue of gradient needs to be merged with gradient coopied by + /// copyGradToBufferThread + PidQueue gradQueue_; + UpdateCallback backwardCallback_; + + /// value dispatch thread + std::unique_ptr valueDispatchThread_; + /// queue of the parameter whose the vale are ready for copy + PidQueue valueReadyQueue_; + + /// used to notify all the parameter values are ready + LockedCondition valueReadyCond_; + + hl_stream_t valueStream_; + /// how many parameters are updated + std::atomic updateCounter_; + bool parameterUpdated_; + + /// indicate whether inArgs is copied before forward() + bool inArgsCopied_; + int batchSize_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..1245c441036a601025192ab23a6d2899b688a9dc --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +#include "MultiNetwork.h" + +#include "NeuralNetwork.h" +#include "ParallelNeuralNetwork.h" + +namespace paddle { + +void MultiNetwork::init(const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { + CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1"; + // check submodel[0] is root + CHECK_EQ("root", config.sub_models(0).name()) + << "sub_models(0) should be root"; + // ignore root + subNetworks_.resize(config.sub_models_size() - 1); + // base class + NeuralNetwork::init(config, callback, parameterTypes, useGpu); + // sub networks + for (int i = 1; i < config.sub_models_size(); ++i) { + std::string subModelName = config.sub_models(i).name(); + if (FLAGS_parallel_nn) { + subNetworks_[i - 1] = std::unique_ptr( + new ParallelNeuralNetwork(subModelName, this)); + } else { + subNetworks_[i - 1] = std::unique_ptr( + NeuralNetwork::newNeuralNetwork(subModelName, this)); + } + subNetworks_[i - 1]->init(config); + } +} + +void MultiNetwork::prefetch(const std::vector& inArgs) { + std::vector> argumentGroups; + Argument::splitByDataId(inArgs, &argumentGroups); + // check group size is equal to sub network size + CHECK_EQ(argumentGroups.size(), subNetworks_.size()); + for (size_t i = 0; i < subNetworks_.size(); i++) { + if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) { + // check input args: if dataId is -1, then skip this sub network + continue; + } + subNetworks_[i]->prefetch(argumentGroups[i]); + } +} + +void MultiNetwork::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + // split inArgs to several vectors + std::vector> argumentGroups; + Argument::splitByDataId(inArgs, &argumentGroups); + + // check group size is equal to sub network size + CHECK_EQ(argumentGroups.size(), subNetworks_.size()); + std::vector tempOutArgs; + outArgs->clear(); + + for (size_t i = 0; i < subNetworks_.size(); i++) { + tempOutArgs.clear(); + if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) { + // check input args: if dataId is -1, then skip this sub network + continue; + } + subNetworks_[i]->forward(argumentGroups[i], &tempOutArgs, passType); + for (const auto& elem : tempOutArgs) { + outArgs->push_back(elem); + outArgs->back().dataId = i; + } + } +} + +void MultiNetwork::backward(const UpdateCallback& callback) { + for (size_t i = 0; i < subNetworks_.size(); i++) { + subNetworks_[i]->backward(callback); + } +} + +void MultiNetwork::forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { + forward(inArgs, outArgs, passType); + 
backward(callback); +} + +void MultiNetwork::onPassEnd() { + for (size_t i = 0; i < subNetworks_.size(); i++) { + subNetworks_[i]->onPassEnd(); + } +} + +void MultiNetwork::start() { + for (auto& subNetwork : subNetworks_) { + subNetwork->start(); + } +} + +void MultiNetwork::finish() { + for (size_t i = 0; i < subNetworks_.size(); i++) { + subNetworks_[i]->finish(); + } +} + +class MultiCombinedEvaluator : public Evaluator { + public: + MultiCombinedEvaluator() {} + void addEvaluator(std::unique_ptr&& evaluator) { + evaluators_.emplace_back(std::move(evaluator)); + } + virtual void start() { + for (auto& evaluator : evaluators_) { + evaluator->start(); + } + } + + virtual void finish() { + for (auto& evaluator : evaluators_) { + evaluator->finish(); + } + } + + virtual void eval(const NeuralNetwork& nn) { + const MultiNetwork& multiNetwork = dynamic_cast(nn); + CHECK_EQ(evaluators_.size(), multiNetwork.getSubNetworks().size()); + int size = evaluators_.size(); + for (int i = 0; i < size; i++) { + // one evaluator for one subNetwork + evaluators_[i]->eval(*multiNetwork.getSubNetworks()[i]); + } + } + + virtual real evalImp(std::vector& arguments) { + (void)arguments; + return -1; + } + + virtual void printStats(std::ostream& os) const { + for (auto& evaluator : evaluators_) { + evaluator->printStats(os); + os << ' '; + } + } + + virtual void distributeEval(ParameterClient2* client) { + for (auto& evaluator : evaluators_) { + evaluator->distributeEval(client); + } + } + + protected: + std::vector> evaluators_; +}; + +Evaluator* MultiNetwork::makeEvaluator() const { + MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator(); + for (size_t i = 0; i < subNetworks_.size(); i++) { + std::unique_ptr evaluator(subNetworks_[i]->makeEvaluator()); + multiCombinedEvaluator->addEvaluator(std::move(evaluator)); + } + return multiCombinedEvaluator; +} + +void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); } + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.h b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h new file mode 100644 index 0000000000000000000000000000000000000000..afe15cb020ebe3bbe051800a72562c9543f3faa4 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
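MultiNetwork::forward() above relies on Argument::splitByDataId() to route each group of input arguments to the sub-network with the matching index, and it skips a group that consists of a single argument whose dataId is -1. The snippet below is only a simplified, self-contained stand-in for that grouping step; the real Argument type and splitByDataId() are not reproduced here.

#include <cstdio>
#include <vector>

// Minimal stand-in for an Argument carrying a dataId.
struct Arg {
  int dataId;
  const char* name;
};

int main() {
  // Two sub-networks; each argument names the sub-network it feeds.
  std::vector<Arg> inArgs = {{0, "image"}, {0, "image_label"}, {1, "text"}};

  std::vector<std::vector<Arg>> groups(2);
  for (const auto& a : inArgs) {
    groups[a.dataId].push_back(a);
  }

  for (size_t i = 0; i < groups.size(); ++i) {
    // In MultiNetwork::forward(), a group holding one argument with
    // dataId == -1 would be skipped rather than forwarded.
    std::printf("sub-network %zu receives %zu argument(s)\n",
                i, groups[i].size());
  }
  return 0;
}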
*/ + +#pragma once + +#include "GradientMachine.h" +#include "NeuralNetwork.h" + +#include "paddle/legacy/utils/Locks.h" + +namespace paddle { + +class MultiNetwork : public NeuralNetwork { + public: + explicit MultiNetwork(std::string subModelName = "") + : NeuralNetwork(subModelName) {} + + virtual void init(const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu); + + virtual void prefetch(const std::vector& inArgs); + + virtual void forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType); + + virtual void backward(const UpdateCallback& callback = nullptr); + + void forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback); + + virtual void onPassEnd(); + + virtual Evaluator* makeEvaluator() const; + + virtual void eval(Evaluator* evaluator) const; + + const std::vector>& getSubNetworks() const { + return subNetworks_; + } + + virtual void start(); + + virtual void finish(); + + protected: + std::vector> subNetworks_; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0f8048152ff317a1e445249fa7093158d2d4a5c5 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp @@ -0,0 +1,548 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/Util.h" + +#include "NeuralNetwork.h" +#include "hl_gpu.h" +#include "paddle/legacy/utils/CustomStackTrace.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/legacy/gserver/layers/MKLDNNLayer.h" +#endif + +#ifndef PADDLE_MOBILE_INFERENCE +#include "MultiNetwork.h" +#include "RecurrentGradientMachine.h" +#include "paddle/legacy/gserver/layers/AgentLayer.h" +#endif + +namespace paddle { +void parameterInitNN(int paramId, + Parameter* para, + std::vector* sharedParams) { + // Create parameters values. + if (!para->useGpu() && sharedParams) { + para->enableSharedType(PARAMETER_VALUE, + (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE), + (*sharedParams)[paramId]->getMat(PARAMETER_VALUE)); + } else { + if (para->isSparseRemoteUpdate()) { + para->enableType(PARAMETER_VALUE, + FLAGS_loadsave_parameters_in_pserver + ? Parameter::MAT_SPARSE_ROW_PREFETCH + : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); + } else { + para->enableType(PARAMETER_VALUE); + } + } + // Create parameter gradients. 
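+  // The gradient buffer format depends on how the parameter is updated:
+  //   sparse remote update (no shared params) -> MAT_SPARSE_ROW,
+  //   local sparse gradient update            -> MAT_SPARSE_ROW_AUTO_GROW,
+  //   ordinary non-static parameters          -> a dense gradient buffer.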
+ if (para->isSparseRemoteUpdate() && !sharedParams) { + para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); + } else if (para->isGradSparseUpdate()) { + para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW); + } else if (!para->isStatic()) { + para->enableType(PARAMETER_GRADIENT); + } +} + +NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) { +#ifndef PADDLE_MOBILE_INFERENCE + if (config.type() == "recurrent_nn") { + return newNeuralNetwork("root"); + } else if (config.type() == "multi_nn") { + return new MultiNetwork("root"); + } else { + return newNeuralNetwork(); + } +#else + return new NeuralNetwork(); +#endif +} + +std::map NeuralNetwork::dllInitMap; + +void NeuralNetwork::init(const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { + using std::placeholders::_1; + using std::placeholders::_2; + ParamInitCallback paramCallback = nullptr; + if (callback != nullptr) { + paramSelfInited_ = false; + paramCallback = callback; + } else { + paramSelfInited_ = true; + paramCallback = std::bind(parameterInitNN, _1, _2, nullptr); + } + config_ = config; + + if (rootNetwork_ != nullptr) { + // direct use parameters_ and parameterMap_ from base network + CHECK_EQ((size_t)config.parameters_size(), + rootNetwork_->getParameters().size()); + parameters_ = rootNetwork_->getParameters(); + parameterMap_ = *(rootNetwork_->getParameterMap()); + } else { + parameters_.reserve(config.parameters_size()); + for (const auto& para_config : config.parameters()) { + auto parameter = std::make_shared(para_config, + useGpu, + /*initialize=*/false); + paramCallback(parameters_.size(), parameter.get()); + if (!callback) { + for (ParameterType type : + (parameter->isStatic() + ? std::vector{PARAMETER_VALUE} + : parameterTypes)) { + if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) { + parameter->enableType(type); + } + } + } + parameter->setID(parameters_.size()); + parameters_.push_back(parameter); + CHECK(!parameterMap_.count(parameter->getName())); + parameterMap_[parameter->getName()] = parameter; + } + } + + auto layerCreate = [&](const LayerConfig& layer_config) { + auto layer = Layer::create(layer_config); + CHECK(layer) << "Create layer failed. 
Layer name:" << layer->getName(); + layers_.push_back(layer); + CHECK(!layerMap_.count(layer->getName())); + layerMap_[layer->getName()] = layer; + }; + + auto subModelConfig = std::find_if(config.sub_models().begin(), + config.sub_models().end(), + [=](const SubModelConfig& sub_model) { + return sub_model.name() == subModelName_; + }); + bool useSubModel = (subModelConfig != config.sub_models().end()); + CHECK_EQ(useSubModel, !subModelName_.empty()); + if (useSubModel) { + layers_.reserve(subModelConfig->layer_names_size()); + for (const auto& layer_name : subModelConfig->layer_names()) { + auto layer_config = + std::find_if(config.layers().begin(), + config.layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.layers().end()); + layerCreate(*layer_config); + } + } else { + layers_.reserve(config.layers_size()); + for (const auto& layer_config : config.layers()) { + bool useLayer = true; + if (config.has_external_config()) { + useLayer = true; + for (const auto& name : config.external_config().layer_names()) { + if (layer_config.name() == name) { + useLayer = false; + break; + } + } + } + if (useLayer) { + layerCreate(layer_config); + } + } + } + + for (const auto& layer : layers_) { + layer->init(layerMap_, parameterMap_); + layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu); + } + + for (const auto& layer_name : + (useSubModel ? subModelConfig->input_layer_names() + : config.input_layer_names())) { + auto it = layerMap_.find(layer_name); + CHECK(it != layerMap_.end()); + dataLayers_.push_back(std::dynamic_pointer_cast(it->second)); + } + + for (const auto& layer_name : + (useSubModel ? subModelConfig->output_layer_names() + : config.output_layer_names())) { + auto it = layerMap_.find(layer_name); + CHECK(it != layerMap_.end()); + outputLayers_.push_back(it->second); + } + + for (const auto& layer : layers_) { + const auto& name = layer->getName(); + bool isMiddleLayer = true; + + // if data layer + for (const auto& dataLayer : dataLayers_) { + if (name == dataLayer->getName()) { + isMiddleLayer = false; + break; + } + } + + // if output layer + for (const auto& dataLayer : outputLayers_) { + if (name == dataLayer->getName()) { + isMiddleLayer = false; + break; + } + } + + if (isMiddleLayer) { + middleLayers_.push_back(layer); + } + } +} + +void NeuralNetwork::connect(LayerPtr agentLayer, + LayerPtr realLayer, + int height) { +#ifndef PADDLE_MOBILE_INFERENCE + AgentLayer* agent = dynamic_cast(agentLayer.get()); + CHECK_NOTNULL(agent); + agent->setRealLayer(realLayer, height); +#endif +} + +void NeuralNetwork::connect(std::string agentLayerName, + NeuralNetwork* srcNN, + std::string realLayerName) { + connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName)); +} + +void NeuralNetwork::prefetch(const std::vector& inArgs) { + CHECK_EQ(inArgs.size(), dataLayers_.size()); + + if (paramSelfInited_) { + for (auto& para : parameters_) { + if (para->isSparseRemoteUpdate()) { + auto mat = dynamic_cast( + para->getMat(PARAMETER_VALUE).get()); + para->clearGradient(); + if (mat) mat->clearIndices(); + } + } + } + + for (size_t i = 0; i != dataLayers_.size(); ++i) { + if (FLAGS_parallel_nn) { + const_cast(inArgs[i]).deviceId = -1; + } + dataLayers_[i]->setData(inArgs[i]); + } + + for (auto& layer : layers_) { + layer->prefetch(); + } + + if (paramSelfInited_) { + for (auto& para : parameters_) { + if (para->isSparseRemoteUpdate()) { + auto mat = dynamic_cast( + 
para->getMat(PARAMETER_VALUE).get()); + mat->setupIndices(); + auto matGrad = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + matGrad->reserveStore(); + } + } + } +} + +void NeuralNetwork::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + CHECK_EQ(inArgs.size(), dataLayers_.size()); + outArgs->resize(outputLayers_.size()); + for (size_t i = 0; i != dataLayers_.size(); ++i) { + dataLayers_[i]->setData(inArgs[i]); + } + + gLayerStackTrace.set_stage(true); + + { + for (auto& layer : layers_) { + REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); + gLayerStackTrace.push(layer->getName()); + layer->forward(passType); + gLayerStackTrace.pop(layer->getName()); + } + } + + outArgs->clear(); + outArgs->reserve(outputLayers_.size()); + for (auto& layer : outputLayers_) { + outArgs->push_back(layer->getOutput()); + } +} + +void NeuralNetwork::resetState() { + for (auto& layer : layers_) { + layer->resetState(); + } +} + +void NeuralNetwork::setState(const MachineState& machineState) { + for (size_t i = 0; i < layers_.size(); i++) { + if (machineState[i] != nullptr) { + layers_[i]->setState(machineState[i]); + } + } +} + +void NeuralNetwork::getState(MachineState& machineState) { + machineState.clear(); + machineState.reserve(layers_.size()); + for (auto& layer : layers_) { + LayerStatePtr p = layer->getState(); + machineState.push_back(p); + } +} + +void NeuralNetwork::backward(const UpdateCallback& callback) { + gLayerStackTrace.set_stage(false); + FOR_EACH_R(layer, layers_) { + REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); + gLayerStackTrace.push((*layer)->getName()); + if ((*layer)->needGradient()) { + (*layer)->backward(callback); + } + gLayerStackTrace.pop((*layer)->getName()); + } +} + +void NeuralNetwork::finish() { +#ifdef PADDLE_WITH_MKLDNN + FOR_EACH_R(layer, layers_) { + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); + if (dnnLayer) { + dnnLayer->convertWeightsToPaddle(); + } + } +#endif +} + +Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { + return getLayer(layerName)->getOutput(); +} + +void NeuralNetwork::onPassEnd() { + for (auto& layer : layers_) { + layer->onPassEnd(); + } +} + +void NeuralNetwork::releaseOutput() { + for (auto& layer : middleLayers_) { + Argument& arg = layer->getOutput(); + arg.value.reset(); + } +} + +#ifndef PADDLE_MOBILE_INFERENCE + +class CombinedEvaluator : public Evaluator { + public: + void addEvaluator(std::unique_ptr&& evaluator) { + evaluators_.emplace_back(std::move(evaluator)); + } + void start() override { + for (auto& evaluator : evaluators_) { + evaluator->start(); + } + } + + void finish() override { + for (auto& evaluator : evaluators_) { + evaluator->finish(); + } + } + + void eval(const NeuralNetwork& nn) override { + for (auto& evaluator : evaluators_) { + evaluator->eval(nn); + } + } + real evalImp(std::vector& arguments) override { + (void)arguments; + return -1; + } + void printStats(std::ostream& os) const override { + for (auto& evaluator : evaluators_) { + evaluator->printStats(os); + os << ' '; + } + } + + void distributeEval(ParameterClient2* client) override { + for (auto& evaluator : evaluators_) { + evaluator->distributeEval(client); + } + } + + protected: + std::vector> evaluators_; + + // Evaluator interface + public: + /** + * @brief getNames will return all inside evaluators' names. + * @param names [out]: return names. 
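+   * Names from every nested evaluator are appended in the order the
+   * evaluators were added.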
+ */ + void getNames(std::vector* names) override { + for (auto& eval : evaluators_) { + eval->getNames(names); + } + } + + /** + * @brief getValue could get all inside evaluators' value. + */ + real getValue(const std::string& name, Error* err) const override { + return this->getMethodHelper( + name, err, [&name, err](const std::unique_ptr& eval) { + return eval->getValue(name, err); + }); + } + + /** + * @brief getType could get all inside evaluators' type. + */ + std::string getType(const std::string& name, Error* err) const override { + return this->getMethodHelper( + name, err, [&name, err](const std::unique_ptr& eval) { + return eval->getType(name, err); + }); + } + + private: + template + T getMethodHelper(const std::string& name, + Error* err, + const std::function&)>& + callback) const { + for (auto& eval : evaluators_) { + std::vector names; + eval->getNames(&names); + if (std::find(names.begin(), names.end(), name) != names.end()) { + return callback(eval); + } + } + *err = Error("No such key %s", name.c_str()); + return T(); + } +}; + +class SubnetEvaluator : public CombinedEvaluator { + public: + SubnetEvaluator(const std::string& layerName, + std::unique_ptr&& evaluator) + : layerName_(layerName) { + addEvaluator(std::move(evaluator)); + } + void eval(const NeuralNetwork& nn) override { + const LayerPtr& layer = nn.getLayer(layerName_); + CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " + << nn.getName(); + bool accessed = false; + layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) { + subnet.eval(evaluators_[0].get()); + accessed = true; + }); + CHECK(accessed) << "There is no subnetwork for layer " << layerName_ + << " in submodel " << nn.getName(); + } + + protected: + std::string layerName_; +}; + +Evaluator* NeuralNetwork::makeEvaluator() const { + CombinedEvaluator* combinedEvaluator = new CombinedEvaluator(); + auto subModelConfig = std::find_if(config_.sub_models().begin(), + config_.sub_models().end(), + [=](const SubModelConfig& sub_model) { + return sub_model.name() == subModelName_; + }); + bool useSubModel = (subModelConfig != config_.sub_models().end()); + CHECK_EQ(useSubModel, !subModelName_.empty()); + if (useSubModel) { + // create the evaluators that belong to CURRENT submodel + for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) { + // find evaluator by name + auto thisEvalConfig = std::find_if( + config_.evaluators().begin(), + config_.evaluators().end(), + [=](const EvaluatorConfig& ecfg) { + return ecfg.name() == subModelConfig->evaluator_names(i); + }); + bool validConfig = (thisEvalConfig != config_.evaluators().end()); + if (validConfig) { + std::unique_ptr evaluator( + Evaluator::create(*thisEvalConfig)); + combinedEvaluator->addEvaluator(std::move(evaluator)); + } + } + for (auto& layer : layers_) { + layer->accessSubNetwork( + [layer, combinedEvaluator](NeuralNetwork& subnet) { + std::unique_ptr subEvaluator(new SubnetEvaluator( + layer->getName(), + std::unique_ptr(subnet.makeEvaluator()))); + combinedEvaluator->addEvaluator(std::move(subEvaluator)); + }); + } + } else { + for (const EvaluatorConfig& evalConfig : config_.evaluators()) { + std::unique_ptr evaluator(Evaluator::create(evalConfig)); + combinedEvaluator->addEvaluator(std::move(evaluator)); + } + } + return combinedEvaluator; +} + +void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); } + +#endif + +void NeuralNetwork::setOutputGrad(const std::vector& args) { + CHECK_GE(outputLayers_.size(), args.size()); + for 
(size_t i = 0; i < args.size(); ++i) { + outputLayers_[i]->getOutput().grad = args[i].grad; + } +} + +extern NeuralNetwork* newCustomNerualNetwork(const std::string& name, + NeuralNetwork* network) + __attribute__((weak)); + +NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name, + NeuralNetwork* rootNetwork) { + if (newCustomNerualNetwork) { + return newCustomNerualNetwork(name, rootNetwork); + } else { + return new NeuralNetwork(name, rootNetwork); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h new file mode 100644 index 0000000000000000000000000000000000000000..566157c8998a38aef4a3620a4dca7246c6e66391 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h @@ -0,0 +1,179 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" +#include "paddle/legacy/gserver/layers/CostLayer.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/utils/ClassRegistrar.h" + +namespace paddle { +/* + * @brief Init function for the parameters. + * @param paramId: the id of the parameter to init. + * @param para: the pointer to the parameter to init. + * @param sharedParams: the pointer to an array of the parameter to be shared. + * If it is null, no parameter sharing is used. + * Only CPU paramters can be shared. + * It handles CPU, CPU sparse, CPU sparse remote, + * and GPU parameters differently. If the type + * of a parameter is NORMAL. Basically nothing need to be done. + * CPU value: NORMAL. + * CPU param: NORMAL. + * + * CPU sparse value: NORMAL. + * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW. + * + * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE). + * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams) + * MAT_SPARSE_ROW_AUTO_GROW (sharedParams) + * + * GPU value: NORMAL + * GPU param: NORMAL + */ +void parameterInitNN(int paramId, + Parameter* para, + std::vector* sharedParams); + +class NeuralNetwork : public GradientMachine { + public: + virtual void init(const ModelConfig& config, + ParamInitCallback callback = nullptr, + const std::vector& parameterTypes = + std::vector{PARAMETER_VALUE, + PARAMETER_GRADIENT, + PARAMETER_MOMENTUM}, + bool useGpu = FLAGS_use_gpu); + + /** + * Connect two submodels and + * down-submodel's output become up-submodel's input. + * By default, connection is one by one, + * If the agent height is smaller than real layer, *height* has to be filled. + * + * @param realLayer The down-submodel's output layer. + * @param agentLayer The up-submodel's input agent layer. 
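+   * @param height Agent height; needs to be set only when the agent is
+   * shorter than the real layer (otherwise the default 0 is used).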
+ */ + static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0); + void connect(std::string agentLayerName, + NeuralNetwork* srcNN, + std::string realLayerName); + + virtual void prefetch(const std::vector& inArgs); + + virtual void forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType); + + virtual void backward(const UpdateCallback& callback = nullptr); + + virtual Argument getLayerOutput(const std::string& layerName); + + const LayerPtr& getLayer(const std::string& layerName) const { + auto it = layerMap_.find(layerName); + CHECK(it != layerMap_.end()) << "Unknown layer " << layerName; + return it->second; + } + + virtual void onPassEnd(); + +#ifndef PADDLE_MOBILE_INFERENCE + virtual Evaluator* makeEvaluator() const; + + virtual void eval(Evaluator* evaluator) const; +#endif + + virtual void resetState(); + virtual void setOutputGrad(const std::vector& args); + + /// set machine state + virtual void setState(const MachineState& machineState); + + /// get machine state + virtual void getState(MachineState& machineState); + + static NeuralNetwork* create(const ModelConfig& config); + + ParameterMap* getParameterMap() { return ¶meterMap_; } + + /** + * @brief Access each layer as a for each loop. + * @param callback invoke with each layer. + */ + template + void forEachLayer(T callback) { + for (auto& l : layers_) { + if (callback(l)) { + break; + } + } + } + + static NeuralNetwork* newNeuralNetwork(const std::string& name = "", + NeuralNetwork* rootNetwork = nullptr); + + const std::string& getName() const { return subModelName_; } + + /// some finish work, like convert the weight format of MKLDNNLayers + void finish(); + + /** + * @brief Release the middle layer's output memory. + * + * @note This function is used for memory optimization in inference. + */ + void releaseOutput(); + + protected: + /** + * The constructor of NeuralNetwork. + * The sub networks can get parameters_ and parameterMap_ + * from base NeuralNetwork. + * + * @param subModelName The name of sub-model. + * @param rootNetwork It used in MultiNetwork. + */ + NeuralNetwork(std::string subModelName = "", + NeuralNetwork* rootNetwork = nullptr) + : subModelName_(subModelName), rootNetwork_(rootNetwork) {} + + std::string subModelName_; + ModelConfig config_; + std::vector layers_; + ParameterMap parameterMap_; + LayerMap layerMap_; + + std::vector dataLayers_; + std::vector outputLayers_; + std::vector middleLayers_; + + static std::map dllInitMap; + + NeuralNetwork* rootNetwork_; + + /// Whether parameter of this NN is initialized by its own + /// (i.e., not by callback supplied with the caller) + bool paramSelfInited_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33d24b5b832fe9011591606860e0f50361367790 --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp @@ -0,0 +1,214 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +#include "ParallelNeuralNetwork.h" + +#include +#include + +namespace paddle { + +void ParallelNeuralNetwork::init( + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { + NeuralNetwork::init(config, callback, parameterTypes, useGpu); + + if (config.type() == "recurrent_nn") { + LOG(FATAL) + << "You can not add `--parallel_nn=true` on the command line, " + << "parallel_nn training mode does not support the recurrent_nn model."; + } + + useGpu_ = useGpu; + numDevices_ = 0; + if (useGpu_) { + numDevices_ = hl_get_device_count(); + } + + for (auto& layer : layers_) { + int deviceId = layer->getDeviceId(); + CHECK_LT(deviceId, numDevices_); + addComputeThread(deviceId); + } +} + +void ParallelNeuralNetwork::addComputeThread(int deviceId) { + for (auto& thread : threads_) { + if (thread->getDeviceId() == deviceId) { + return; + } + } + + threads_.emplace_back(new ParallelThread( + threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false)); +} + +void ParallelNeuralNetwork::waitAllThread() { + for (auto& thread : threads_) { + thread->jobEnqueue(NULL, TASK_END_LAYER); + } + + for (size_t i = 0; i < threads_.size(); i++) { + threads_[i]->queue_.waitEmpty(); + } +} + +void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId, + LayerPtr layer, + TaskType task) { + for (auto& thread : threads_) { + if (thread->getDeviceId() == deviceId) { + thread->jobEnqueue(layer, task); + return; + } + } + LOG(FATAL) << "No specific device thread "; +} + +void ParallelNeuralNetwork::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + for (auto& thread : threads_) { + thread->setForwardPassType(passType); + } + CHECK_EQ(inArgs.size(), dataLayers_.size()); + outArgs->resize(outputLayers_.size()); + for (size_t i = 0; i != dataLayers_.size(); ++i) { + const_cast(inArgs[i]).deviceId = -1; + dataLayers_[i]->setData(inArgs[i]); + } + + for (auto& layer : layers_) { + dispatchByDeviceId(layer->getDeviceId(), layer, TASK_FORWARD); + } + + { + REGISTER_TIMER("forwardTime"); + waitAllThread(); + } + outArgs->clear(); + outArgs->reserve(outputLayers_.size()); + for (auto& layer : outputLayers_) { + outArgs->push_back(layer->getOutput()); + } +} + +void ParallelNeuralNetwork::backward(const UpdateCallback& callback) { + for (auto& thread : threads_) { + thread->setBackwardCallback(callback); + } + + FOR_EACH_R(layer, layers_) { + dispatchByDeviceId((*layer)->getDeviceId(), *layer, TASK_BACKWARD); + } + { + REGISTER_TIMER("backwardTime"); + waitAllThread(); + } +} + +void ParallelNeuralNetwork::forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { + forward(inArgs, outArgs, passType); + backward(callback); +} + +void ParallelNeuralNetwork::start() { + for (auto& thread : threads_) { + thread->start(); + } +} + +ParallelThread::ParallelThread(int threadId, int deviceId, bool useGpu) + : threadId_(threadId), deviceId_(deviceId), useGpu_(useGpu) {} + +ParallelThread::~ParallelThread() { 
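+  // stop() joins the compute thread (if it is still running) before the
+  // members are destroyed.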
stop(); } + +void ParallelThread::stop() { + if (computeThread_) { + jobEnqueue(NULL, TASK_THREAD_FINISH); + computeThread_->join(); + computeThread_.reset(nullptr); + } +} + +void ParallelThread::computeThread() { + LOG(INFO) << "gradComputeThread " << threadId_; + + if (useGpu_) { + hl_init(deviceId_); + } + + while (true) { + struct Job job_work = queue_.dequeue(); + + if (job_work.task_ == TASK_END_LAYER) { + continue; + } else if (job_work.task_ == TASK_THREAD_FINISH) { + break; + } + + if (TASK_FORWARD == job_work.task_) { + { + REGISTER_TIMER_INFO("waitInputValue", + job_work.layer_->getName().c_str()); + job_work.layer_->waitInputValue(); + } + { + REGISTER_TIMER_INFO("threadForwardTimer", + job_work.layer_->getName().c_str()); + job_work.layer_->forward(passType_); + } + { + REGISTER_TIMER_INFO("copyOutputToOtherDevice", + job_work.layer_->getName().c_str()); + job_work.layer_->copyOutputToOtherDevice(); + } + } else { + { + REGISTER_TIMER_INFO("waitAndMergeOutputGrad", + job_work.layer_->getName().c_str()); + job_work.layer_->waitAndMergeOutputGrad(); + } + { + REGISTER_TIMER_INFO("threadBackwardTimer", + job_work.layer_->getName().c_str()); + job_work.layer_->backward(backwardCallback_); + } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + job_work.layer_->markAllInputGrad(); + } + } + hl_fini(); +} + +void ParallelThread::start() { + computeThread_.reset(new std::thread([this]() { computeThread(); })); +} + +void ParallelThread::jobEnqueue(LayerPtr layer, TaskType task) { + struct Job job_work; + job_work.layer_ = layer; + job_work.task_ = task; + queue_.enqueue(job_work); +} + +} // namespace paddle diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h similarity index 100% rename from paddle/gserver/gradientmachines/ParallelNeuralNetwork.h rename to paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e49f042404f80a21293545023efa3e68417c1edb --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -0,0 +1,1501 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "RecurrentGradientMachine.h" +#include +#include +#include +#include +#include +#include "NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/AgentLayer.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so"); + +static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob"; +static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob"; +static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob"; + +namespace paddle { + +/** + * Start Custom Calculate Probability callback type. + * + * @param nNode, nodes: the path will be explored. nNodes is array size. + * nodes is array elements. + * + * @return: A custom handler id that will passed to another callback. + */ +typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes); + +/** + * Doing Custom Calculation of Probability callback type. + * + * @param handler: User custom handler. The return value from start calc prob. + * @param nNode, nodes: Array. The current path. + * @param curProb: The current log probability that neural network returns. + * + * @return: Log probability which user calculated, it will be updated to this + * path. + * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!! + */ +typedef real (*DiyCalcProbCallback)( + int handler, size_t nNodes, int* nodes, real curProb, bool atEos); + +/** + * Finish Custom Calculation of Probability callback type. + * + * @param handler: User custom handler. The return value from start calc prob. + */ +typedef void (*DiyStopCalcProbCallback)(int handler); + +static DiyCalcProbCallback gDiyProbMethod = nullptr; +static DiyStartCalcProbCallback gDiyProbStart = nullptr; +static DiyStopCalcProbCallback gDiyProbStop = nullptr; +static void* gDiyProbHandle = nullptr; + +static void exit_diy_prob() { dlclose(gDiyProbHandle); } + +template +static inline SymbolType loadDiySymbol(const char* symbolName) { + void* sym = dlsym(gDiyProbHandle, symbolName); + CHECK(sym) << "Cannot load symbol " << symbolName << " from " + << FLAGS_diy_beam_search_prob_so; + return reinterpret_cast(sym); +} + +static InitFunction __init__diy_prob_method( + [] { + std::string soName = FLAGS_diy_beam_search_prob_so; + if (!soName.empty()) { + gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY); + CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName; + atexit(exit_diy_prob); + gDiyProbMethod = + loadDiySymbol(DIY_CALC_PROB_SYMBOL_NAME); + gDiyProbStart = loadDiySymbol( + DIY_START_CALC_PROB_SYMBOL_NAME); + gDiyProbStop = loadDiySymbol( + DIY_FINISH_CALC_PROB_SYMBOL_NAME); + } + }, + std::numeric_limits::max()); + +class BeamSearchControlCallbacks { + public: + RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback + beamSearchCandidateAdjust; + RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode; + RecurrentGradientMachine::DropCallback stopDetermineCandidates; + + //! for gcc46 aggregate initialization is not very well, so we need to + //! 
explicit + BeamSearchControlCallbacks( + const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback& + candidateAdjust, + const RecurrentGradientMachine::NormOrDropNodeCallback& norm, + const RecurrentGradientMachine::DropCallback& stop) + : beamSearchCandidateAdjust(candidateAdjust), + normOrDropNode(norm), + stopDetermineCandidates(stop) {} +}; + +class BeamSearchStatisticsCallbacks { + public: + RecurrentGradientMachine::EachStepCallback onEachStepStarted; + RecurrentGradientMachine::EachStepCallback onEachStepStoped; + + BeamSearchStatisticsCallbacks( + const RecurrentGradientMachine::EachStepCallback& start, + const RecurrentGradientMachine::EachStepCallback& stop) + : onEachStepStarted(start), onEachStepStoped(stop) {} +}; + +RecurrentGradientMachine::RecurrentGradientMachine( + const std::string& subModelName, NeuralNetwork* rootNetwork) + : NeuralNetwork(subModelName), + rootNetwork_(rootNetwork), + beamSearchCtrlCallbacks_(nullptr), + beamSearchStatistics_(nullptr) { + CHECK(!subModelName_.empty()); +} + +/** + * bias layer, as input of memory frame 0 will give vector of zeros + * if bias parameter is not set. + * + * boot bias layer create directly in recurrent gradient machine, because: + * + * 1. It is only one frame, so it should not be placed in layer group, + * which is one instance for every one frame. + * + * 2. It is no input layer, so it need resetHeight() before forward(), + * and resetHeight() must be called in recurrent gradient machine, + * so it's should not be placed in root network. + */ +class BootBiasLayer : public Layer { + protected: + std::unique_ptr biases_; + IVectorPtr cpuIds_; + + public: + explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + if (!Layer::init(layerMap, parameterMap)) return false; + + if (biasParameter_) { + biases_ = + std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + return true; + } + + void resetHeight(int height) { + if (config_.has_bos_id()) { // used as a constant id layerConfig + IVector::resizeOrCreate(output_.ids, height, useGpu_); + output_.ids->reset((int)config_.bos_id()); + } else { + resetOutput(height, getSize()); + } + } + + void forward(PassType passType) override { + if (biases_) { + MatrixPtr outV = getOutputValue(); + outV->addBias(*(biases_->getW()), 1); + forwardActivation(); + } + } + + void backward(const UpdateCallback& callback) override { + if (biases_ && biases_->getWGrad()) { + backwardActivation(); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + biases_->getParameterPtr()->incUpdate(callback); + } + } +}; + +void RecurrentGradientMachine::init( + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { + NeuralNetwork::init(config, callback, parameterTypes, useGpu); + useGpu_ = useGpu; + + auto subModelConfig = + std::find_if(config.sub_models().begin(), + config.sub_models().end(), + [this](const SubModelConfig& sub_model) { + return sub_model.name() == this->subModelName_; + }); + CHECK(subModelConfig != config.sub_models().end()); + reversed_ = subModelConfig->reversed(); + generating_ = subModelConfig->has_generator(); + + inFrameLines_.resize(subModelConfig->in_links_size()); + for (size_t i = 0; i < inFrameLines_.size(); ++i) { + inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name(); + inFrameLines_[i].inLayer = + rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name()); + } + + 
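+  // Each out-link pairs a layer inside the recurrent group with a gather
+  // agent layer that lives in the root network.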
outFrameLines_.resize(subModelConfig->out_links_size()); + for (size_t i = 0; i < outFrameLines_.size(); ++i) { + auto& linkPair = subModelConfig->out_links(i); + outFrameLines_[i].layerName = linkPair.layer_name(); + outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name()); + } + + memoryFrameLines_.resize(subModelConfig->memories_size()); + for (size_t i = 0; i < memoryFrameLines_.size(); ++i) { + auto& memoryConfig = subModelConfig->memories(i); + memoryFrameLines_[i].layerName = memoryConfig.layer_name(); + memoryFrameLines_[i].linkName = memoryConfig.link_name(); + auto agentConfig = + std::find_if(config.layers().begin(), + config.layers().end(), + [&memoryConfig](const LayerConfig& layerConfig) { + return layerConfig.name() == memoryConfig.link_name(); + }); + CHECK(agentConfig != config.layers().end()); + if (memoryConfig.has_boot_layer_name()) { + memoryFrameLines_[i].rootLayer = + rootNetwork_->getLayer(memoryConfig.boot_layer_name()); + + LayerConfig scatterConfig = *agentConfig; + memoryFrameLines_[i].rootAgent.reset( + new ScatterAgentLayer(scatterConfig)); + memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_); + + memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent; + } else { + LayerConfig biasConfig = *agentConfig; + if (memoryConfig.has_boot_bias_parameter_name()) { + biasConfig.set_bias_parameter_name( + memoryConfig.boot_bias_parameter_name()); + biasConfig.set_active_type(memoryConfig.boot_bias_active_type()); + } else if (memoryConfig.has_boot_with_const_id()) { + biasConfig.set_bos_id(memoryConfig.boot_with_const_id()); + } + memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig)); + memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_); + + memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer; + } + + if (subModelConfig->has_generator()) { + memoryFrameLines_[i].scatterAgents.resize(2); + for (auto& agent : memoryFrameLines_[i].scatterAgents) { + agent.reset(new ScatterAgentLayer(*agentConfig)); + agent->init(LayerMap(), parameterMap_); + } + } + } + + if (subModelConfig->has_generator()) { + generator_.config = subModelConfig->generator(); + eosFrameLine_.reset(new EosFrameLine); + maxSequenceLength_ = generator_.config.max_num_frames(); + } + + // get parameters actually used by this Layer Group + resizeOrCreateFrames(1); + for (auto& para : frames_[0]->getParameters()) { + if (para->getSharedCount() > 0) { + parameterIds_.push_back(para->getID()); + } + } + for (auto& para : parameters_) { // bias layer parameters + if (para->getSharedCount() > 0) { + parameterIds_.push_back(para->getID()); + } + } +} + +void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { + if ((size_t)numFrames <= frames_.size()) { + return; + } + + frames_.reserve(numFrames); + for (auto& inFrameLine : inFrameLines_) { + inFrameLine.agents.reserve(numFrames); + } + for (auto& outFrameLine : outFrameLines_) { + outFrameLine.frames.reserve(numFrames); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.frames.reserve(numFrames); + memoryFrameLine.agents.reserve(numFrames); + } + if (eosFrameLine_) { + eosFrameLine_->layers.reserve(numFrames); + } + + ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) { + para->enableSharedType(PARAMETER_VALUE, + this->parameters_[paramId]->getBuf(PARAMETER_VALUE), + this->parameters_[paramId]->getMat(PARAMETER_VALUE)); + para->enableSharedType( + PARAMETER_GRADIENT, + this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT), + 
this->parameters_[paramId]->getMat(PARAMETER_GRADIENT)); + }; + + for (int i = frames_.size(); i < numFrames; ++i) { + std::unique_ptr frame( + NeuralNetwork::newNeuralNetwork(subModelName_)); + frame->init(config_, subParamInitCb); + + for (auto& inFrameLine : inFrameLines_) { + inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName)); + } + + for (auto& outFrameLine : outFrameLines_) { + outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName)); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.frames.push_back( + frame->getLayer(memoryFrameLine.layerName)); + memoryFrameLine.agents.push_back( + frame->getLayer(memoryFrameLine.linkName)); + } + if (eosFrameLine_) { + eosFrameLine_->layers.push_back( + frame->getLayer(generator_.config.eos_layer_name())); + } + + frames_.emplace_back(std::move(frame)); + } +} + +void RecurrentGradientMachine::resizeBootFrame(int numSequences) { + for (auto& memoryFrameLine : memoryFrameLines_) { + if (memoryFrameLine.biasLayer) { + auto biasLayer = + dynamic_cast(memoryFrameLine.biasLayer.get()); + CHECK_NOTNULL(biasLayer); + biasLayer->resetHeight(numSequences); + } else { // check input root layer height + CHECK_EQ(numSequences, + memoryFrameLine.rootLayer->getOutput().getNumSequences()); + } + } +} + +void RecurrentGradientMachine::prefetch(const std::vector& inArgs) { + LOG(FATAL) << "should not use this function"; +} + +void RecurrentGradientMachine::checkInputConsistency( + int inlinkId, const std::vector& seqInfo) { + if (commonSeqInfo_.empty()) { + commonSeqInfo_.resize(seqInfo.size()); + for (size_t i = 0; i < seqInfo.size(); ++i) { + commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength; + commonSeqInfo_[i].seqId = seqInfo[i].seqId; + } + } else { + CHECK_EQ(commonSeqInfo_.size(), seqInfo.size()) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched number of sequences"; + for (size_t i = 0; i < seqInfo.size(); ++i) { + CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched sequence length"; + CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId) + << " RecurrentGroup " << subModelName_ << " input " << inlinkId + << " has mismatched sequence length"; + } + } +} + +void RecurrentGradientMachine::calcNumSequencesAtEachStep() { + int numSequences = commonSeqInfo_.size(); + numSeqs_.resize(maxSequenceLength_); + for (int i = 0; i < numSequences; ++i) { + for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) { + numSeqs_[j] = i + 1; + } + } +} + +void RecurrentGradientMachine::reorganizeInput(PassType passType) { + info_.clear(); + info_.resize(inFrameLines_.size()); + + commonSeqInfo_.clear(); + seqInfos_.clear(); + seqInfos_.resize(inFrameLines_.size()); + + for (size_t i = 0; i < inFrameLines_.size(); i++) { + const Argument& input = inFrameLines_[i].inLayer->getOutput(); + if (!input.hasSeq()) { + continue; + } + input.getSeqInfo(&seqInfos_[i]); + checkInputConsistency(i, seqInfos_[i]); + } + CHECK(!commonSeqInfo_.empty()) + << "At least one input needs to be sequence or subsequence"; + maxSequenceLength_ = commonSeqInfo_[0].topLevelLength; + + calcNumSequencesAtEachStep(); + + for (size_t i = 0; i < inFrameLines_.size(); ++i) { + const Argument& input = inFrameLines_[i].inLayer->getOutput(); + if (!input.hasSeq()) { + seqInfos_[i] = commonSeqInfo_; + } + createInFrameInfo(i, input, passType); + } + + { + AsyncGpuBlock asyncGpuBlock; + + // inFrameLine 
select rows in real layer one time + for (size_t i = 0; i < inFrameLines_.size(); i++) { + selectRowsOneTime(inFrameLines_[i].inLayer, + info_[i].allIds, + &(inFrameLines_[i].outArg), + passType); + } + } +} + +void RecurrentGradientMachine::reorganizeOutput(PassType passType) { + calcSequenceStartPositions(); + for (size_t i = 0; i < outFrameLines_.size(); ++i) { + Info info; + auto& outFrameLine = outFrameLines_[i]; + ICpuGpuVectorPtr sequenceStartPositions; + ICpuGpuVectorPtr subSequenceStartPositions; + createOutFrameInfo( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + CHECK_NOTNULL(gatherAgent); + gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions, + subSequenceStartPositions, + info.allIds, + info.idIndex); + } +} + +void RecurrentGradientMachine::connectFrames(PassType passType) { + for (auto& memoryFrameLine : memoryFrameLines_) { + if (memoryFrameLine.rootAgent) { + auto scatterAgent = + dynamic_cast(memoryFrameLine.rootAgent.get()); + createMemoryFrameInfo(&memoryFrameLine, passType); + scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer, + memoryFrameLine.outArg, + memoryFrameLine.allIds, + /* idIndex */ 0, + memoryFrameLine.allIds->getSize(), + /* handleBackward */ true); + if (memoryFrameLine.sequenceStartPositions) { + int size = memoryFrameLine.sequenceStartPositions->getSize(); + scatterAgent->setSequenceStartPositions( + memoryFrameLine.sequenceStartPositions, + /* seqStartPosIndex */ 0, + size); + } + } + } + + for (auto& outFrameLine : outFrameLines_) { + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + gatherAgent->clearRealLayers(); + } + for (int i = 0; i < maxSequenceLength_; ++i) { + // connect in_links + for (size_t j = 0; j < inFrameLines_.size(); ++j) { + Info& info = info_[j]; + // idSize denotes the sum number of tokens in each length i + int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i]; + int idSize = info.idIndex.empty() ? numSeqs_[i] + : info.idIndex[i + 1] - info.idIndex[i]; + InFrameLine inFrameLine = inFrameLines_[j]; + auto scatterAgent = + dynamic_cast(inFrameLine.agents[i].get()); + scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, + inFrameLine.outArg, + info.allIds, + idIndex, + idSize, + i == 0); + if (info.sequenceStartPositions) { + // size: the length of subsequence + int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; + scatterAgent->setSequenceStartPositions( + info.sequenceStartPositions, info.seqStartPosIndex[i], size); + } + } + + // connect out_links + for (auto& outFrameLine : outFrameLines_) { + auto gatherAgent = + dynamic_cast(outFrameLine.agentLayer.get()); + gatherAgent->addRealLayer(outFrameLine.frames[i]); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + NeuralNetwork::connect( + memoryFrameLine.agents[i], + i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1], + numSeqs_[i] /*height of agent*/); + } + } +} + +void RecurrentGradientMachine::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { + /* inArgs and outArgs are not used. + The inputs are inFrameLines_[i].inLayer. + The outputs are outFramesLines_[i].agentLayer + */ + + if (generating_) { + generateSequence(); + return; + } // else forward.. 
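+  // Training / evaluation path: regroup the input batch per time step,
+  // make sure enough frame networks exist, run them step by step, and
+  // finally gather the per-step outputs back into whole sequences.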
+ + reorganizeInput(passType); + int numSequences = commonSeqInfo_.size(); + + resizeOrCreateFrames(maxSequenceLength_); + resizeBootFrame(numSequences); + + connectFrames(passType); + + REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime"); + // forward + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.bootLayer->forward(passType); + } + for (int i = 0; i < maxSequenceLength_; ++i) { + const std::vector inArgs; + std::vector outArgs; + frames_[i]->forward(inArgs, &outArgs, passType); + } + + reorganizeOutput(passType); +} + +void RecurrentGradientMachine::backward(const UpdateCallback& callback) { + if (generating_) { + return; + } + REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); + AsyncGpuBlock asyncGpuBlock; + for (int i = maxSequenceLength_ - 1; i >= 0; --i) { + frames_[i]->backward(nullptr); + } + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.bootLayer->backward(nullptr); + } +} + +void RecurrentGradientMachine::forwardBackward( + const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { + LOG(FATAL) << "should not use this function"; +} + +void RecurrentGradientMachine::eval(Evaluator* evaluator) const { + // call printers frame by frame + for (int i = 0; i < maxSequenceLength_; ++i) { + VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin"; + evaluator->eval(*(frames_[i].get())); + VLOG(2) << "Recurrent Layer Group eval frame " << i << " end"; + } +} + +void RecurrentGradientMachine::registerBeamSearchControlCallbacks( + const BeamSearchCandidatesAdjustCallback& adjustBeamSearch, + const NormOrDropNodeCallback& normOrDropNode, + const DropCallback& stopBeamSearch) { + this->removeBeamSearchControlCallbacks(); + //! for gcc 46, aggregate initialization is not supported. 
TAT + this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks( + adjustBeamSearch, normOrDropNode, stopBeamSearch); +} + +void RecurrentGradientMachine::removeBeamSearchControlCallbacks() { + if (this->beamSearchCtrlCallbacks_) { + delete this->beamSearchCtrlCallbacks_; + this->beamSearchCtrlCallbacks_ = nullptr; + } +} + +void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks( + const EachStepCallback& onEachStepStarted, + const EachStepCallback& onEachStepStoped) { + this->removeBeamSearchStatisticsCallbacks(); + this->beamSearchStatistics_ = + new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped); +} + +void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { + if (this->beamSearchStatistics_) { + delete this->beamSearchStatistics_; + this->beamSearchStatistics_ = nullptr; + } +} + +namespace { +void lenToStarts(std::vector& starts) { + int pos = 0; + starts.back() = 0; + for (auto& start : starts) { + int tmp = start; + start = pos; + pos += tmp; + } + starts.back() = pos; +} +} // namespace + +void RecurrentGradientMachine::calcSequenceStartPositions() { + std::vector starts(commonSeqInfo_.size() + 1); + for (auto& seqInfo : commonSeqInfo_) { + starts[seqInfo.seqId] = seqInfo.topLevelLength; + } + lenToStarts(starts); + ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false); + std::copy(starts.begin(), + starts.end(), + sequenceStartPositions_->getMutableData(false)); +} + +void RecurrentGradientMachine::checkOutputConsistency( + OutFrameLine& outFrameLine) { + bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq(); + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + CHECK_EQ(hasSeq, frame->getOutput().hasSeq()); + int numSequences = frame->getOutput().getNumSequences(); + CHECK_EQ(numSeqs_[i], numSequences); + } +} + +void RecurrentGradientMachine::createOutFrameInfo( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + checkOutputConsistency(outFrameLine); + + if (!outFrameLine.frames[0]->getOutput().hasSeq()) { + createOutFrameInfo_seq( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + } else { + createOutFrameInfo_subseq( + outFrameLine, info, sequenceStartPositions, subSequenceStartPositions); + } +} + +void RecurrentGradientMachine::createOutFrameInfo_seq( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + std::vector allIds; + info.idIndex.resize(1, 0); // first idIndex = 0 + + const int* starts = sequenceStartPositions_->getData(false); + + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + for (size_t j = 0; j < numSequences; ++j) { + int seqStart = starts[commonSeqInfo_[j].seqId]; + int seqLength = commonSeqInfo_[j].topLevelLength; + allIds.push_back(reversed_ ? 
(seqStart + seqLength - 1 - i) + : (seqStart + i)); + } + info.idIndex.push_back(allIds.size()); + } + sequenceStartPositions = sequenceStartPositions_; + copyScattedId(allIds, &info.allIds, allIds.size()); + CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); +} + +void RecurrentGradientMachine::createOutFrameInfo_subseq( + OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions) { + size_t numSequences = commonSeqInfo_.size(); + std::vector allIds; + info.idIndex.resize(1, 0); // first idIndex = 0 + + const int* starts = sequenceStartPositions_->getData(false); + std::vector subStarts(starts[numSequences] + 1); + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + const int* seqStarts = + frame->getOutput().sequenceStartPositions->getData(false); + for (size_t j = 0; j < numSequences; ++j) { + subStarts[starts[commonSeqInfo_[j].seqId] + i] = + seqStarts[j + 1] - seqStarts[j]; + } + } + lenToStarts(subStarts); + + for (int i = 0; i < maxSequenceLength_; ++i) { + LayerPtr frame = outFrameLine.frames[i]; + size_t numSequences = frame->getOutput().getNumSequences(); + for (size_t j = 0; j < numSequences; ++j) { + int pos = starts[commonSeqInfo_[j].seqId] + i; + int subSeqStart = subStarts[pos]; + int subSeqEnd = subStarts[pos + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); + } + } + info.idIndex.push_back(allIds.size()); + } + + ICpuGpuVector::resizeOrCreate( + subSequenceStartPositions, subStarts.size(), false); + int* cpuSubSequenceStartPositions = + subSequenceStartPositions->getMutableData(false); + std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + int* cpuSequenceStartPositions = + sequenceStartPositions->getMutableData(false); + for (size_t i = 0; i <= numSequences; ++i) { + cpuSequenceStartPositions[i] = subStarts[starts[i]]; + } + copyScattedId(allIds, &info.allIds, allIds.size()); + CHECK_EQ(info.idIndex.size(), static_cast(maxSequenceLength_ + 1)); +} + +/* create scattered id infomation for all realLayer of inFrameLines one time. + * If hasSubseq, will also create scattered sequenceStartPositions infomation + * for all realLayer of inFrameLines one time. 
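+ * Plain input, single-level sequence and nested (sub)sequence inputs are
+ * dispatched to the three helpers below.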
+ */ +void RecurrentGradientMachine::createInFrameInfo(int inlinkId, + const Argument& input, + PassType passType) { + if (!input.hasSeq()) { + createInFrameInfo_nonseq(inlinkId, input, passType); + } else if (!input.hasSubseq()) { + createInFrameInfo_seq(inlinkId, input, passType); + } else { + createInFrameInfo_subseq(inlinkId, input, passType); + } +} + +void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + + auto& seqInfo = seqInfos_[inlinkId]; + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.clear(); + for (size_t i = 0; i < seqInfo.size(); ++i) { + allIds.push_back(seqInfo[i].seqId); + } + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); +} + +void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + auto& seqInfo = seqInfos_[inlinkId]; + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 + + for (int i = 0; i < maxSequenceLength_; ++i) { + for (int j = 0; j < numSeqs_[i]; ++j) { + int seqLength = seqInfo[j].topLevelLength; + int seqStart = seqInfo[j].seqStart; + allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i) + : (seqStart + i)); + } + inlinkInfo->idIndex.push_back(allIds.size()); + } + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); +} +void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId, + const Argument& input, + PassType passType) { + std::vector allIds; + + auto& seqInfo = seqInfos_[inlinkId]; + + Info* inlinkInfo = &info_[inlinkId]; + inlinkInfo->idIndex.resize(1, 0); // first idIndex = 0 + std::vector sequenceStartPositions; + const int* subSequenceStartPositions = nullptr; + + subSequenceStartPositions = input.subSequenceStartPositions->getData(false); + inlinkInfo->seqStartPosIndex.clear(); + inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 + for (int i = 0; i < maxSequenceLength_; ++i) { + sequenceStartPositions.push_back(0); // first element = 0 + for (int j = 0; j < numSeqs_[i]; ++j) { + int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i]; + int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1]; + for (int k = subSeqStart; k < subSeqEnd; ++k) { + allIds.push_back(k); + } + sequenceStartPositions.push_back(sequenceStartPositions.back() + + subSeqEnd - subSeqStart); + } + inlinkInfo->idIndex.push_back(allIds.size()); + inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size()); + } + // inFrameLine create sequenceStartPositions one time + CHECK_EQ( + sequenceStartPositions.size(), + static_cast(maxSequenceLength_ + input.getNumSubSequences())); + CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), + static_cast(maxSequenceLength_ + 1)); + createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); + + // copy and check scatterId + copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize()); + CHECK_EQ(inlinkInfo->idIndex.size(), + static_cast(maxSequenceLength_ + 1)); +} + +/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/ +void RecurrentGradientMachine::createMemoryFrameInfo( + MemoryFrameLine* memoryFrameLine, PassType passType) { + const Argument& input = (*memoryFrameLine).rootLayer->getOutput(); + size_t numSequences = 
input.getNumSequences(); + std::vector allIds; + bool seqFlag = input.hasSeq(); + CHECK(!input.hasSubseq()) + << "Subsequence boot layer for memory is not supported"; + + if (seqFlag) { // for sequenceScatterAgentLayer + std::vector sequenceStartPositions; + sequenceStartPositions.push_back(0); // first element = 0 + const int* starts = input.sequenceStartPositions->getData(false); + for (size_t i = 0; i < numSequences; ++i) { + // memory info adopt info of inlinks[0] + int seqId = seqInfos_[0][i].seqId; + for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) { + allIds.push_back(k); + } + sequenceStartPositions.push_back(sequenceStartPositions.back() + + starts[seqId + 1] - starts[seqId]); + } + createSeqPos(sequenceStartPositions, + &(*memoryFrameLine).sequenceStartPositions); + + } else { // for scatterAgentLayer + for (size_t i = 0; i < numSequences; ++i) { + allIds.push_back(seqInfos_[0][i].seqId); + } + } + // copy and check scatterId + copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize()); + // memoryFrameLine select rows in real layer one time + selectRowsOneTime((*memoryFrameLine).rootLayer, + (*memoryFrameLine).allIds, + &(*memoryFrameLine).outArg, + passType); +} + +void RecurrentGradientMachine::copyScattedId(std::vector& srcIds, + IVectorPtr* dstIds, + int size) { + int idSize = srcIds.size(); + CHECK_EQ(idSize, size); + IVector::resizeOrCreate(*dstIds, idSize, useGpu_); + (*dstIds)->copyFrom(srcIds.data(), idSize); + // check + std::sort(srcIds.begin(), srcIds.end()); + for (int i = 0; i < idSize; ++i) { + CHECK_EQ(srcIds[i], i); + } +} + +void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer, + const IVectorPtr& allIds, + Argument* arg, + PassType passType) { + Argument& src = layer->getOutput(); + if (src.value) { + const MatrixPtr& realV = src.value; + int height = realV->getHeight(); + int width = realV->getWidth(); + Matrix::resizeOrCreate( + arg->value, height, width, /* trans */ false, useGpu_); + arg->value->zeroMem(); + arg->value->selectRows(*realV, *allIds); + if (passType != PASS_TEST) { + Matrix::resizeOrCreate( + arg->grad, height, width, /* trans */ false, useGpu_); + arg->grad->zeroMem(); + } + } + if (src.ids) { + IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_); + arg->ids->selectFrom(*src.ids, *allIds); + } +} + +void RecurrentGradientMachine::createSeqPos( + const std::vector& sequenceStartPosition, + ICpuGpuVectorPtr* sequenceStartPositions) { + int size = sequenceStartPosition.size(); + const int* data = sequenceStartPosition.data(); + ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false); + (*sequenceStartPositions)->copyFrom(data, size, false); +} + +size_t RecurrentGradientMachine::getGenBatchSize() { + size_t numSequences = 0; + for (auto& memoryFrameLine : memoryFrameLines_) { + if (!memoryFrameLine.rootLayer) continue; + Argument& bootArg = memoryFrameLine.rootLayer->getOutput(); + size_t batchSize = bootArg.getNumSequences(); + if (numSequences) { + CHECK_EQ(numSequences, batchSize); + } else { + numSequences = batchSize; + } + } + CHECK(numSequences) + << "Fail to get batch size in generation. 
" + "At least one of the Memory layer MUST have a layer that is NOT in " + "the layer group to boot it, and this boot layer is used to " + "decide batch_size in generation process."; + return numSequences; +} + +void RecurrentGradientMachine::generateSequence() { + CHECK_NOTNULL(eosFrameLine_.get()); + CHECK_GE(outFrameLines_.size(), 1UL); + size_t numSequences = getGenBatchSize(); + + resizeBootFrame(numSequences); + // We create only two sub-network in generation, one stores states of all + // layers in previous time step and the other storing the states at current + // time step. + resizeOrCreateFrames(2); + + // outFrameLines_.size() > 1UL + dataArgsSize_ = outFrameLines_.size() - 1; + dataArgs_.resize(dataArgsSize_); + dataArgsFrame_.clear(); + dataArgsFrame_.resize(dataArgsSize_); + + // connect boot frame memory links + std::vector ids(numSequences); + for (size_t i = 0; i < numSequences; ++i) { + ids[i] = i; + } + for (auto& memoryFrameLine : memoryFrameLines_) { + if (memoryFrameLine.rootAgent) { + auto scatterAgent = + dynamic_cast(memoryFrameLine.rootAgent.get()); + scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids); + } + NeuralNetwork::connect( + memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); + } + + // boot layer forward + AsyncGpuBlock asyncGpuBlock; + + for (auto& memoryFrameLine : memoryFrameLines_) { + memoryFrameLine.bootLayer->forward(PASS_TEST); + } + + // init outArg + size_t resultNum = generator_.config.num_results_per_sample(); + size_t maxGenWordCount = + generator_.config.max_num_frames() * numSequences * resultNum; + IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false); + if (resultNum > 1) { + CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); + Matrix::resizeOrCreate(generator_.outArg.in, + /* height */ numSequences, + /* width */ resultNum, + false, + /* useGpu */ false); + } + ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, + numSequences + 1, + /* useGpu */ false); + if (getBeamSize() > 1) { + beamSearch(numSequences); + } else { + oneWaySearch(numSequences); + } + if (dataArgsSize_) createDataOutlink(); + + size_t size = generator_.ids.size(); + generator_.outArg.ids->resize(size); + generator_.outArg.ids->copyFrom(generator_.ids.data(), size); + + OutFrameLine& outFrameLine = outFrameLines_[0]; + auto dataAgent = dynamic_cast(outFrameLine.agentLayer.get()); + CHECK_NOTNULL(dataAgent); + dataAgent->setData(generator_.outArg); + dataAgent->prefetch(); +} + +void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { + OutFrameLine& outFrameLine = outFrameLines_[0]; + + // finalPaths_[0] stores the generated results of the + // entire batch, so its size exactly equals to batchSize. 
+ finalPaths_.clear(); + finalPaths_.resize(1); + std::vector& finalPaths = finalPaths_[0]; + finalPaths.resize(batchSize); + + seqIds_.resize(batchSize); + std::vector scatterIds; + for (size_t i = 0; i < batchSize; ++i) { + finalPaths[i].seqId = i; + seqIds_[i] = i; + } + + // forward + for (int i = 0; i < maxSequenceLength_; ++i) { + if (i && scatterIds.empty()) break; + int machineCur = i % 2; + int machinePrev = (i - 1) % 2; + // connect memory links + if (i) { + seqIds_.clear(); + for (size_t j = 0; j < batchSize; ++j) { + if (finalPaths[j].seqId != -1) seqIds_.push_back(j); + } + + for (auto& memoryFrameLine : memoryFrameLines_) { + auto scatterAgent = dynamic_cast( + memoryFrameLine.scatterAgents[machineCur].get()); + scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], + scatterIds); + scatterAgent->forward(PASS_TEST); + NeuralNetwork::connect(memoryFrameLine.agents[machineCur], + memoryFrameLine.scatterAgents[machineCur]); + } + } + const std::vector inArgs; + std::vector outArgs; + frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST); + + const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids; + for (size_t j = 0; j < seqIds_.size(); ++j) { + finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j)); + finalPaths[seqIds_[j]].machineIdVec.push_back(j); + } + + copyDataOutlinkFrame(machineCur); + + // check eos + const IVectorPtr& eosVec = + eosFrameLine_->layers[machineCur]->getOutput().ids; + scatterIds.clear(); + for (size_t j = 0; j < seqIds_.size(); ++j) { + if (eosVec->getElement(j) == 1U) { + // path.seqId = -1 indicates end of generation + // of an input sequence + finalPaths[seqIds_[j]].seqId = -1; + } else { + scatterIds.push_back(j); + } + } + } + + batchMachineIdVec_.clear(); + batchMachineStartPos_.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; + generator_.ids.clear(); + for (size_t i = 0; i < batchSize; ++i) { + generator_.ids.insert(generator_.ids.end(), + finalPaths[i].ids.begin(), + finalPaths[i].ids.end()); + starts[i + 1] = generator_.ids.size(); + batchMachineIdVec_.insert(batchMachineIdVec_.end(), + finalPaths[i].machineIdVec.begin(), + finalPaths[i].machineIdVec.end()); + } +} + +void RecurrentGradientMachine::connectPrevFrame(int stepId, + std::vector& paths) { + int machineCur = stepId % 2; + int machinePrev = (stepId - 1) % 2; + int beam = getBeamSize(); + machineIds_.clear(); + topIds_.clear(); + seqIds_.clear(); + + for (size_t j = 0; j < paths.size(); ++j) { + machineIds_.push_back(paths[j].machineId); + topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex); + seqIds_.push_back(paths[j].seqId); + } + + for (auto& memoryFrameLine : memoryFrameLines_) { + bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName); + auto scatterAgent = dynamic_cast( + memoryFrameLine.scatterAgents[machineCur].get()); + scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], + isOutIds ? 
topIds_ : machineIds_); + scatterAgent->forward(PASS_TEST); + NeuralNetwork::connect(memoryFrameLine.agents[machineCur], + memoryFrameLine.scatterAgents[machineCur]); + } +} + +void RecurrentGradientMachine::forwardFrame(int machineCur) { + // forward + const std::vector inArgs; + std::vector outArgs; + frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST); + + copyDataOutlinkFrame(machineCur); + + IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids; + MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in; + IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids; + if (useGpu_) { + IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */); + cpuId_->copyFrom(*ids); + Matrix::resizeOrCreate(cpuProb_, + in->getHeight(), + in->getWidth(), + false /* trans */, + false /* useGpu */); + cpuProb_->copyFrom(*in); + IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */); + cpuEos_->copyFrom(*eos); + } else { + cpuId_ = ids; + cpuProb_ = in; + cpuEos_ = eos; + } +} + +void RecurrentGradientMachine::singlePathExpand(Path& curPath, + size_t curPathId, + std::vector& newPaths, + size_t expandWidth) { + int calc_id = + gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0; + + const int* idVec = cpuId_->getData(); + const real* probMat = cpuProb_->getData(); + const int* eosVec = cpuEos_->getData(); + + for (size_t k = 0; k < expandWidth; k++) { + int index = curPathId * expandWidth + k; + int id = idVec[index]; + real prob = probMat[index]; + /* + * Ordinarily, beam search greedily expands the most promising expandWidth + * paths that currently are ALWAYS returned by MaxIdLayer. + * In one condition, if user customizes the beam search procedure by + * restricting the expansion within a user defined subset, + * as a result, MaxIdLayer possibly COULD NOT return expandWidth + * vaild expansions, and it will use -1 to indicate the end of valid + * expansion candidates. + */ + if (id == -1) break; + + real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; + Path newPath( + curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/); + if (this->beamSearchCtrlCallbacks_) { + if (beamSearchCtrlCallbacks_->stopDetermineCandidates( + newPath.seqId, newPath.ids, newPath.probHistory)) + return; + } + // outFrameLines_.size() > 1UL + if (dataArgsSize_) { + newPath.machineIdVec = curPath.machineIdVec; + newPath.machineIdVec.push_back(curPathId); + } + bool atEos = + eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_; + // adjustNewPath + newPath.adjustProb(calc_id, atEos); + if (this->beamSearchCtrlCallbacks_) { + this->beamSearchCtrlCallbacks_->normOrDropNode( + newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb); + } + if (!newPath.isDropable()) { + atEos ? finalPaths_[curPath.seqId].push_back(newPath) + : newPaths.push_back(newPath); + } + } // for expandWidth + + if (gDiyProbStop) { + gDiyProbStop(calc_id); + } +} + +void RecurrentGradientMachine::beamExpand(std::vector& paths, + std::vector& newPaths) { + size_t candidatePathCount = paths.size(); + // idVec.size() could be larger than candidatePathCount * beam, + // so user can drop some node customly. 
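+  //
+  // Worked example (hypothetical numbers): with a beam size of 3 and two
+  // candidate paths, cpuId_ holds 6 entries, so expandWidth = 6 / 2 = 3 and
+  // path j owns entries [3 * j, 3 * j + 3); entries equal to -1 mark
+  // candidates suppressed by a user-restricted expansion.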
+ CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL); + size_t expandWidth = cpuId_->getSize() / candidatePathCount; + + // iterate over each sequence + size_t totalExpandCount = 0; + int prevSeqId = -1; + int curSeqId = 0; + for (size_t j = 0; j <= candidatePathCount; j++) { + // expansions of a single sequence are all processed + curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1); + if (prevSeqId != -1 && curSeqId != prevSeqId) { + totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount); + } + if (j == candidatePathCount) return; + singlePathExpand(paths[j], j, newPaths, expandWidth); + + prevSeqId = paths[j].seqId; + } // for paths +} + +// Drop extra nodes to beam size. +size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount) { + size_t minNewPathSize = + std::min(getBeamSize(), newPaths.size() - totalExpandCount); + if (!minNewPathSize) { + return 0; + } + std::nth_element(newPaths.begin() + totalExpandCount, + newPaths.begin() + totalExpandCount + minNewPathSize, + newPaths.end(), + Path::greaterPath); + newPaths.resize(totalExpandCount + minNewPathSize); + + real minPathLogProb = + std::min_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; + real maxPathLogProb = + std::max_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; + + // Remove the already formed paths that are relatively short + finalPaths_[seqId].erase( + std::remove_if(finalPaths_[seqId].begin(), + finalPaths_[seqId].end(), + [&](Path& p) { return p.logProb < minPathLogProb; }), + finalPaths_[seqId].end()); + for (auto p : finalPaths_[seqId]) { + if (minFinalPathLogProb_[seqId] > p.logProb) { + minFinalPathLogProb_[seqId] = p.logProb; + } + } + + if (finalPaths_[seqId].size() >= getBeamSize() && + minFinalPathLogProb_[seqId] >= maxPathLogProb) { + newPaths.resize(totalExpandCount); + return 0; + } + return minNewPathSize; +} + +void RecurrentGradientMachine::fillGenOutputs() { + size_t numResults = generator_.config.num_results_per_sample(); + for (size_t i = 0; i < finalPaths_.size(); ++i) { + size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size()); + std::partial_sort(finalPaths_[i].begin(), + finalPaths_[i].begin() + minFinalPathsSize, + finalPaths_[i].end(), + Path::greaterPath); + finalPaths_[i].resize(minFinalPathsSize); + } + + generator_.ids.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; + if (numResults > 1) { + int idsProbSaveSize = 0; + for (auto inSeq : finalPaths_) { + for (auto path : inSeq) idsProbSaveSize += path.ids.size(); + idsProbSaveSize += inSeq.size(); + } + Matrix::resizeOrCreate( + generator_.outArg.value, idsProbSaveSize, 1, false, false); + real* idsProb = generator_.outArg.value->getData(); + + real* probs = generator_.outArg.in->getData(); + size_t curPos = 0; + for (size_t i = 0; i < finalPaths_.size(); ++i) { + for (size_t j = 0; j < finalPaths_[i].size(); ++j) { + Path& path = finalPaths_[i][j]; + size_t genLen = path.ids.size(); + generator_.ids.push_back(genLen); // sequence size + generator_.ids.insert( + generator_.ids.end(), path.ids.begin(), path.ids.end()); + generator_.ids.push_back(-1); // end of sequence + + memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen); + curPos += genLen; + idsProb[curPos++] = -1.0; + probs[i * numResults + j] = path.logProb; + } + starts[i + 1] = generator_.ids.size(); + } + } else { + for (size_t i = 0; i < finalPaths_.size(); ++i) { + 
CHECK(!finalPaths_[i].empty()); + Path& path = finalPaths_[i][0]; + generator_.ids.insert( + generator_.ids.end(), path.ids.begin(), path.ids.end()); + starts[i + 1] = starts[i] + path.ids.size(); + } + } +} + +void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { + for (size_t i = 0; i < dataArgsSize_; i++) { + Argument outFrame; + outFrame.resizeAndCopyFrom( + outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_); + dataArgsFrame_[i].emplace_back(outFrame); + } +} + +void RecurrentGradientMachine::createDataOutlinkSelRowsInfo( + bool isSeq, std::vector& outArgs) { + batchMachineIdVec_.clear(); + + size_t seqIdx = 0; + for (size_t i = 0; i < finalPaths_.size(); ++i) { + for (size_t j = 0; j < finalPaths_[i].size(); ++j) { + std::vector& machineIdVec = finalPaths_[i][j].machineIdVec; + if (isSeq) { + for (size_t i = 0; i < machineIdVec.size(); ++i) { + size_t rowId = machineIdVec[i]; + int* seqPos = + outArgs[i].sequenceStartPositions->getMutableData(false); + batchMachineIdVec_.push_back(seqPos[rowId]); + } + } else { + batchMachineIdVec_.insert( + batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end()); + } + seqIdx++; + } + } +} + +void RecurrentGradientMachine::createDataOutlinkCopySizeInfo( + bool isSeq, std::vector& outArgs, std::vector& copySize) { + size_t totalSeqNum = std::accumulate( + finalPaths_.begin(), + finalPaths_.end(), + 0UL, + [](size_t a, const std::vector& b) { return a + b.size(); }); + copySize.resize(totalSeqNum, 1); + + batchMachineStartPos_.resize(totalSeqNum + 1, 0); + if (isSeq) { + ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions; + CHECK_EQ(static_cast(inputSeqStartPos->getSize() - 1), + getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size()); + int* starts = inputSeqStartPos->getMutableData(false); + int seqId = 0; + for (size_t i = 0; i < finalPaths_.size(); ++i) { + for (size_t j = 0; j < finalPaths_[i].size(); ++j) { + copySize[seqId] = getBeamSize() > 1 ? 
starts[i + 1] - starts[i] + : starts[j + 1] - starts[j]; + batchMachineStartPos_[seqId + 1] = + batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size(); + seqId++; + } + } + } else { + for (size_t i = 0; i < finalPaths_[0].size(); ++i) + batchMachineStartPos_[i + 1] = + batchMachineStartPos_[i] + finalPaths_[0][i].ids.size(); + } +} + +void RecurrentGradientMachine::createDataOutlink() { + for (size_t i = 0; i < dataArgsSize_; i++) { + bool isSeq = dataArgsFrame_[i][0].hasSeq(); + std::vector copySize; + createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize); + createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]); + + dataArgs_[i].concat(dataArgsFrame_[i], + batchMachineIdVec_, + batchMachineStartPos_, + copySize, + useGpu_, + HPPL_STREAM_1, + PASS_TEST); + auto dataAgent = + dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); + CHECK_NOTNULL(dataAgent); + dataAgent->setData(dataArgs_[i]); + } +} + +void RecurrentGradientMachine::beamSearch(size_t batchSize) { + finalPaths_.clear(); + finalPaths_.resize(batchSize); + seqIds_.resize(batchSize); + minFinalPathLogProb_.clear(); + minFinalPathLogProb_.resize(batchSize, 0); + + std::vector paths; + std::vector newPaths; + for (size_t i = 0; i < batchSize; ++i) { + paths.push_back(Path(i)); + if (this->beamSearchCtrlCallbacks_) { + paths.back().recordHistory(); + } + } + + // restart beam search + stopBeamSearch_ = false; + for (int i = 0; i < maxSequenceLength_; ++i) { + int machineCur = i % 2; + std::unique_ptr< + ScopedCallbacks> + statisticsBlock; + if (this->beamSearchStatistics_) { + auto ptr = + new ScopedCallbacks(beamSearchStatistics_->onEachStepStarted, + beamSearchStatistics_->onEachStepStoped, + i); + statisticsBlock.reset(ptr); + } + if (stopBeamSearch_) break; + + if (i) connectPrevFrame(i, paths); + + if (this->beamSearchCtrlCallbacks_) { + std::vector*> prefixes; + prefixes.resize(paths.size()); + std::transform( + paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) { + return const_cast*>(&p.ids); + }); + beamSearchCtrlCallbacks_->beamSearchCandidateAdjust( + prefixes, frames_[machineCur].get(), i); + } + + forwardFrame(machineCur); + beamExpand(paths, newPaths); + if (newPaths.empty()) break; + + paths = newPaths; + newPaths.clear(); + } // end for machineCur + fillGenOutputs(); +} + +void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) { + if (gDiyProbMethod) { + logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h new file mode 100644 index 0000000000000000000000000000000000000000..0a13d4f6f84eb5309a1b25f039357cb8af02c35e --- /dev/null +++ b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h @@ -0,0 +1,580 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include <functional>
+#include "GradientMachine.h"
+#include "NeuralNetwork.h"
+
+#include "paddle/legacy/utils/Locks.h"
+
+namespace paddle {
+
+/**
+ * Private data class declarations.
+ * Used for user customized beam search.
+ */
+class BeamSearchControlCallbacks;
+class BeamSearchStatisticsCallbacks;
+
+class RecurrentGradientMachine : public NeuralNetwork {
+ public:
+  RecurrentGradientMachine(const std::string& subModelName,
+                           NeuralNetwork* rootNetwork);
+
+  // Disable copy and assign.
+  RecurrentGradientMachine(const RecurrentGradientMachine& other) = delete;
+  RecurrentGradientMachine& operator=(const RecurrentGradientMachine& other) =
+      delete;
+
+  virtual ~RecurrentGradientMachine() {
+    this->removeBeamSearchStatisticsCallbacks();
+    this->removeBeamSearchControlCallbacks();
+  }
+
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback,
+                    const std::vector<ParameterType>& parameterTypes,
+                    bool useGpu);
+
+  virtual void prefetch(const std::vector<Argument>& inArgs);
+
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType);
+
+  virtual void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardBackward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback);
+
+  virtual void resetState() {}
+  virtual void eval(Evaluator* evaluator) const;
+
+  const std::vector<int>& getParameterIds() { return parameterIds_; }
+
+  /**
+   * @brief BeamSearchCandidatesAdjustCallback
+   *
+   * Adjust the search candidates to restrict beam search
+   * to a limited subset of all possible paths.
+   *
+   * The first parameter is the prefixes of all formed paths in the current
+   * beam search step, whose type is basically int[][].
+   *
+   * The second parameter is a pointer to the network used to generate the
+   * sequence; the user can use this pointer to traverse each layer in the
+   * network to modify the behavior of a particular layer.
+   *
+   * The third parameter is an integer to indicate the iteration number of
+   * beam search, so that the user can customize different operations in
+   * different beam search iterations.
+   */
+  typedef std::function<void(
+      const std::vector<std::vector<int>*>&, NeuralNetwork*, const int)>
+      BeamSearchCandidatesAdjustCallback;
+
+  /**
+   * @brief DropCallback
+   *
+   * Drop a whole prefix or one candidate in beam search or not.
+   *
+   * The first parameter is the sequence index in a batch.
+   *
+   * The second parameter is one path in beam search,
+   * which is made up of node indices.
+   *
+   * The third parameter is the probabilities for each node in this path.
+   *
+   * Return true if this prefix or candidate is expected to be dropped.
+   */
+  typedef std::function<bool(
+      int seqId, const std::vector<int>&, const std::vector<real>&)>
+      DropCallback;
+
+  /**
+   * @brief NormOrDropNodeCallback
+   *
+   * Normalize a path's probabilities or just drop it by modifying
+   * path.logProb.
+   *
+   * The first parameter is the sequence index in a batch.
+   *
+   * The second parameter is path.ids.
+   *
+   * The third parameter is the probabilities for each node in this path.
+   *
+   * The fourth parameter is the probability of the whole path.
+   */
+  typedef std::function<void(
+      int seqId, std::vector<int>&, std::vector<real>&, real*)>
+      NormOrDropNodeCallback;
+
+  /**
+   * @brief Register beam search control callbacks. Used for prediction.
+   *
+   * @param queryBeamSearch: Give the sequences already formed, return the
+   * nodes expected to be expanded.
+   * Input: A pointer to an array holding paths which have been expanded.
+   * Return: A pointer to an array holding nodes wanted to be expanded.
+   *
+   * @param dropOneNode: Early drop a node in one beam search step.
+ * Given the path formed and probability history, decide whether a node + * should be dropped or not. + * + * @param stopBeamSearch: Early stop a path in one beam search step. + * Given the path and probability history, decide whether a path + * should be dropped or not. + */ + void registerBeamSearchControlCallbacks( + const BeamSearchCandidatesAdjustCallback& adjustBeamSearch, + const NormOrDropNodeCallback& normOrDropNode, + const DropCallback& stopBeamSearch); + + /** + * @brief Remove user costumized beam search callbacks, + * + * make sequence generation acts like normal beam search. + */ + void removeBeamSearchControlCallbacks(); + + /** + * @brief EachStepCallback + * + * Invoke with beam search step. + */ + typedef std::function EachStepCallback; + + /** + * @brief register statistics methods for performance profile of beam search. + * + * @param onEachStepStarted: invoke once a beam search step starts. + * Its input is index of the beam search step. + * + * @param onEachStepStoped: invoke once a beam search step ends. + * Its input is index of the beam search step. + */ + void registerBeamSearchStatisticsCallbacks( + const EachStepCallback& onEachStepStarted, + const EachStepCallback& onEachStepStoped); + + /** + * @brief Remove beam search callbacks. + */ + void removeBeamSearchStatisticsCallbacks(); + + /** + * @brief Stop beam search for current source. + * + * Will restart beam search in the next forward + */ + void stopBeamSearch(); + + struct Path { + /** + * @brief ids, path of beam search. + */ + std::vector ids; + + /** + * @brief idsProb, log probability of each generated word. + */ + std::vector idsProb; + + /** + * @brief logProb, current probability of path. + */ + real logProb; + + int machineId; // index of sample in frame + int topIndex; // index of MaxIdLayer output in one sample + int seqId; // index of sequence in batch generation + std::vector machineIdVec; + + /** + * @brief A record of each node's probality in a formed path in beam search. + * + * @note It could be empty when history is not recorded. If the history is + * wanted to be recorded, recordHistory() MUST be invoked first. + */ + std::vector probHistory; + + /** + * @brief Path default ctor, first logProb is 0. + */ + Path() { + logProb = 0; + seqId = 0; + } + explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; } + + /** + * @brief Create a new path based on an old path and + * a new node with probability. + * + * @param old old path + * @param newId index of the new node + * @param logProb probability of the new node. + * @param machineId sample index of a frame in RNN + * @param topIndex index of MaxIdLayer output in one sample + */ + Path(Path& old, int newId, real logProb, int machineId, int topIndex) + : ids(old.ids), + idsProb(old.idsProb), + logProb(old.logProb + logProb), + machineId(machineId), + topIndex(topIndex), + seqId(old.seqId) { + ids.push_back(newId); + idsProb.push_back(logProb); + if (!old.probHistory.empty()) { + this->probHistory = old.probHistory; + // probHistory store current prob, not sum + this->probHistory.push_back(logProb); + } + } + + /** + * @brief operator < + * + * Path a < Path b means log probability of a is smaller than that of b + */ + bool operator<(const Path& other) const { + return (logProb < other.logProb); + } + + static bool greaterPath(const Path& a, const Path& b) { return (b < a); } + + /** + * @brief Start recording history in this path. 
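+     *
+     * A minimal usage sketch (this mirrors how beamSearch() seeds its
+     * initial paths):
+     * @code
+     *   Path path(seqId);
+     *   path.recordHistory();  // probHistory is tracked from now on
+     * @endcode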
+ */ + void recordHistory() { this->probHistory.push_back(this->logProb); } + + /** + * @brief Adjust probability for DIY beam search interface. + * In normal situation, it will do nothing. + * + * @param calc_id: the object id for DIY beam search interface. + * @param atEos: at end of sequence or not. + */ + void adjustProb(int calc_id, bool atEos = false); + + /** + * @brief isDropable indacating whether the current node will be + * dropped or not in beam search. + * + * @note: if logProb is -inf, current node will be dropped. + * @return true to drop the current node. + */ + bool isDropable() const { return std::isinf(logProb) && logProb < 0; } + }; + + /** + * @brief access beam search results. + * @return beam search results. + */ + const std::vector>& getFinalPaths() const { + return this->finalPaths_; + } + + protected: + std::vector commonSeqInfo_; + ICpuGpuVectorPtr sequenceStartPositions_; + void calcSequenceStartPositions(); + void checkInputConsistency(int inlinkId, + const std::vector& seqInfo); + void reorganizeInput(PassType passType); + void reorganizeOutput(PassType passType); + void connectFrames(PassType passType); + void calcNumSequencesAtEachStep(); + + void resizeOrCreateFrames(int numFrames); + void resizeBootFrame(int numSequences); + + void generateSequence(); + void oneWaySearch(size_t batchSize); + void beamSearch(size_t batchSize); + + struct InFrameLine { + std::string linkName; + LayerPtr inLayer; + std::vector agents; // Scatter Agents to reform batch input + Argument outArg; // scatter output argument + }; + std::vector inFrameLines_; + + struct OutFrameLine { + std::string layerName; + LayerPtr agentLayer; + std::vector frames; + }; + std::vector outFrameLines_; + + struct MemoryFrameLine { + std::string layerName; + std::string linkName; + LayerPtr bootLayer; // actually used biasLayer or rootAgent + LayerPtr biasLayer; + LayerPtr rootLayer; // layer in root network to boot this memory + LayerPtr rootAgent; // agent to link rootLayer + std::vector frames; + std::vector agents; + std::vector scatterAgents; // scatter agent used by beam search + Argument outArg; // scatter output argument + // Different memoryFrameLine have different element as follows + IVectorPtr allIds; // scattered id of realLayer + ICpuGpuVectorPtr + sequenceStartPositions; // scattered sequenceStartPositions + }; + std::vector memoryFrameLines_; + + // Each inFrameLines(inlinks) has its own info(elements) below, + // and all outFrameLines(outlinks) share the info with one inFrameLine, + // which is assigned by targetInfoInlinkId_. + struct Info { + // The original positions in the original batch + IVectorPtr allIds; // scattered id of realLayer [batchSize] + + // index of allIds for each step [maxSequenceLength_] + // idIndex[i] is the total length of the first i sequences + std::vector idIndex; + + ICpuGpuVectorPtr + sequenceStartPositions; // scattered sequenceStartPositions + std::vector seqStartPosIndex; // index of sequenceStartPositions + }; + std::vector info_; // for input + + // numSeqs_[i] is the number sequences which is longer than i (for sequence + // data) or has more than i subsequences (for subsequence data) + // Equivalently, numSeqs_[i] is the number of sequences at step i; + std::vector numSeqs_; + + std::vector> seqInfos_; + + void checkOutputConsistency(OutFrameLine& outFrameLine); + + /* create scattered id infomation for all realLayer of inFrameLines one time. 
+ * If hasSubseq, will also create scattered sequenceStartPositions infomation + * for all realLayer of inFrameLines one time. + */ + void createInFrameInfo(int inlinks_id, + const Argument& input, + PassType passType); + void createInFrameInfo_nonseq(int inlinks_id, + const Argument& input, + PassType passType); + void createInFrameInfo_seq(int inlinks_id, + const Argument& input, + PassType passType); + void createInFrameInfo_subseq(int inlinks_id, + const Argument& input, + PassType passType); + + void createOutFrameInfo(OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions); + void createOutFrameInfo_seq(OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions); + void createOutFrameInfo_subseq(OutFrameLine& outFrameLine, + Info& info, + ICpuGpuVectorPtr& sequenceStartPositions, + ICpuGpuVectorPtr& subSequenceStartPositions); + + void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, + PassType passType); + + void copyScattedId(std::vector& srcIds, IVectorPtr* dstIds, int size); + + void selectRowsOneTime(LayerPtr layer, + const IVectorPtr& allIds, + Argument* arg, + PassType passType); + + void createSeqPos(const std::vector& sequenceStartPosition, + ICpuGpuVectorPtr* sequenceStartPositions); + + // for generator + struct EosFrameLine { + std::vector layers; + }; + std::unique_ptr eosFrameLine_; + + struct Generator { + GeneratorConfig config; + std::vector ids; // store generated sequences + std::vector idsProb; // log probability of each generated word + Argument outArg; // final output argument + }; + bool generating_; + Generator generator_; + + std::vector> frames_; + + NeuralNetwork* rootNetwork_; + bool reversed_; + + int maxSequenceLength_; // Max top-level length + bool useGpu_; + bool stopBeamSearch_; + + std::vector + parameterIds_; // parameters actually used by this Layer Group + + // store final argument of outFrameLines_ + std::vector dataArgs_; + // store each frame's output argument of outFrameLines_ + std::vector> dataArgsFrame_; + size_t dataArgsSize_; // size of dataArgs_ = size of dataArgsFrame_ + + IVectorPtr cpuId_; + MatrixPtr cpuProb_; + IVectorPtr cpuEos_; + + private: + /* + * @return beam size in beam search + */ + size_t getBeamSize() { return generator_.config.beam_size(); } + + /* + * @return number of sequence in a batch in generation + */ + size_t getGenBatchSize(); + + /* + * @brief store output of the machineCur-th frame during generation, for + * creating the final outlink after the entire generation process is finished. + * + * In generation, if the layer group has more than 1 outlink, the first + * one is reserved to store the generated word indices, the others are data + * outlinks, that can be used like a common layer in the network. + * + * @param machineCur : index to access the layer group frame in + * currrent generation step. + */ + void copyDataOutlinkFrame(size_t machineCur); + + /* + * @brief In generation, if the layer group has more than 1 outlink, outlink + * except the first one is a data outlink. In RecurrentLayerGroup, each time + * step is a separate Network, outputs of a layer inside the + * RecurrentLayerGroup are stored in separate Arguments. If one layer is + * specified as an outlink of RecurrentLayerGroup. 
This function will + * collect outputs in each time step of each generated sequence which are + * dispersed in separate Arguments to form a new single Argument as output of + * RecurrentLayerGroup. + */ + void createDataOutlink(); + + /* + * @brief decide to select how many rows from the Matrix stored the forward + * pass results from a start position. + * + * @param isSeq: a flag indicating whetehr the layer to be output of the + * RecurrentGradientMachine is a sequence or not + * @param outArgs: all of the the returned Arguments of the forward pass + * during the generation process. + * @param copySize: the returned result, number of rows to select from the + * Matrix stored the forward pass results from a start position. + */ + void createDataOutlinkCopySizeInfo(bool isSeq, + std::vector& outArgs, + std::vector& copySize); + + /* + * @brief decide index of the start row for each time step of a generated + * sequence in Matrix stored the entire beam search batch's forward pass + * results. + * + * @param isSeq: a flag indicating whether the layer to be output of the + * RecurrentGradientMachine is a sequence or not + * @param outArgs: all of the returned Arguments of the forward pass + * during the generation process. + */ + void createDataOutlinkSelRowsInfo(bool isSeq, std::vector& outArgs); + + /* + * @brief used in beam search, connect previous frame to form recurrent link + * @param stepId : iteration number of generation process. + * It equals to the length of longest half-generated sequence. + * @param paths : half-generated paths that are going to be expanded + * in current beam search iteration. + */ + void connectPrevFrame(int stepId, std::vector& paths); + + /* + * @brief used in beam search, forward current recurrent frame + * @param machineCur : index to access the layer group frame in + * currrent generation step. + */ + void forwardFrame(int machineCur); + + /* + * @brief reduce all expanded paths to beam size. + * + * @param newPaths : newPaths[totalExpandCount : ] stores all expanded paths + * for the seqId-th sequence + * @param seqId : sequence index in a batch + * @param totalExpandCount : number of already shrinked paths in newPaths + * @return size of retained paths at the end of a beam search iteration + */ + size_t beamShrink(std::vector& newPaths, + size_t seqId, + size_t totalExpandCount); + + /* + * @brief expand a single path to expandWidth new paths + * with highest probability + * @param curPath : path to be expanded + * @param curPathId : index of curPath in member newPaths + * @param expandWidth : number of paths to be expanded + */ + void singlePathExpand(Path& curPath, + size_t curPathId, + std::vector& newPaths, + size_t expandWidth); + + /* + * @brief A new beam search iteration. Each half-generated paths in previous + * beam search iteration are further expanded to beam_size new paths + * with highest probabilities, and then all the expanded paths are again + * reduced to beam_size paths according to their log probabilities. + * @param paths : half-generated paths in previous iteration. + * @param newPaths : paths expanded and then reduces in current iteration. + */ + void beamExpand(std::vector& paths, std::vector& newPaths); + + /* + * @brief fill sequence start positions and some other information that are + * uesed by the "text_printer" evaluator. 
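+   *
+   * With num_results_per_sample > 1, the id stream this function writes is
+   * laid out per generated path as {length, id_0, ..., id_{length-1}, -1},
+   * and sequenceStartPositions records where each source sequence's results
+   * begin (see the implementation in RecurrentGradientMachine.cpp).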
+ */ + void fillGenOutputs(); + + std::vector machineIds_; + std::vector topIds_; + std::vector seqIds_; + std::vector batchMachineIdVec_; + std::vector batchMachineStartPos_; + std::vector> finalPaths_; + std::vector minFinalPathLogProb_; + BeamSearchControlCallbacks* beamSearchCtrlCallbacks_; + BeamSearchStatisticsCallbacks* beamSearchStatistics_; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/AddtoLayer.cpp b/paddle/legacy/gserver/layers/AddtoLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..39c5603d9389b433b77e2876f34b3061c62f68f0 --- /dev/null +++ b/paddle/legacy/gserver/layers/AddtoLayer.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "AddtoLayer.h" + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(addto, AddtoLayer); + +bool AddtoLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + return true; +} + +void AddtoLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(0)->getHeight(); + int size = getSize(); + + reserveOutput(batchSize, size); + + MatrixPtr outV = getOutputValue(); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + MatrixPtr input = getInputValue(i); + i == 0 ? outV->assign(*input) : outV->add(*input); + } + /* add the bias-vector */ + if (biases_.get() != NULL) { + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ { forwardActivation(); } +} + +void AddtoLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + /* Calculate the input layers error */ + MatrixPtr preGrad = getInputGrad(i); + if (NULL != preGrad) { + preGrad->add(*getOutputGrad()); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/AddtoLayer.h b/paddle/legacy/gserver/layers/AddtoLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..ad3cefe1a4d27953b2fef535e1b865175a2cadc2 --- /dev/null +++ b/paddle/legacy/gserver/layers/AddtoLayer.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * This layer just simply add all input layers together, then activate + * the sum inputs. Each input of this layer should be the same size, + * which is also the output size of this layer. + * \f[ + * y=f(\sum_{i}x_i + b) + * \f] + * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is + * activation function. + * + * The config file api is addto_layer. + */ +class AddtoLayer : public Layer { + protected: + std::unique_ptr biases_; + + public: + explicit AddtoLayer(const LayerConfig& config) : Layer(config) {} + + ~AddtoLayer() {} + + /** + * Intialization of AddtoLayer. + */ + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + /** + * Forward propagation. + * @note There is no weight matrix for each input, + * because it just a simple add operation. + */ + void forward(PassType passType) override; + + /** + * Backward propagation. + */ + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/AgentLayer.cpp b/paddle/legacy/gserver/layers/AgentLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bae89b2fa34d156adae1305d78d6c1465ccdd0ae --- /dev/null +++ b/paddle/legacy/gserver/layers/AgentLayer.cpp @@ -0,0 +1,281 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "AgentLayer.h" + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(agent, AgentLayer); + +bool AgentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CHECK_EQ(config_.inputs_size(), 0); + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + setNeedGradient(true); + return true; +} + +void AgentLayer::forward(PassType passType) { + Layer::forward(passType); + + Argument& realOutput = realLayer_->getOutput(); + int realNumSequences = realOutput.getNumSequences(); + CHECK_LE(numSamples_, realNumSequences); + + // get Arguments from real layers + if (numSamples_ > 0 && numSamples_ < realNumSequences) { + if (realOutput.hasSeq()) { + int numRows = + realOutput.sequenceStartPositions->getData(false)[numSamples_]; + output_.subArgFrom(realOutput, + /* offset */ 0, + numRows, + getSize(), + useGpu_, + /* trans */ false, + /* seqFlag */ true, + /* seqStart */ 0, + /* seqSize */ numSamples_ + 1); + } else { + output_.subArgFrom( + realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); + } + } else { + output_ = realOutput; + } +} + +bool GatherAgentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CHECK_EQ(config_.inputs_size(), 0); + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + setNeedGradient(true); + return true; +} + +void GatherAgentLayer::copyIdAndSequenceInfo( + ICpuGpuVectorPtr sequenceStartPositions, + ICpuGpuVectorPtr subSequenceStartPositions, + const IVectorPtr& ids, + const std::vector& idIndex) { + output_.sequenceStartPositions = sequenceStartPositions; + output_.subSequenceStartPositions = subSequenceStartPositions; + allIds_ = ids; + idIndex_ = idIndex; +} + +void GatherAgentLayer::forward(PassType passType) { + Layer::forward(passType); + forwardIds(passType); + forwardValue(passType); +} + +void GatherAgentLayer::forwardValue(PassType passType) { + MatrixPtr valueReal = realLayers_[0]->getOutputValue(); + if (!valueReal) return; + + int height = allIds_->getSize(); + int width = this->getSize(); + resetOutput(height, width); + idsVec_.resize(idIndex_.size()); + + const MatrixPtr& outV = getOutputValue(); + + for (size_t i = 0; i < realLayers_.size(); ++i) { + const MatrixPtr& realV = realLayers_[i]->getOutputValue(); + idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], + /* size */ realV->getHeight(), + useGpu_); + realV->addToRows(*outV, *idsVec_[i]); + } +} + +namespace { + +// dest[index[i]] <- src[i] for each i +void copyElements(const IVector& srcVec, + const IVector& indexVec, + IVector& destVec) { + const int* src = srcVec.getData(); + const int* index = indexVec.getData(); + int* dest = destVec.getData(); + int len = indexVec.getSize(); + CHECK_EQ(srcVec.getSize(), indexVec.getSize()); + for (int i = 0; i < len; ++i) { + dest[index[i]] = src[i]; + } +} +} // namespace + +void GatherAgentLayer::forwardIds(PassType passType) { + IVectorPtr realId = realLayers_[0]->getOutputLabel(); + if (!realId) return; + + IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_); + IVectorPtr outId = output_.ids; + idsVec_.resize(idIndex_.size()); + + for (size_t i = 0; i < realLayers_.size(); ++i) { + const IVectorPtr& realId = realLayers_[i]->getOutputLabel(); + idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], + /* size */ realId->getSize(), + useGpu_); + execViaCpu(©Elements, *realId, *idsVec_[i], *outId); + } +} + +void GatherAgentLayer::backward(const UpdateCallback& 
callback) { + (void)callback; + const MatrixPtr& outputGrad = getOutputGrad(); + + for (size_t i = 0; i < realLayers_.size(); ++i) { + const MatrixPtr& realG = realLayers_[i]->getOutputGrad(); + if (realG) { + realG->selectRows(*outputGrad, *idsVec_[i]); + } + } +} + +bool ScatterAgentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CHECK_EQ(config_.inputs_size(), 0); + if (!Layer::init(layerMap, parameterMap)) { + return false; + } + setNeedGradient(true); + return true; +} + +void ScatterAgentLayer::forward(PassType passType) { + Layer::forward(passType); + CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); + + int width = this->getSize(); + if (selectionMode_) { + forwardWithSelection(passType); + } else { + if (realOutArg_.hasSeq()) { + output_.subArgFrom(realOutArg_, + /* offset */ idIndex_, + idSize_, + width, + useGpu_, + /* trans */ false, + /* seqFlag */ true, + /* seqStart */ seqStartPosIndex_, + /* seqSize */ numSequences_); + } else { + output_.subArgFrom( + realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); + } + } +} + +void ScatterAgentLayer::backward(const UpdateCallback& callback) { + (void)callback; + + CHECK(!selectionMode_); + + const MatrixPtr& outputGrad = realOutArg_.grad; + const MatrixPtr& realGrad = realLayer_->getOutputGrad(); + if (realGrad) { + // for agent in inFrameLines and memoryFrameLines, + // only first scatterAgentLayer should do addToRows in backward + if (handleBackward_) { + outputGrad->addToRows(*realGrad, *ids_); + } + } +} + +REGISTER_LAYER(gather_agent, GatherAgentLayer); +REGISTER_LAYER(scatter_agent, ScatterAgentLayer); + +void ScatterAgentLayer::forwardWithSelection(PassType passType) { + Layer::forward(passType); + CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); + + const Argument& input = realLayer_->getOutput(); + CHECK_EQ(realLayer_->getSize(), this->getSize()); + int width = this->getSize(); + + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str()); + + if (!input.hasSeq()) { + if (realLayer_->getOutput().ids) { + IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); + output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); + } + if (realLayer_->getOutput().value) { + int height = ids_->getSize(); + resetOutput(height, width); + + const MatrixPtr& outV = getOutputValue(); + const MatrixPtr& realV = realLayer_->getOutputValue(); + outV->selectRows(*realV, *ids_); + } + } else { + // Putting the generation logic here is really an ugly hack! + // used in generation + int height = 0; + size_t numSequences = ids_->getSize(); + const int* starts = input.getCpuStartPositions(); + size_t size = input.hasSubseq() ? 
input.getNumSubSequences() + : input.getNumSequences(); + const int* cpuIds = cpuIds_->getData(); + + for (size_t i = 0; i < numSequences; ++i) { + size_t seqId = cpuIds[i]; + CHECK_LT(seqId, size); + height += starts[seqId + 1] - starts[seqId]; + } + reserveOutput(height, width); + + const MatrixPtr& outputValue = getOutputValue(); + + CHECK_NE(input.sequenceStartPositions.get(), + output_.sequenceStartPositions.get()); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences + 1, false); + int* outStarts = output_.sequenceStartPositions->getMutableData(false); + + ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false); + int* inStarts = inputStartPos_->getMutableData(false); + + size_t offsetOut = 0; + for (size_t i = 0; i < numSequences; ++i) { + outStarts[i] = offsetOut; + size_t seqId = cpuIds[i]; + int size = starts[seqId + 1] - starts[seqId]; + for (int j = 0; j < size; j++) { + inStarts[offsetOut + j] = starts[seqId] + j; + } + offsetOut += size; + } + outStarts[numSequences] = offsetOut; + + outputValue->copyByRowIndex(*input.value, + *inputStartPos_->getVector(useGpu_)); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/AgentLayer.h b/paddle/legacy/gserver/layers/AgentLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..a05eac5e704466df02a74ce6e5364ab6f03f7446 --- /dev/null +++ b/paddle/legacy/gserver/layers/AgentLayer.h @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * AgentLayer use as a virtual input of another layer in config, + * before execute forward/backward, setRealLayer() should be + * called to set one and only one real layer + */ +class AgentLayer : public Layer { + protected: + LayerPtr realLayer_; + int numSamples_; + + public: + explicit AgentLayer(const LayerConfig& config) : Layer(config) {} + + ~AgentLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + // if *numSamples* set, + // real layer output will only use first *numSamples* rows + void setRealLayer(LayerPtr layer, int numSamples = 0) { + realLayer_ = layer; + numSamples_ = numSamples; + } + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override {} +}; + +/** + * Like AgentLayer, but it can gather many real layers. Each real + * layer give a few rows of a sequence, after gather all real layers, + * GatherAgentLayer collect a complete sequence. 
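+ *
+ * Sketch of the value path (see GatherAgentLayer::forwardValue): for each
+ * real layer i, its output rows are written into this layer's output at the
+ * row indices allIds_[idIndex_[i]], allIds_[idIndex_[i] + 1], ... via
+ * Matrix::addToRows, stitching the per-step pieces back into one batch.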
+ */ +class GatherAgentLayer : public Layer { + protected: + std::vector realLayers_; + std::vector idsVec_; + // we don't clear idsVec_ vector to aviod IVector alloc/free + IVectorPtr allIds_; + std::vector idIndex_; + + public: + explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~GatherAgentLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + // call before addRealLayer + void clearRealLayers() { realLayers_.clear(); } + + void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions, + ICpuGpuVectorPtr subSequenceStartPositions, + const IVectorPtr& allIds, + const std::vector& idIndex); + + // add one real layer, can call many times + void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); } + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + void forwardValue(PassType passType); + void forwardIds(PassType passType); +}; + +/** + * Like AgentLayer, but only select a few rows in real layer. + * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput() + * are the selected row ids. It's used to scatter one layer's output + * to many small submodels. ScatterAgentLayer can support ids real layer, + * if it is, the agent will select a few ids in real layer. + */ +class ScatterAgentLayer : public Layer { + protected: + LayerPtr realLayer_; + IVectorPtr ids_; + IVectorPtr cpuIds_; + Argument realOutArg_; + int idIndex_; + int idSize_; + int seqStartPosIndex_; + int numSequences_; // number of sequences in this scatterAgentLayer + bool handleBackward_; + + // use to store expanded cpuStartPositions or subSequenceStartPositions + // of real layer. + ICpuGpuVectorPtr inputStartPos_; + + // true for setRealLayer, false for setRealLayerAndOutput + bool selectionMode_; + + public: + explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~ScatterAgentLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + /** + * @brief set real layer in generation + * + * @param layer[input] realLayer + * @param ids[input] row id in real layer + * @param copyId[input] whether to copy a cpu version of ids, + * false(default) in ScatterAgentLayer, and + * true in SequenceScatterAgentLayer. 
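+   *
+   * Typical call site (as used in RecurrentGradientMachine::generateSequence):
+   * @code
+   *   auto scatterAgent =
+   *       dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
+   *   scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
+   * @endcode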
+ */ + void setRealLayer(LayerPtr layer, const std::vector& ids) { + realLayer_ = layer; + IVector::resizeOrCreate(ids_, ids.size(), useGpu_); + ids_->copyFrom(ids.data(), ids.size()); + if (useGpu_) { + IVector::resizeOrCreate(cpuIds_, ids.size(), false); + cpuIds_->copyFrom(ids.data(), ids.size()); + } else { + cpuIds_ = ids_; + } + selectionMode_ = true; + } + + // set real layer and output, [idIndex, idIndex + idSize) of *ids* + // are selected row for realOutArg in realLayer + void setRealLayerAndOutput(LayerPtr layer, + const Argument& outArg, + const IVectorPtr& ids, + int idIndex, + int idSize, + bool handleBackward) { + realLayer_ = layer; + realOutArg_ = outArg; + ids_ = ids; + idIndex_ = idIndex; + idSize_ = idSize; + handleBackward_ = handleBackward; + selectionMode_ = false; + } + + void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, + int seqStartPosIndex, + int numSequences) { + realOutArg_.sequenceStartPositions = sequenceStartPositions; + seqStartPosIndex_ = seqStartPosIndex; + numSequences_ = numSequences; + } + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + + void forwardWithSelection(PassType passType); +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/AverageLayer.cpp b/paddle/legacy/gserver/layers/AverageLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0539da793712527c72792603ae28a1d0aa903bcc --- /dev/null +++ b/paddle/legacy/gserver/layers/AverageLayer.cpp @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "AverageLayer.h" + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(average, AverageLayer); + +bool AverageLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + SequencePoolLayer::init(layerMap, parameterMap); + + // average strategy + if (config_.average_strategy() == "average") { + mode_ = kAverage; + } else if (config_.average_strategy() == "sum") { + mode_ = kSum; + } else if (config_.average_strategy() == "squarerootn") { + mode_ = kAverageSquareRootN; + } else { + LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy(); + } + return true; +} + +void AverageLayer::forward(PassType passType) { + SequencePoolLayer::forward(passType); + + MatrixPtr inputValue = getInputValue(0); + getOutputValue()->sequenceAvgForward( + *inputValue, *startPositions_->getVector(useGpu_), mode_); + + /* add the bias-vector AFTER average operation */ + if (biases_.get() != NULL) { + MatrixPtr outV = getOutputValue(); + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ { forwardActivation(); } +} + +void AverageLayer::backward(const UpdateCallback& callback) { + SequencePoolLayer::backward(callback); + + if (getInputGrad(0)) { + getInputGrad(0)->sequenceAvgBackward( + *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/AverageLayer.h b/paddle/legacy/gserver/layers/AverageLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..a0d457d35f4bce99860cf45e94525f323f45e286 --- /dev/null +++ b/paddle/legacy/gserver/layers/AverageLayer.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "SequencePoolLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * A layer for "internal average" for sequence input. + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = average_{for each instance in this sequence}{input[i]} + * If stride_ > 0: + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and the average pooling + * operation is then applied to each interval independently. + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: output size is the number of input sub-sequences + * output[i] = average_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. 
+ */ +class AverageLayer : public SequencePoolLayer { + public: + enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 }; + explicit AverageLayer(const LayerConfig& config) + : SequencePoolLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + int mode_; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4dcbd8dc270d5e5329b33b366ac937894833085f --- /dev/null +++ b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "BatchNormBaseLayer.h" +#include "BatchNormalizationLayer.h" +#include "Layer.h" +#include "paddle/legacy/utils/Stat.h" +#ifdef PADDLE_WITH_CUDA +#include "CudnnBatchNormLayer.h" +#endif + +namespace paddle { + +bool BatchNormBaseLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!Layer::init(layerMap, parameterMap)) return false; + + /* initialize the weightList */ + // first is Input in configure + // other two is created in config_parser.py + CHECK_EQ(inputLayers_.size(), 3U); + CHECK_EQ(inputLayers_.size(), parameters_.size()); + CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size())); + const ImageConfig& conf = config_.inputs(0).image_conf(); + channels_ = conf.channels(); + calFeatureMapSize(); + + if (config_.has_use_global_stats()) { + useGlobalStats_ = config_.use_global_stats(); + } + movingAvgFraction_ = config_.moving_average_fraction(); + epsilon_ = config_.epsilon(); + + weight_.reset(new Weight(1, channels_, parameters_[0])); + movingMean_.reset(new Weight(1, channels_, parameters_[1])); + movingVar_.reset(new Weight(1, channels_, parameters_[2])); + + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, channels_, biasParameter_)); + } + + savedMean_ = Matrix::create(1, channels_, false, useGpu_); + savedInvVar_ = Matrix::create(1, channels_, false, useGpu_); + savedMean_->zeroMem(); + savedInvVar_->zeroMem(); + + return true; +} + +void BatchNormBaseLayer::calFeatureMapSize() { + const ImageConfig& conf = config_.inputs(0).image_conf(); + imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); + imageD_ = inputLayers_[0]->getOutput().getFrameDepth(); + + if (0 == imageD_) imageD_ = conf.img_size_z(); + if (imageH_ == 0 && imageW_ == 0) { + imageH_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); + imageW_ = conf.img_size(); + } else { + getOutput().setFrameHeight(imageH_); + getOutput().setFrameWidth(imageW_); + getOutput().setFrameDepth(imageD_); + } + imgPixels_ = imageH_ * imageW_ * imageD_; +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.h b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..8dc1d7883767b4aabc8501531996036c2def9481 --- /dev/null +++ b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief Batch normalization layer use to normalizes the input to across the + * batch. + * + * By default, calculating global mean and variance statistics via a running + * average in the training peroid. Then the pre-calculated global mean and + * variance are used for testing. + * + * Moving mean and variance are located in Parameter object when constructing + * and the calculation will change them. Now we only save global mean and + * variance of one thread in first node for GPU. + * But the calculation in CPU is different, because parameters are shared by + * multiple threads. Here using ShareCpuMatrix with lock to calculate. We + * still save global mean and variance in first node in CPU when multi machine. + * + * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network + * Training by Reducing Internal Covariate Shift." arXiv preprint + * arXiv:1502.03167 (2015). + */ + +class BatchNormBaseLayer : public Layer { + public: + explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {} + + ~BatchNormBaseLayer() {} + + /** + * @brief Create BatchNorm layer by norm_type, including batch_norm and + * cudnn_batch_norm. If do not set norm_type, it will automatically select + * cudnn_batch_norm for GPU and batch_norm for CPU. + */ + static Layer* create(const LayerConfig& config); + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + /** + * @brief Calculate feature map size. Some input uses frameHeight and + * frameWidth to store feature size + */ + void calFeatureMapSize(); + + protected: + /// Batch normalization scale parameter, which is referred to as gamma in + /// in original paper. + std::unique_ptr weight_; + /// Moving average of mean. + std::unique_ptr movingMean_; + /// Moving average of variance. + std::unique_ptr movingVar_; + /// Batch normalization bias parameter, which is referred to as beta in + /// in original paper. + std::unique_ptr biases_; + + /// Save intermediate results computed during the forward pass, + /// these can then be reused to speed up the backward pass. + MatrixPtr savedMean_; + MatrixPtr savedInvVar_; + + /// Height or width of input image feature. + /// Both of them are 1 if the input is fully-connected layer. 
+ int imageD_; + int imageH_; + int imageW_; + /// Height * Width. + int imgPixels_; + /// Feature dimension. If the input layer is conv layer, it is the channels + /// of feature map of the conv layer. If the input layer is fully-connected + /// layer, it is the dimension of fc layer. + int channels_; + // if useGlobalStats_ is true, will use the loaded mean and variance. + // otherwise, calculate mean and variance in this mini-batch. + bool useGlobalStats_; + // use to compute moving mean and variance. + real movingAvgFraction_; + // Epsilon is a small random noise used in batch normalization for stability. + real epsilon_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0297bd44c7b0485f34598f6926e5337da452460d --- /dev/null +++ b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp @@ -0,0 +1,266 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/Stat.h" +#ifdef PADDLE_WITH_CUDA +#include "hl_batch_transpose.h" +#endif +#include "BatchNormalizationLayer.h" + +namespace paddle { + +REGISTER_LAYER(batch_norm, BatchNormalizationLayer); + +bool BatchNormalizationLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false; + + return true; +} + +void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { + int numSamples = mat->getHeight(); + Matrix::resizeOrCreate(tmpMat_, numSamples, channels_, false, useGpu_); + savedMean_->zeroMem(); + savedMean_->accumulateColSum(*mat); + savedMean_->mulScalar(1.0 / numSamples); // E[x] + + tmpMat_->assign(*mat); + tmpMat_->square2(); + savedInvVar_->zeroMem(); + savedInvVar_->accumulateColSum(*tmpMat_); + savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] + savedInvVar_->addSquare(*savedMean_, -1.0); // E[x^2] - E^2[x] + + // Variance may be small negative value + // because of the subtraction operation. + // Here using clipping. 
+ savedInvVar_->downClip(real(0.0)); + + calMovingMeanAndVar(); + + savedInvVar_->subScalar(-epsilon_); + savedInvVar_->sqrt2(*savedInvVar_); +} + +void BatchNormalizationLayer::calMovingMeanAndVar() { + // calculating and saving moving mean and variance + auto& movingMean = movingMean_->getW(); + auto& movingVar = movingVar_->getW(); + // movingMean = movingMean * movingAvgFraction_ + // + savedMean_ * (1 - movingAvgFraction_) + movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_); + // movingVar = movingVar * movingAvgFraction_ + // + savedInvVar_ * (1 - movingAvgFraction_) + movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_); +} + +void BatchNormalizationLayer::setMeanAndStd() { + savedMean_->copyFrom(*(movingMean_->getW())); + savedInvVar_->copyFrom(*(movingVar_->getW())); + savedInvVar_->downClip(real(0.0)); + + savedInvVar_->subScalar(-epsilon_); + savedInvVar_->sqrt2(*savedInvVar_); +} + +void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { + CHECK_EQ(in->getWidth(), static_cast(channels_ * imgPixels_)); + CHECK_EQ(out->getWidth(), static_cast(channels_)); + CHECK(!in->isTransposed()); + CHECK(!out->isTransposed()); + if (imgPixels_ == 1) { + out->assign(*in); + return; + } + size_t batchSize = in->getHeight(); + CHECK_EQ(out->getHeight(), batchSize * imgPixels_); + if (useGpu_) { +#ifndef PADDLE_WITH_CUDA + LOG(FATAL) << "paddle is compiled only for cpu"; +#else + batchTranspose( + in->getData(), out->getData(), imgPixels_, channels_, batchSize); +#endif + } else { + for (size_t i = 0; i < batchSize; i++) { + const MatrixPtr inTmp = + Matrix::create(in->getData() + i * imgPixels_ * channels_, + channels_, + imgPixels_, + false, + useGpu_); + MatrixPtr outTmp = + Matrix::create(out->getData() + i * imgPixels_ * channels_, + imgPixels_, + channels_, + false, + useGpu_); + inTmp->transpose(outTmp, false); + } + } +} + +void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) { + CHECK_EQ(in->getWidth(), static_cast(channels_)); + CHECK_EQ(out->getWidth(), static_cast(channels_ * imgPixels_)); + size_t batchSize = out->getHeight(); + CHECK(!in->isTransposed()); + CHECK(!out->isTransposed()); + if (imgPixels_ == 1) { + out->assign(*in); + return; + } + CHECK_EQ(in->getHeight(), static_cast(batchSize * imgPixels_)); + if (useGpu_) { +#ifndef PADDLE_WITH_CUDA + LOG(FATAL) << "paddle is compiled only for cpu"; +#else + batchTranspose( + in->getData(), out->getData(), channels_, imgPixels_, batchSize); +#endif + } else { + for (size_t i = 0; i < batchSize; i++) { + const MatrixPtr inTmp = + Matrix::create(in->getData() + i * channels_ * imgPixels_, + imgPixels_, + channels_, + false, + useGpu_); + MatrixPtr outTmp = + Matrix::create(out->getData() + i * imgPixels_ * channels_, + channels_, + imgPixels_, + useGpu_); + inTmp->transpose(outTmp, false); + } + } +} + +void BatchNormalizationLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInputValue(0)->getHeight(); + calFeatureMapSize(); + resetOutput(batchSize, getInputValue(0)->getWidth()); + + // for testing in training peroid. 
+ useGlobalStats_ = (passType == PASS_TEST); + if (passType == PASS_TEST && config_.has_use_global_stats()) { + useGlobalStats_ = config_.use_global_stats(); + } + + Matrix::resizeOrCreate( + expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + normIn_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_); + expandMat(getInputValue(0), expandedIn_); + + if (useGlobalStats_) { + if (firstTest_) { + setMeanAndStd(); + firstTest_ = false; + } + } else { + calMeanAndStd(expandedIn_); + firstTest_ = true; + } + + normIn_->assign(*expandedIn_); + normIn_->addBias(*savedMean_, -1); // subtract mean. + normIn_->divRowVector(*savedInvVar_); // divide std. + + expandedOut_->assign(*normIn_); + expandedOut_->mulRowVector(*weight_->getW()); // multiple gamma. + if (biases_) { + expandedOut_->addBias(*(biases_->getW()), 1); // add beta. + } + MatrixPtr out = getOutputValue(); + shrinkMat(expandedOut_, out); + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void BatchNormalizationLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + int batchSize = getInputValue(0)->getHeight(); + + Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_); + Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_); + + Matrix::resizeOrCreate( + expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_); + Matrix::resizeOrCreate( + normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + + expandMat(getOutputGrad(), expandedOutGrad_); + + // compute derivatives. + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*expandedOutGrad_, 1); + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + if (weight_->getWGrad()) { + tmpMat_->dotMul(*expandedOutGrad_, *normIn_); + weight_->getWGrad()->collectBias(*tmpMat_, 1); + } + + // compute input gradients. + normInGrad_->assign(*expandedOutGrad_); + normInGrad_->mulRowVector(*(weight_->getW())); // multiple gamma. 
+ // normInGrad * (x - \mu)/ \sqrt(\delta^2) + tmpMat_->dotMul(*normInGrad_, *normIn_); + stdGrad_->zeroMem(); + stdGrad_->collectBias(*tmpMat_, -1.0 / (batchSize * imgPixels_)); + tmpGrad_->assign(*normIn_); + tmpGrad_->mulRowVector(*stdGrad_); + + meanGrad_->zeroMem(); + meanGrad_->collectBias(*normInGrad_, -1.0 / (batchSize * imgPixels_)); + + expandedInGrad_->zeroMem(); + expandedInGrad_->add(*normInGrad_, *tmpGrad_); + expandedInGrad_->addRowVector(*meanGrad_); + expandedInGrad_->divRowVector(*savedInvVar_); + + shrinkMat(expandedInGrad_, inGrad_); + if (getInputGrad(0)) { + getInputGrad(0)->add(*getInputGrad(0), *inGrad_); + } + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weight_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/legacy/gserver/layers/BatchNormalizationLayer.h similarity index 100% rename from paddle/gserver/layers/BatchNormalizationLayer.h rename to paddle/legacy/gserver/layers/BatchNormalizationLayer.h diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a091f51bc20e219c3111fb07058b5adea5a3fc38 --- /dev/null +++ b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "BilinearInterpLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(bilinear_interp, BilinearInterpLayer); + +size_t BilinearInterpLayer::getSize() { + inImgH_ = inputLayers_[0]->getOutput().getFrameHeight(); + inImgW_ = inputLayers_[0]->getOutput().getFrameWidth(); + + const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf(); + if (inImgH_ == 0) { + inImgH_ = conf.image_conf().img_size_y(); + } + if (inImgW_ == 0) { + inImgW_ = conf.image_conf().img_size(); + } + + outImgH_ = conf.out_size_y(); + outImgW_ = conf.out_size_x(); + numChannels_ = conf.image_conf().channels(); + + CHECK(outImgH_ > 0 && outImgW_ > 0); + CHECK(inImgH_ > 0 && inImgW_ > 0); + CHECK(numChannels_); + + ratioH_ = + (outImgH_ > 1) ? static_cast(inImgH_ - 1) / (outImgH_ - 1) : 0.f; + ratioW_ = + (outImgW_ > 1) ? 
static_cast(inImgW_ - 1) / (outImgW_ - 1) : 0.f; + + getOutput().setFrameHeight(outImgH_); + getOutput().setFrameWidth(outImgW_); + return outImgH_ * outImgW_ * numChannels_; +} + +bool BilinearInterpLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(1, config_.inputs_size()); + + return true; +} + +void BilinearInterpLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t batchSize = getInput(0).getBatchSize(); + size_t size = getSize(); + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, size); + } + + MatrixPtr inV = getInputValue(0); + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str()); + outV->bilinearForward(*inV, + inImgH_, + inImgW_, + outImgH_, + outImgW_, + numChannels_, + ratioH_, + ratioW_); + } +} + +void BilinearInterpLayer::backward(const UpdateCallback& callback) { + (void)callback; + + MatrixPtr inputG = getInputGrad(0); + MatrixPtr outG = getOutputGrad(); + { + REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str()); + if (inputG) { + inputG->bilinearBackward(*outG, + outImgH_, + outImgW_, + inImgH_, + inImgW_, + numChannels_, + ratioH_, + ratioW_); + } + } +} +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.h b/paddle/legacy/gserver/layers/BilinearInterpLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..c585a5ed10d9c8f241b5a5ff3a671752fda6d432 --- /dev/null +++ b/paddle/legacy/gserver/layers/BilinearInterpLayer.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A layer for bilinear interpolation which is + * used on conv layer output. + * + * @note The config file api is bilinear_interp_layer. + */ +class BilinearInterpLayer : public Layer { + protected: + size_t outImgH_, outImgW_; + size_t inImgH_, inImgW_; + real ratioH_, ratioW_; + size_t numChannels_; + + public: + explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~BilinearInterpLayer() {} + + size_t getSize(); + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.cpp b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..24b5af67d40958c940eb0864994e7e81464f6c70 --- /dev/null +++ b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "BlockExpandLayer.h" + +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +REGISTER_LAYER(blockexpand, BlockExpandLayer); + +bool BlockExpandLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(config_.inputs_size(), 1); + const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf(); + blockH_ = blockConf.block_y(); + blockW_ = blockConf.block_x(); + strideH_ = blockConf.stride_y(); + strideW_ = blockConf.stride_x(); + paddingH_ = blockConf.padding_y(); + paddingW_ = blockConf.padding_x(); + channels_ = blockConf.channels(); + imgSizeH_ = blockConf.img_size_y(); + imgSizeW_ = blockConf.img_size_x(); + + std::vector strides = {(size_t)strideH_, (size_t)strideW_}; + std::vector paddings = {(size_t)paddingH_, (size_t)paddingW_}; + std::vector blocks = {(size_t)blockH_, (size_t)blockW_}; + createFunction(forward_, + "BlockExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + createFunction(backward_, + "BlockExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + return true; +} + +size_t BlockExpandLayer::getBlockNum() { + CHECK_EQ(inputLayers_.size(), 1UL); + const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf(); + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = blockConf.img_size_y(); + } + if (imgSizeW_ == 0) { + imgSizeW_ = blockConf.img_size_x(); + } + size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_; + outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_; + size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_; + outputW_ = (int)tmpW < 0 ? 
1 : 1 + (tmpW + strideW_ - 1) / strideW_; + + return outputH_ * outputW_; +} + +void BlockExpandLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + size_t blockNum = getBlockNum(); + size_t blockSize = blockH_ * blockW_ * channels_; + resetOutput(blockNum * batchSize, blockSize); + + // calculate output_.value + inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + outputShape_ = TensorShape({batchSize, blockNum, blockSize}); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inputShape_); + outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + + // calculate output_.sequenceStartPositions and output_.cpuSequenceDims + Argument& out = getOutput(); + ICpuGpuVector::resizeOrCreate( + out.sequenceStartPositions, batchSize + 1, false); + IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); + int* start = out.sequenceStartPositions->getMutableData(false); + int* dims = out.cpuSequenceDims->getData(); + for (size_t i = 0; i < batchSize; i++) { + start[i] = i * blockNum; + dims[2 * i] = outputH_; + dims[2 * i + 1] = outputW_; + } + start[batchSize] = batchSize * blockNum; +} + +void BlockExpandLayer::backward(const UpdateCallback& callback) { + /* Calculate the input layers error */ + if (getInputGrad(0)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_); + outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); + backward_[0]->calc(inputs, outputs); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.h b/paddle/legacy/gserver/layers/BlockExpandLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..8b90249bfb0958f0081e7c668cd3b38a53c39951 --- /dev/null +++ b/paddle/legacy/gserver/layers/BlockExpandLayer.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Expand feature map to minibatch matrix. + * - matrix width is: blockH_ * blockW_ * channels_ + * - matirx height is: outputH_ * outputW_ + * + * \f[ + * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) / + * strideH\_ \\ + * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) / + * strideW\_ + * \f] + * + * The expand method is the same with ExpandConvLayer, but saved the transposed + * value. After expanding, output_.sequenceStartPositions will store timeline. + * The number of time steps are outputH_ * outputW_ and the dimension of each + * time step is blockH_ * blockW_ * channels_. This layer can be used after + * convolution neural network, and before recurrent neural network. + * + * The config file api is block_expand_layer. 
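+ *
+ * A worked instance of the formula above (numbers are illustrative only):
+ * with imgSizeH_ = 10, blockH_ = 3, strideH_ = 2 and paddingH_ = 1,
+ *   outputH_ = 1 + (2 * 1 + 10 - 3 + 2 - 1) / 2 = 1 + 10 / 2 = 6,
+ * so 6 block positions are produced along the height (the width is
+ * analogous), matching the computation in getBlockNum().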
+ */ +class BlockExpandLayer : public Layer { + protected: + /** + * @brief Calculate outputH_ and outputW_ and return block number which + * actually is time steps. + * @return time steps, outoutH_ * outputW_. + */ + size_t getBlockNum(); + size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; + size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; + + TensorShape inputShape_; + TensorShape outputShape_; + + public: + explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {} + + ~BlockExpandLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp similarity index 100% rename from paddle/gserver/layers/CRFDecodingLayer.cpp rename to paddle/legacy/gserver/layers/CRFDecodingLayer.cpp diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/legacy/gserver/layers/CRFDecodingLayer.h similarity index 100% rename from paddle/gserver/layers/CRFDecodingLayer.h rename to paddle/legacy/gserver/layers/CRFDecodingLayer.h diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/legacy/gserver/layers/CRFLayer.cpp similarity index 100% rename from paddle/gserver/layers/CRFLayer.cpp rename to paddle/legacy/gserver/layers/CRFLayer.cpp diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/legacy/gserver/layers/CRFLayer.h similarity index 100% rename from paddle/gserver/layers/CRFLayer.h rename to paddle/legacy/gserver/layers/CRFLayer.h diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/legacy/gserver/layers/CTCLayer.cpp similarity index 100% rename from paddle/gserver/layers/CTCLayer.cpp rename to paddle/legacy/gserver/layers/CTCLayer.cpp diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/legacy/gserver/layers/CTCLayer.h similarity index 100% rename from paddle/gserver/layers/CTCLayer.h rename to paddle/legacy/gserver/layers/CTCLayer.h diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/legacy/gserver/layers/ClipLayer.cpp similarity index 100% rename from paddle/gserver/layers/ClipLayer.cpp rename to paddle/legacy/gserver/layers/ClipLayer.cpp diff --git a/paddle/legacy/gserver/layers/ConcatenateLayer.cpp b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce3f2ca950bf87e287163f1cfc8b15d815a68cf4 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp @@ -0,0 +1,208 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "Projection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A concatenate layer has multiple input layers. It concatenates rows of + * each input as one row for the output of this layer and apply activation. 
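+ *
+ * For example (sizes are illustrative only): given two inputs of widths
+ * 3 and 5, row i of the output is [input0 row i, input1 row i], and the
+ * size configured for this layer must be 3 + 5 = 8 (forward() checks that
+ * the accumulated offset equals getSize()).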
+ */ +class ConcatenateLayer : public Layer { + public: + explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {} + + ~ConcatenateLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(concat, ConcatenateLayer); + +bool ConcatenateLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!Layer::init(layerMap, parameterMap)) return false; + + CHECK(!biasParameter_); + + return true; +} + +void ConcatenateLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + reserveOutput(batchSize, size); + + const MatrixPtr& out = getOutputValue(); + int offset = 0; + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + const MatrixPtr& in = getInputValue(i); + size_t inSize = in->getWidth(); + out->assignAtOffset(*in, offset); + offset += inSize; + } + CHECK_EQ(size, offset); + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void ConcatenateLayer::backward(const UpdateCallback& callback) { + (void)callback; + + /* Do activation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + const MatrixPtr& out = getOutputGrad(); + int offset = 0; + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + const MatrixPtr& in = getInputGrad(i); + size_t inSize = getInputValue(i)->getWidth(); + if (in) { + in->addAtOffset(*out, offset); + } + offset += inSize; + } +} + +/** + * concat2 layer is like concat layer, but each input layer was + * processed by a Projection. 
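+ *
+ * For example (sizes are illustrative only): with two projections whose
+ * output sizes are 4 and 6, projCol_ stores the column ranges [0, 4) and
+ * [4, 10), and getSize() must equal 10 (init() checks this with CHECK_EQ).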
+ */ +class ConcatenateLayer2 : public Layer { + public: + explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {} + + ~ConcatenateLayer2() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + std::vector> projections_; + std::vector projOutput_; + std::vector> projCol_; + bool sharedBias_; + std::unique_ptr biases_; +}; + +REGISTER_LAYER(concat2, ConcatenateLayer2); + +bool ConcatenateLayer2::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!Layer::init(layerMap, parameterMap)) return false; + + CHECK_EQ(inputLayers_.size(), parameters_.size()); + projections_.reserve(inputLayers_.size()); + projCol_.reserve(inputLayers_.size()); + projOutput_.resize(inputLayers_.size()); + + size_t startCol = 0; + size_t endCol = 0; + for (size_t i = 0; i < inputLayers_.size(); i++) { + projections_.emplace_back(Projection::create( + config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); + + endCol += projections_[i]->getOutputSize(); + projCol_.push_back(std::make_pair(startCol, endCol)); + startCol = endCol; + } + CHECK_EQ(getSize(), endCol); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); + } + + return true; +} + +void ConcatenateLayer2::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + resetOutput(batchSize, size); + + for (size_t i = 0; i < projections_.size(); i++) { + size_t startCol = projCol_[i].first; + size_t endCol = projCol_[i].second; + projOutput_[i].value = output_.value->subColMatrix(startCol, endCol); + if (output_.grad) { + projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); + } + } + + { + AsyncGpuBlock block; + for (size_t i = 0; i != inputLayers_.size(); ++i) { + projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + } + } + + /* add the bias-vector */ + if (biases_) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + output_.value->addBias(*(biases_->getW()), 1, sharedBias_); + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void ConcatenateLayer2::backward(const UpdateCallback& callback) { + /* Do activation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + AsyncGpuBlock block; + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); + biases_->getParameterPtr()->incUpdate(callback); + } + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + if (projections_[i]) { + projections_[i]->backward(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ContextProjection.cpp b/paddle/legacy/gserver/layers/ContextProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8bcf32663eb381a7d7700270efcaa08f9ff86356 --- /dev/null +++ b/paddle/legacy/gserver/layers/ContextProjection.cpp @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ContextProjection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_PROJECTION(context, ContextProjection); + +ContextProjection::ContextProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(config.has_context_start()); + CHECK(config.has_context_length()); + if (config.context_start() == 0 && config.context_length() == 1) { + config_.set_trainable_padding(false); + } + if (config_.trainable_padding()) { + CHECK(parameter); + beginPad_ = std::max(0, -config.context_start()); + endPad_ = std::max(0, config.context_start() + config.context_length() - 1); + size_t totalPad = beginPad_ + endPad_; + size_t inputDim = parameter->getSize() / totalPad; + CHECK_EQ(config.input_size(), inputDim); + CHECK_EQ(inputDim * totalPad, parameter->getSize()); + weight_.reset(new Weight(totalPad, inputDim, parameter)); + } + // init forward_ and backward_ functions + init(); +} + +bool ContextProjection::init() { + size_t context_length = config_.context_length(); + int context_start = config_.context_start(); + bool is_padding = config_.trainable_padding(); + size_t total_pad = is_padding ? beginPad_ + endPad_ : 0; + + createFunction(forward_, + "ContextProjectionForward", + FuncConfig() + .set("context_length", context_length) + .set("context_start", context_start) + .set("begin_pad", beginPad_)); + createFunction(backward_, + "ContextProjectionBackward", + FuncConfig() + .set("context_length", context_length) + .set("context_start", context_start) + .set("begin_pad", beginPad_) + .set("is_padding", is_padding) + .set("total_pad", total_pad)); + + return true; +} + +void ContextProjection::resetState() { + CHECK_LE(config_.context_start() + config_.context_length(), 1) + << "state is not allowed for future context"; + if (config_.context_start() >= 0) return; + Matrix::resizeOrCreate(state_, + -config_.context_start(), + config_.input_size(), + false, // trans + useGpu_); + Matrix::resizeOrCreate(state2_, + -config_.context_start(), + config_.input_size(), + false, // trans + useGpu_); + if (config_.trainable_padding()) { + state_->assign(*weight_->getW()->subMatrix(0, -config_.context_start())); + } else { + state_->zeroMem(); + } +} + +void ContextProjection::setState(LayerStatePtr state) { + CHECK(state->value.size() == 1) + << "one matrix is expected for ContextProjection state"; + state_->copyFrom(*(state->value[0])); +} + +LayerStatePtr ContextProjection::getState() { + if (state_ == nullptr) { + return nullptr; + } + LayerStatePtr res = std::make_shared(); + res->value.push_back(state_->clone(0, 0, false)); + res->value[0]->copyFrom(*state_); + return res; +} + +void ContextProjection::forward() { + CHECK(in_->value && out_->value); + CHECK(in_->sequenceStartPositions); + + size_t input_dim = in_->value->getWidth(); + size_t dim = out_->value->getWidth(); + CHECK_EQ(dim, input_dim * config_.context_length()); + // size_t batch_size = 
in_->value->getHeight(); + CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here"; + + REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str()); + bool is_padding = config_.trainable_padding(); + /// first use state_, otherwise use weight_(padding false === w nullptr) + auto w_ptr = + state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr; + const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*in_->value, *start_pos); + if (w_ptr) { + inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim), + *start_pos); + } + outputs.addArg(*out_->value, *start_pos, ADD_TO); + forward_[0]->calc(inputs, outputs); + + if (state_ && config_.context_start() < 0) { + CHECK_EQ(1, in_->getNumSequences()); + const int* starts = in_->sequenceStartPositions->getData(false); + int length = starts[1] - starts[0]; + if (-config_.context_start() <= length) { + MatrixPtr sub = in_->value->subMatrix(starts[1] + config_.context_start(), + -config_.context_start()); + state_->copyFrom(*sub); + } else { + int prevLength = -config_.context_start() - length; + state2_->subMatrix(0, prevLength) + ->copyFrom(*state_->subMatrix(length, prevLength)); + state2_->subMatrix(prevLength, length) + ->copyFrom(*in_->value->subMatrix(starts[0], length)); + std::swap(state_, state2_); + } + } +} + +void ContextProjection::backward(const UpdateCallback& callback) { + CHECK(in_->value && out_->value && out_->grad); + size_t input_dim = in_->value->getWidth(); + size_t dim = out_->value->getWidth(); + CHECK_EQ(dim, input_dim * config_.context_length()); + size_t batch_size = in_->value->getHeight(); + CHECK_EQ(batch_size, out_->value->getHeight()); + CHECK_EQ(static_cast(backward_.size()), 1) + << "Only one backward function here"; + + REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str()); + bool is_padding = config_.trainable_padding(); + auto start_pos = in_->sequenceStartPositions; + auto w_ptr = is_padding ? weight_->getWGrad() : nullptr; + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_)); + outputs.addArg( + CpuMatrix( + in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim), + *in_->sequenceStartPositions->getVector(useGpu_), + ADD_TO); + outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, + w_ptr ? w_ptr->getHeight() : 0, + input_dim), + ADD_TO); + backward_[0]->calc(inputs, outputs); + + if (config_.trainable_padding()) { + weight_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/legacy/gserver/layers/ContextProjection.h similarity index 100% rename from paddle/gserver/layers/ContextProjection.h rename to paddle/legacy/gserver/layers/ContextProjection.h diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.cpp b/paddle/legacy/gserver/layers/Conv3DLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d072a74234b43e06c1194acc2ec2b3f961b4a97e --- /dev/null +++ b/paddle/legacy/gserver/layers/Conv3DLayer.cpp @@ -0,0 +1,253 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Conv3DLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(conv3d, Conv3DLayer); + +bool Conv3DLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; + int index = 0; + for (auto &inputConfig : config_.inputs()) { + const ConvConfig &conf = inputConfig.conv_conf(); + M_.push_back(numFilters_ / conf.groups()); + K_.push_back(filterPixels_[index] * filterChannels_[index]); + + // create a new weight + size_t height, width; + width = filterPixels_[index] * filterChannels_[index]; + height = numFilters_; + CHECK_EQ(parameters_[index]->getSize(), width * height); + Weight *w = new Weight(height, width, parameters_[index]); + weights_.emplace_back(w); + ++index; + } + if (biasParameter_.get()) { + if (sharedBiases_) { + CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); + biases_ = + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); + } else { + biases_ = + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); + } + } + return true; +} + +size_t Conv3DLayer::getSize() { + CHECK_NE(inputLayers_.size(), 0UL); + outputH_.clear(); + outputW_.clear(); + outputD_.clear(); + N_.clear(); + size_t layerSize = 0; + for (size_t i = 0; i < inputLayers_.size(); ++i) { + outputW_.push_back(outputSize( + imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true)); + outputH_.push_back(outputSize( + imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true)); + outputD_.push_back(outputSize( + imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true)); + + N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]); + CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize); + layerSize += N_[i] * numFilters_; + } + getOutput().setFrameHeight(outputH_[0]); + getOutput().setFrameWidth(outputW_[0]); + getOutput().setFrameDepth(outputD_[0]); + return layerSize; +} + +void Conv3DLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + int outWidth = getSize(); + resetOutput(batchSize, outWidth); + + REGISTER_TIMER_INFO("FwdConv3D", getName().c_str()); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + const MatrixPtr &inMat = getInputValue(i); + const MatrixPtr &outMat = getOutputValue(); + int M = M_[i]; + int N = N_[i]; + int K = K_[i]; + Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); + MatrixPtr wMat = weights_[i]->getW(); + for (int n = 0; n < batchSize; ++n) { + colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(), + channels_[i], + imgSizeD_[i], + imgSizeH_[i], + imgSizeW_[i], + filterSizeZ_[i], + filterSizeY_[i], + filterSize_[i], + strideZ_[i], + strideY_[i], + stride_[i], + paddingZ_[i], + paddingY_[i], + padding_[i]); + + real *outData = outMat->getData() + n * outMat->getStride(); + MatrixPtr outMatSub = + Matrix::create(outData, groups_[i] * M, N, false, useGpu_); + for (int g = 0; g < groups_[i]; g++) { + MatrixPtr wMatSub = wMat->subMatrix(g * M, M); + MatrixPtr in = colBuf_->subMatrix(g * 
K, K); + MatrixPtr out = outMatSub->subMatrix(g * M, M); + out->mul(*wMatSub, *in, 1.0, 1.0); + } + } + } + if (nullptr != this->biasParameter_) { + this->addBias(); + } + forwardActivation(); +} + +void Conv3DLayer::backward(const UpdateCallback &callback) { + backwardActivation(); + + if (biases_ && biases_->getWGrad()) { + bpropBiases(); + biases_->getParameterPtr()->incUpdate(callback); + } + + REGISTER_TIMER_INFO("BwdConv3D", getName().c_str()); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + if (weights_[i]->getWGrad()) { + bpropWeights(i); + } + if (getInputGrad(i)) { + bpropData(i); + } + weights_[i]->getParameterPtr()->incUpdate(callback); + } +} + +void Conv3DLayer::bpropWeights(int i) { + int M = M_[i]; + int N = N_[i]; + int K = K_[i]; + const MatrixPtr &inMat = getInputValue(i); + Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); + MatrixPtr wGradMat = weights_[i]->getWGrad(); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + for (int n = 0; n < batchSize; ++n) { + colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(), + channels_[i], + imgSizeD_[i], + imgSizeH_[i], + imgSizeW_[i], + filterSizeZ_[i], + filterSizeY_[i], + filterSize_[i], + strideZ_[i], + strideY_[i], + stride_[i], + paddingZ_[i], + paddingY_[i], + padding_[i]); + + real *outGradData = + getOutputGrad()->getData() + n * getOutputGrad()->getStride(); + MatrixPtr outGradSub = + Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_); + for (int g = 0; g < groups_[i]; ++g) { + MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K); + MatrixPtr outG = outGradSub->subMatrix(g * M, M); + MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M); + wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0); + } + } +} + +void Conv3DLayer::bpropData(int i) { + int M = M_[i]; + int N = N_[i]; + int K = K_[i]; + Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); + MatrixPtr wMat = weights_[i]->getW(); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + for (int n = 0; n < batchSize; ++n) { + real *outGradData = + getOutputGrad()->getData() + n * getOutputGrad()->getStride(); + real *preGradData = + getInputGrad(i)->getData() + n * getInputGrad(i)->getStride(); + MatrixPtr outGradSub = + Matrix::create(outGradData, M * groups_[i], N, false, useGpu_); + for (int g = 0; g < groups_[i]; ++g) { + MatrixPtr wMatSub = wMat->subMatrix(g * M, M); + MatrixPtr outG = outGradSub->subMatrix(g * M, M); + MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K); + inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0); + } + colBuf_->col2Vol(preGradData, + channels_[i], + imgSizeD_[i], + imgSizeH_[i], + imgSizeW_[i], + filterSizeZ_[i], + filterSizeY_[i], + filterSize_[i], + strideZ_[i], + strideY_[i], + stride_[i], + paddingZ_[i], + paddingY_[i], + padding_[i], + 1.0, + 1.0); + } +} + +void Conv3DLayer::bpropBiases() { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); + MatrixPtr outGradMat = getOutputGrad(); + + if (this->sharedBiases_) { + biases->collectSharedBias(*outGradMat, 1.0f); + } else { + biases->collectBias(*outGradMat, 1.0f); + } +} + +void Conv3DLayer::addBias() { + MatrixPtr outMat = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); + if (this->sharedBiases_) { + outMat->addSharedBias(*(bias), 1.0f); + } else { + outMat->addBias(*(bias), 1.0f); + } +} + +} // 
namespace paddle diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.h b/paddle/legacy/gserver/layers/Conv3DLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..cb42a2f36d31365b473d7f593fd27dc063c83c47 --- /dev/null +++ b/paddle/legacy/gserver/layers/Conv3DLayer.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "ConvBaseLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A subclass of convolution layer. + * This layer expands input and use matrix multiplication to + * calculate convolution operation. + */ +class Conv3DLayer : public ConvBaseLayer { + public: + explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {} + ~Conv3DLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void addBias(); + void backward(const UpdateCallback& callback); + void bpropBiases(); + void bpropData(int i); + void bpropWeights(int i); + size_t getSize(); + + protected: + // Figure out the dimensions for individual gemms. + IntV M_; /// numFilters_ / filter_group_; + IntV N_; /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_ + IntV K_; /// outputD_ * outputH_ * outputW_ + MatrixPtr colBuf_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..76120915e48661a9b14fb6b9bb99e9ec9dd71e4b --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/Logging.h" +namespace paddle { + +bool ConvBaseLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") + ? 
false + : true; + + /* Initialize the convolutional layer parameter */ + numFilters_ = config_.num_filters(); + sharedBiases_ = config_.shared_biases(); + for (auto& inputConfig : config_.inputs()) { + const ConvConfig& conf = inputConfig.conv_conf(); + padding_.push_back(conf.padding()); + stride_.push_back(conf.stride()); + dilation_.push_back(conf.dilation()); + filterSize_.push_back(conf.filter_size()); + paddingY_.push_back(conf.padding_y()); + strideY_.push_back(conf.stride_y()); + dilationY_.push_back(conf.dilation_y()); + filterSizeY_.push_back(conf.filter_size_y()); + channels_.push_back(conf.channels()); + imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y() + : conf.img_size()); + imgSizeW_.push_back(conf.img_size()); + groups_.push_back(conf.groups()); + filterChannels_.push_back(conf.filter_channels()); + outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x()); + outputW_.push_back(conf.output_x()); + + paddingZ_.push_back(conf.padding_z()); + strideZ_.push_back(conf.stride_z()); + filterSizeZ_.push_back(conf.filter_size_z()); + imgSizeD_.push_back(conf.img_size_z()); + outputD_.push_back(conf.output_z()); + filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() * + filterSizeZ_.back()); + } + + CHECK(inputLayers_.size() == parameters_.size()); + + // create new weights_ in derived class + // create new biases_ in derived class + + // default caffe model + caffeMode_ = true; + + return true; +} + +size_t ConvBaseLayer::calOutputSize() { + auto clearAndReserve = [this](IntV* vec) { + vec->clear(); + vec->reserve(this->inputLayers_.size()); + }; + clearAndReserve(&imgSizeH_); + clearAndReserve(&imgSizeW_); + clearAndReserve(&outputH_); + clearAndReserve(&outputW_); + size_t layerSize = 0; + + auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) { + size_t filterSizeY; + size_t filterSize; + for (size_t i = 0; i < inputLayers_.size(); i++) { + filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1; + filterSize = (filterSize_[i] - 1) * dilation_[i] + 1; + inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); + inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + const ConvConfig& conf = config_.inputs(i).conv_conf(); + if (isDeconv_) { + if (inH[i] == 0) + inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x(); + if (inW[i] == 0) inW[i] = conf.output_x(); + outH.push_back(imageSize( + inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back( + imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_)); + } else { + if (inH[i] == 0) + inH[i] = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); + if (inW[i] == 0) inW[i] = conf.img_size(); + outH.push_back(outputSize( + inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back(outputSize( + inW[i], filterSize, padding_[i], stride_[i], caffeMode_)); + } + CHECK_EQ(outH[i], outH[0]); + CHECK_EQ(outW[i], outW[0]); + } + getOutput().setFrameHeight(outH[0]); + getOutput().setFrameWidth(outW[0]); + layerSize = outH[0] * outW[0] * size_t(numFilters_); + }; + + setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_); + + return layerSize; +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.h b/paddle/legacy/gserver/layers/ConvBaseLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..01e90e999625f986b0f13d2b73a883297c097841 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseLayer.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/MathUtils.h" +namespace paddle { + +/** + * @brief A Base Convolution Layer, which convolves the input image + * with learned filters and (optionally) adds biases. + */ + +class ConvBaseLayer : public Layer { + protected: + typedef std::vector IntV; + + /// True if it's deconv layer, false if it's convolution layer + bool isDeconv_; + + /// The number of filters. + int numFilters_; + /// The x dimension of the padding. + IntV padding_; + /// The y dimension of the padding. + IntV paddingY_; + /// The x dimension of the stride. + IntV stride_; + /// The y dimension of the stride. + IntV strideY_; + /// The x dimension of the dilation. + IntV dilation_; + /// The y dimension of the dilation. + IntV dilationY_; + /// The x dimension of a filter kernel. + IntV filterSize_; + /// The y dimension of a filter kernel. + IntV filterSizeY_; + /// The spatial dimensions of the convolution input. + IntV channels_; + /// The spatial dimensions of input feature map height. + IntV imgSizeH_; + /// The spatial dimensions of input feature map width. + IntV imgSizeW_; + /// filterPixels_ = filterSizeX_ * filterSizeY_. + IntV filterPixels_; + /// filterChannels_ = channels_/groups_. + IntV filterChannels_; + /// The spatial dimensions of output feature map height. + IntV outputH_; + /// The spatial dimensions of output feature map width. + IntV outputW_; + + IntV outputD_; + IntV imgSizeD_; + IntV filterSizeZ_; + IntV strideZ_; + IntV paddingZ_; + + /// Group size, refer to grouped convolution in + /// Alex Krizhevsky's paper: when group=2, the first half of the + /// filters are only connected to the first half of the input channels, + /// and the second half only connected to the second half. + IntV groups_; + /// Whether the bias is shared for feature in each channel. 
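The size bookkeeping in ConvBaseLayer::calOutputSize() above leans on the outputSize()/imageSize() helpers used with MathUtils.h: outputSize() maps an input extent to an output extent (convolution branch) and imageSize() is its inverse (deconvolution branch), with dilation folded into an effective filter size first. As a standalone sketch of the caffe-mode arithmetic only, not part of this patch (the real helpers also take a caffeMode flag for the alternative rounding):

```cpp
#include <cassert>

// Effective kernel extent once dilation is applied, as in calOutputSize().
int dilatedFilter(int filterSize, int dilation) {
  return (filterSize - 1) * dilation + 1;
}

// Caffe-mode convolution output extent: (in + 2*pad - filter) / stride + 1.
int convOutputSize(int inSize, int filterSize, int padding, int stride) {
  return (inSize + 2 * padding - filterSize) / stride + 1;
}

// Inverse mapping used by the deconv branch: the input extent that would
// produce outSize under the same filter/padding/stride.
int convImageSize(int outSize, int filterSize, int padding, int stride) {
  return (outSize - 1) * stride + filterSize - 2 * padding;
}

int main() {
  int filter = dilatedFilter(/*filterSize=*/3, /*dilation=*/2);                  // 5
  int out = convOutputSize(/*inSize=*/32, filter, /*padding=*/2, /*stride=*/1);  // 32
  assert(convImageSize(out, filter, /*padding=*/2, /*stride=*/1) == 32);
  return 0;
}
```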
+ bool sharedBiases_; + + /// shape of weight: (numChannels * filterPixels_, numFilters) + WeightList weights_; + /// If shared_biases is false, the shape of the bias is (numFilters_, 1). + /// If shared_biases is true, the shape of the bias is + /// (numFilters_ * outputX * outputY, 1). + std::unique_ptr<Weight> biases_; + + /// True by default. The only difference is the calculation + /// of output size. + bool caffeMode_; + + public: + explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + /** + * imgSizeH_ and imgSizeW_ will be set according to the previous input layers + * in this function. Then it will calculate outputH_ and outputW_ and set them + * into the output argument. + */ + virtual size_t calOutputSize(); + + Weight& getWeight(int idx) { return *weights_[idx]; } +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8e59b3bfe9d8a9e54e5c11906707d10ec346a4d --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvBaseOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for the two inputs is the same. Each data of the first input + * is convolved with each data of the second input independently. + * + * The config file api is conv_operator.
+ */ + +ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu) + : Operator(config, useGpu) { + CHECK(useGpu); + CHECK_EQ(config_.input_indices_size(), 2L); + + caffeMode_ = true; + getConvParams(); + computeConvSizes(); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + workSpace_ = nullptr; + + isSelectAlgo_ = false; +} + +void ConvBaseOperator::allocConvWorkSpace() { + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_, + /*useDilation*/ false); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + + if (maxWorkSpace > workSpaceInBytes_) { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + } + // total amount of storage needed + workSpace_ = hl_malloc_device(maxWorkSpace); + workSpaceInBytes_ = maxWorkSpace; + } +} + +void ConvBaseOperator::computeConvSizes() { + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::reshapeImageDescriptors() { + hl_tensor_reshape(imageDesc_, + 1, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_tensor_reshape(outputDesc_, + 1, + numFilters_, + outputH_, + outputW_, + numFilters_ * outputH_ * outputW_, + outputH_ * outputW_, + outputW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::getConvParams() { + configNumFilters_ = config_.num_filters(); + const ConvConfig &conf = config_.conv_conf(); + padding_ = conf.padding(); + stride_ = conf.stride(); + filterSize_ = conf.filter_size(); + paddingY_ = conf.padding_y(); + strideY_ = conf.stride_y(); + filterSizeY_ = conf.filter_size_y(); + filterPixels_ = filterSize_ * filterSizeY_; + configChannels_ = conf.channels(); + imgSize_ = conf.img_size(); + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + imgPixels_ = imgSize_ * imgSizeY_; + CHECK_EQ(conf.groups(), 1U); + filterChannels_ = conf.filter_channels(); + outputX_ = conf.output_x(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + outputs_ = outputX_ * outputX_; + + isDeconv_ = (config_.type() == "conv") ? false : true; + if (isDeconv_) { + channels_ = configNumFilters_; + numFilters_ = configChannels_; + } else { + channels_ = configChannels_; + numFilters_ = configNumFilters_; + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.h b/paddle/legacy/gserver/layers/ConvBaseOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac77f2d743abd6f01e8e3f1e2f4e730c0e6fb39 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseOperator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
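allocConvWorkSpace() above sizes a single cuDNN workspace for all three passes by taking the maximum of the per-algorithm byte limits returned by hl_conv_workspace() and reallocating only when that maximum grows. A minimal standalone sketch of that grow-only policy (plain host allocation stands in for hl_malloc_device/hl_free_mem_device here; not part of the patch):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdlib>

struct WorkSpace {
  void* buf = nullptr;
  size_t bytes = 0;

  // Grow-only: keep the current buffer unless a larger one is required.
  void ensure(size_t fwdBytes, size_t bwdDataBytes, size_t bwdFilterBytes) {
    size_t required = std::max({fwdBytes, bwdDataBytes, bwdFilterBytes});
    if (required > bytes) {
      std::free(buf);               // hl_free_mem_device() in the real code
      buf = std::malloc(required);  // hl_malloc_device() in the real code
      bytes = required;
    }
  }
};

int main() {
  WorkSpace ws;
  ws.ensure(1 << 20, 1 << 22, 1 << 21);  // allocates 4 MiB once
  ws.ensure(1 << 10, 1 << 10, 1 << 10);  // smaller request: buffer is reused
  std::free(ws.buf);
  return 0;
}
```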
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "Operator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +class ConvBaseOperator : public Operator { + public: + ConvBaseOperator(const OperatorConfig &config, bool useGpu); + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvBaseOperator() { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + workSpaceInBytes_ = 0; + } + + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); + } + + protected: + /** + * Get convolution parameters from layer config and + * initialize member variables. + */ + void getConvParams(); + + /** + * Allocate Gpu Memory for cudnn convolution algorithms. + */ + void allocConvWorkSpace(); + + /** + * Create cudnn tensor descriptor for convolution operation. + */ + void computeConvSizes(); + + /** + * Reshape cudnn tensor descriptor. + */ + void reshapeImageDescriptors(); + + /** + * Reshape cudnn tensor descriptor. + */ + virtual void reshape(int batchSize) = 0; + + /** + * Check filter size is equal to the size calculated by parameters from + * layer config. + */ + void checkFilterSize(const MatrixPtr &filter) { + CHECK_EQ(static_cast(filter->getWidth()), + filterSize_ * filterSizeY_ * channels_ * numFilters_); + } + + /// Most of member variables are same with CudnnConvLayer. + /// There is no explanation here. + bool isDeconv_; + int imageH_, imageW_, outputH_, outputW_; + hl_tensor_descriptor imageDesc_; + hl_tensor_descriptor outputDesc_; + hl_filter_descriptor filterDesc_; + hl_convolution_descriptor convDesc_; + bool caffeMode_; + int inputOffset_, outputOffset_, weightOffset_; + int numFilters_, channels_; + + /// from parsing config + int configNumFilters_, configChannels_; + int padding_, stride_, filterSize_, imgSize_, imgSizeY_; + int paddingY_, strideY_, filterSizeY_; + int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; + + /// Following member variables are same with CudnnConvLayer. + /// There is no explanation here. + int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; + size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; + size_t workSpaceInBytes_; + void *workSpace_; + bool isSelectAlgo_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.cpp b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ff5d3412de1c2940cdd9dcf9397370153c24b0c6 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp @@ -0,0 +1,199 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseProjection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +ThreadLocalD> ConvBaseProjection::convMem_; + +ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config, + ParameterPtr parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(useGpu); // only support GPU + getConvParams(); + initCudnn(); + + size_t height = filterH_ * filterW_ * channels_ / groups_; + size_t width = numFilters_; + weight_.reset(new Weight(height, width, parameter)); + weightOffset_ = height * width / groups_; +} + +void ConvBaseProjection::getConvParams() { + const ConvConfig &conf = config_.conv_conf(); + paddingH_ = conf.padding_y(); + paddingW_ = conf.padding(); + + strideH_ = conf.stride_y(); + strideW_ = conf.stride(); + + dilationH_ = conf.dilation_y(); + dilationW_ = conf.dilation(); + CHECK_GT(dilationH_, 0); + CHECK_GT(dilationW_, 0); + + filterH_ = conf.filter_size_y(); + filterW_ = conf.filter_size(); + + configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + configImgW_ = conf.img_size(); + + configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + configOutW_ = conf.output_x(); + + configChannels_ = conf.channels(); + configNumFilters_ = config_.num_filters(); + + isDeconv_ = (config_.type() == "conv") ? false : true; + + channels_ = (isDeconv_) ? configNumFilters_ : configChannels_; + numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_; + + groups_ = conf.groups(); + CHECK_EQ(channels_ % groups_, 0); + CHECK_EQ(numFilters_ % groups_, 0); +} + +void ConvBaseProjection::initCudnn() { + hl_create_filter_descriptor(&filterDesc_, + channels_ / groups_, + numFilters_ / groups_, + filterH_, + filterW_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_, + dilationH_, + dilationW_); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; +} + +void ConvBaseProjection::reshapeTensorDesc(int batchSize) { + // The stride between two consecutive samples in the output of ConvProjection + // may not be numFilters_ * outputH_ * outputW_ (conv) or + // channels_ * imageH_ * imageW_ (deconv) + // for example, in the case of layer ConcatenateLayer2 with two + // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. + // So the calculation of nStride is different from CudnnConvLayer. 
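The comment above is the one subtle point in reshapeTensorDesc(): when several projections write into one concatenated output matrix (the ConcatenateLayer2 case it mentions), the distance between a given projection's data in two consecutive samples is the full row stride of the shared output (out_->value->getStride()), not that projection's own width. A small arithmetic sketch of the difference, with hypothetical sizes not taken from this patch:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Two convolution projections writing into one concatenated output row.
  size_t proj0Width = 16 * 8 * 8;  // numFilters_ * outputH_ * outputW_ of projection 0
  size_t proj1Width = 32 * 4 * 4;  // the same product for projection 1
  size_t rowStride = proj0Width + proj1Width;  // out_->value->getStride() in the real code

  // Element j of projection 1 in sample b lives at:
  size_t b = 3, j = 5;
  size_t index = b * rowStride + proj0Width + j;  // NOT b * proj1Width + j
  std::printf("rowStride=%zu index=%zu\n", rowStride, index);
  return 0;
}
```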
+ size_t nStrideImage, nStrideOutput; + if (isDeconv_) { + nStrideImage = out_->value->getStride(); + nStrideOutput = numFilters_ * outputH_ * outputW_; + } else { + nStrideImage = channels_ * imageH_ * imageW_; + nStrideOutput = out_->value->getStride(); + } + + hl_tensor_reshape(imageDesc_, + batchSize, + channels_ / groups_, + imageH_, + imageW_, + nStrideImage, + imageH_ * imageW_, + imageW_, + 1); + + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_ / groups_, + outputH_, + outputW_, + nStrideOutput, + outputH_ * outputW_, + outputW_, + 1); + + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_, + dilationH_, + dilationW_); +} + +void ConvBaseProjection::reshape(int batchSize) { + size_t width = calOutputSize(); + CHECK_EQ(width, out_->value->getWidth()); + CHECK_EQ(calInputSize(), in_->value->getWidth()); + + reshapeTensorDesc(batchSize); + bool useDilation = false; + if (dilationH_ > 1 || dilationW_ > 1) { + useDilation = true; + } + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_, + useDilation); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; +} + +void *ConvBaseProjection::getSpaceBytes(size_t size) { + std::vector &convMem = *convMem_; + if (convMem.empty()) { + int numDevices = hl_get_device_count(); + convMem.resize(numDevices); + } + + int devId = hl_get_device(); + MemoryHandlePtr localMem = convMem[devId]; + if (NULL == localMem || size > localMem->getAllocSize()) { + localMem = std::make_shared(size); + } + return localMem->getBuf(); +} + +ConvBaseProjection::~ConvBaseProjection() { + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.h b/paddle/legacy/gserver/layers/ConvBaseProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..dcf5ce0f48daac396bab0ec7620303f6c1236fc2 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvBaseProjection.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Base class for ConvProjection and ConvTransProjection. + */ +class ConvBaseProjection : public Projection { + public: + /** + * Constructor. 
+ */ + ConvBaseProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + ~ConvBaseProjection(); + + protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + virtual size_t calOutputSize() = 0; + virtual size_t calInputSize() = 0; + + static void* getSpaceBytes(size_t size); + + /// True if it's deconv projection layer, false if it's ConvProjection layer + bool isDeconv_; + /// imageH_ and imageW_ / outputH_ and outputW_ + /// is calculated from the input layer. + int imageH_, imageW_; + int outputH_, outputW_; + /// configImgH_ and configImgW_ / configOutH_ and configOutW_ + /// is obtained from config. + int configImgH_, configImgW_; + int configOutH_, configOutW_; + /// channels_ and numFilters_ are defined in terms of convolution semantics + int channels_, numFilters_; + /// configChannels and configNumFilters_ are obtained from config + /// For Conv they are the same as channels_ and numFilters + /// For ConvTrans they are opposite to channels_ and numFilters + int configChannels_, configNumFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int dilationH_, dilationW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor imageDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn tensor descriptor for filter. + hl_filter_descriptor filterDesc_; + /// Cudnn tensor descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by cudnn + /// api to search the best suited algorithm. + int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the output. + int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. + size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of total work space. + size_t workSpaceInBytes_; + bool bias_; + + std::unique_ptr weight_; + static ThreadLocalD> convMem_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvOperator.cpp b/paddle/legacy/gserver/layers/ConvOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5276b2c3920eee923f13a47d40b4498c6846f94b --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvOperator.cpp @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +REGISTER_OPERATOR(conv, ConvOperator); + +void ConvOperator::reshape(int batchSize) { + imageH_ = ins_[0]->getFrameHeight(); + imageW_ = ins_[0]->getFrameWidth(); + if (imageH_ == 0) imageH_ = imgSizeY_; + if (imageW_ == 0) imageW_ = imgSize_; + outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); + outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); + /// Check that the outputSizes are consistent with config + CHECK_EQ(outputH_, outputY_); + CHECK_EQ(outputW_, outputX_); + out_->setFrameHeight(outputH_); + out_->setFrameWidth(outputW_); + + reshapeImageDescriptors(); + + inputOffset_ = channels_ * imageH_ * imageW_; + outputOffset_ = numFilters_ * outputH_ * outputW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; + + if (!isSelectAlgo_) { + allocConvWorkSpace(); + } + + isSelectAlgo_ = true; +} + +void ConvOperator::forward() { + size_t batchSize = ins_[0]->value->getHeight(); + reshape(batchSize); + CHECK_EQ(ins_[1]->value->getHeight(), batchSize); + checkFilterSize(ins_[1]->value); + Matrix::resizeOrCreate(out_->value, + batchSize, + outputH_ * outputW_ * numFilters_, + false, + useGpu_); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + real *outData = out_->value->getData() + outputOffset_ * batchId; + hl_convolution_forward(imageDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); + } + } +} + +void ConvOperator::backward() { + size_t batchSize = ins_[0]->value->getHeight(); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *outGrad = out_->grad->getData() + outputOffset_ * batchId; + if (ins_[1]->grad) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; + hl_convolution_backward_filter(imageDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = ins_[0]->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + hl_convolution_backward_data(imageDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); + } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvOperator.h b/paddle/legacy/gserver/layers/ConvOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..8f31620111c8ff3818d83145e16012d22b067a12 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvOperator.h @@ -0,0 +1,44 @@ 
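One detail worth calling out in ConvOperator::forward()/backward() above: the convolution kernel is the operator's second data input, so every sample carries its own filter, and the loops advance three raw pointers by fixed per-sample offsets (inputOffset_, weightOffset_, outputOffset_) computed in reshape(). A standalone sketch of that offset arithmetic with illustrative sizes (not from any config in this patch):

```cpp
#include <vector>

int main() {
  const int channels = 3, imageH = 8, imageW = 8;
  const int numFilters = 4, filterH = 3, filterW = 3;
  const int outputH = 6, outputW = 6, batchSize = 2;

  // Per-sample strides, mirroring inputOffset_, weightOffset_ and outputOffset_.
  const int inputOffset = channels * imageH * imageW;
  const int weightOffset = numFilters * channels * filterH * filterW;
  const int outputOffset = numFilters * outputH * outputW;

  std::vector<float> image(batchSize * inputOffset);
  std::vector<float> kernels(batchSize * weightOffset);  // one kernel set per sample
  std::vector<float> output(batchSize * outputOffset);

  for (int b = 0; b < batchSize; ++b) {
    const float* inputData = image.data() + b * inputOffset;
    const float* wgtData = kernels.data() + b * weightOffset;
    float* outData = output.data() + b * outputOffset;
    // hl_convolution_forward(imageDesc, inputData, outputDesc, outData, ...)
    // consumes exactly these three slices in the real code.
    (void)inputData; (void)wgtData; (void)outData;
  }
  return 0;
}
```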
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "ConvBaseOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +class ConvOperator : public ConvBaseOperator { + public: + ConvOperator(const OperatorConfig &config, bool useGpu) + : ConvBaseOperator(config, useGpu) {} + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvOperator() {} + void forward() override; + void backward() override; + void reshape(int batchSize) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvProjection.cpp b/paddle/legacy/gserver/layers/ConvProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b40cdac2587d1fc0fec00801414560d2a27bd34a --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvProjection.cpp @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ConvProjection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_PROJECTION(conv, ConvProjection); + +size_t ConvProjection::calOutputSize() { + imageH_ = in_->getFrameHeight(); + imageW_ = in_->getFrameWidth(); + if (imageH_ == 0) imageH_ = configImgH_; + if (imageW_ == 0) imageW_ = configImgW_; + outputH_ = outputSize(imageH_, + (filterH_ - 1) * dilationH_ + 1, + paddingH_, + strideH_, + /* caffeMode */ true); + outputW_ = outputSize(imageW_, + (filterW_ - 1) * dilationW_ + 1, + paddingW_, + strideW_, + /* caffeMode */ true); + + const_cast(out_)->setFrameHeight(outputH_); + const_cast(out_)->setFrameWidth(outputW_); + + inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_; + outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_; + return outputH_ * outputW_ * configNumFilters_; +} + +size_t ConvProjection::calInputSize() { + return static_cast(configChannels_ * imageH_ * imageW_); +} + +void ConvProjection::forward() { + int batchSize = in_->value->getHeight(); + reshape(batchSize); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); + + real *inputData = in_->value->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + real *outData = out_->value->getData() + g * outputOffset_; + hl_convolution_forward(imageDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace, + fwdLimitBytes_, + fwdAlgo_); + } +} + +void ConvProjection::backward(const UpdateCallback &callback) { + REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + real *outGrad = out_->grad->getData() + g * outputOffset_; + if (weight_->getWGrad()) { + real *inputData = in_->value->getData() + g * inputOffset_; + real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; + hl_convolution_backward_filter(imageDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace, + bwdFilterLimitBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = in_->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + hl_convolution_backward_data(imageDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace, + bwdDataLimitBytes_, + bwdDataAlgo_); + } + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvProjection.h b/paddle/legacy/gserver/layers/ConvProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..890a17e2f8d2d05001f825f374e8ab6420f7b3ea --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvProjection.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
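ConvProjection::forward()/backward() above implement grouped convolution by slicing all three buffers per group: each group covers configChannels_/groups_ input channels and configNumFilters_/groups_ filters, and the loop advances the data, weight, and output pointers by the per-group offsets set up in calOutputSize() and the ConvBaseProjection constructor. A small sketch of that slicing with illustrative sizes (not part of the patch):

```cpp
#include <vector>

int main() {
  const int channels = 8, numFilters = 16, groups = 2;
  const int imageH = 8, imageW = 8, outputH = 6, outputW = 6;
  const int filterH = 3, filterW = 3;

  // Per-group slices, mirroring inputOffset_, outputOffset_ and weightOffset_.
  const int inputOffset = (channels / groups) * imageH * imageW;
  const int outputOffset = (numFilters / groups) * outputH * outputW;
  const int weightOffset =
      filterH * filterW * (channels / groups) * (numFilters / groups);

  std::vector<float> in(channels * imageH * imageW);
  std::vector<float> weights(filterH * filterW * (channels / groups) * numFilters);
  std::vector<float> out(numFilters * outputH * outputW);

  for (int g = 0; g < groups; ++g) {
    const float* inputData = in.data() + g * inputOffset;
    const float* wgtData = weights.data() + g * weightOffset;
    float* outData = out.data() + g * outputOffset;
    // hl_convolution_forward(...) works on this group's slice in the real code.
    (void)inputData; (void)wgtData; (void)outData;
  }
  return 0;
}
```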
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ConvBaseProjection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Convolution projection that does the same calculation as CudnnConvLayer. + */ +class ConvProjection : public ConvBaseProjection { + public: + /** + * Constructor. + */ + ConvProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : ConvBaseProjection(config, parameter, useGpu) {} + + ~ConvProjection() {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + virtual size_t calOutputSize(); + virtual size_t calInputSize(); +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7ecbe556c59b32cc5833617717b40c730392506 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for circular convolution of two vectors, + * which is used in NEURAL TURING MACHINE. + * - Input: two vectors, the first is data (batchSize x dataDim), + * the second is shift weights (batchSize x shiftDim) + * - Output: a vector (batchSize x dataDim) + * It is assumed that: + * - a[in]: contains M elements. + * - b[in]: contains N elements (N should be odd). + * - c[out]: contains M elements. + * + * \f[ + * c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j} + * \f] + * + * In this formula: + * - a's index is computed modulo M. + * - b's index is computed modulo N. + * + * The config file api is conv_shift_layer.
+ */ + +class ConvShiftLayer : public Layer { + public: + explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {} + + ~ConvShiftLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(conv_shift, ConvShiftLayer); + +bool ConvShiftLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + return true; +} + +void ConvShiftLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + size_t dataDim = inV0->getWidth(); + + CHECK_EQ(batchSize, inV1->getHeight()); + CHECK_EQ(dataDim, getSize()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str()); + outV->circularConv(*inV0, *inV1); +} + +void ConvShiftLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str()); + + if (inG0 && inG1) { + outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1); + } else { + CHECK(!inG0 || !inG1) << "Not supported"; + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.cpp b/paddle/legacy/gserver/layers/ConvTransOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f4ce2affb144152ed41a9d4be9fa87f800c83dbb --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvTransOperator.cpp @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvTransOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvTransOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. 
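The ConvShiftLayer just above delegates the actual work to Matrix::circularConv()/circularConvDerivative(). For reference, a direct loop-based sketch of the forward formula from its comment, for a single sample (the batched Matrix version is what the patch actually calls; this sketch is not part of the patch):

```cpp
#include <cstdio>
#include <vector>

// c[i] = sum_{j=-(N-1)/2}^{(N-1)/2} a[(i + j) mod M] * b[j + (N-1)/2], with N odd.
std::vector<float> circularConv(const std::vector<float>& a,
                                const std::vector<float>& b) {
  const int M = static_cast<int>(a.size());
  const int N = static_cast<int>(b.size());  // assumed odd
  std::vector<float> c(M, 0.0f);
  for (int i = 0; i < M; ++i) {
    for (int j = -(N - 1) / 2; j <= (N - 1) / 2; ++j) {
      int idx = ((i + j) % M + M) % M;  // wrap the data index modulo M
      c[i] += a[idx] * b[j + (N - 1) / 2];
    }
  }
  return c;
}

int main() {
  std::vector<float> data = {1, 2, 3, 4, 5};
  std::vector<float> shift = {0.25f, 0.5f, 0.25f};
  for (float v : circularConv(data, shift)) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}
```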
+ */ + +REGISTER_OPERATOR(convt, ConvTransOperator); + +void ConvTransOperator::reshape(int batchSize) { + outputH_ = ins_[0]->getFrameHeight(); + outputW_ = ins_[0]->getFrameWidth(); + if (outputH_ == 0) outputH_ = outputY_; + if (outputW_ == 0) outputW_ = outputX_; + imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_); + imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_); + /// Check that the imageSizes are consistent with config + CHECK_EQ(imageH_, imgSizeY_); + CHECK_EQ(imageW_, imgSize_); + out_->setFrameHeight(imageH_); + out_->setFrameWidth(imageW_); + + reshapeImageDescriptors(); + + inputOffset_ = numFilters_ * outputH_ * outputW_; + outputOffset_ = channels_ * imageH_ * imageW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; + + if (!isSelectAlgo_) { + allocConvWorkSpace(); + } + + isSelectAlgo_ = true; +} + +void ConvTransOperator::forward() { + size_t batchSize = ins_[0]->value->getHeight(); + reshape(batchSize); + CHECK_EQ(ins_[1]->value->getHeight(), batchSize); + checkFilterSize(ins_[1]->value); + Matrix::resizeOrCreate( + out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + real *outData = out_->value->getData() + outputOffset_ * batchId; + hl_convolution_backward_data(imageDesc_, + outData, + outputDesc_, + inputData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); + } + } +} + +void ConvTransOperator::backward() { + size_t batchSize = ins_[0]->value->getHeight(); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *outGrad = out_->grad->getData() + outputOffset_ * batchId; + if (ins_[1]->grad) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; + hl_convolution_backward_filter(imageDesc_, + outGrad, + outputDesc_, + inputData, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = ins_[0]->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + hl_convolution_forward(imageDesc_, + outGrad, + outputDesc_, + inputGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); + } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.h b/paddle/legacy/gserver/layers/ConvTransOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..206335a01ff7509eaa5528002c6c9686f05c931b --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvTransOperator.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "ConvBaseOperator.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvTransOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for two inputs are the same. Each data of the first input + * is convolved with each data of the second input indepedently. + * + * The config file api is conv_operator. + */ + +class ConvTransOperator : public ConvBaseOperator { + public: + ConvTransOperator(const OperatorConfig &config, bool useGpu) + : ConvBaseOperator(config, useGpu) {} + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvTransOperator() {} + void forward() override; + void backward() override; + void reshape(int batchSize) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.cpp b/paddle/legacy/gserver/layers/ConvTransProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..00e34c8f2dcd2ea9698779f8b4425561f979cfef --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvTransProjection.cpp @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ConvTransProjection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_PROJECTION(convt, ConvTransProjection); +size_t ConvTransProjection::calOutputSize() { + outputH_ = in_->getFrameHeight(); + outputW_ = in_->getFrameWidth(); + if (outputH_ == 0) outputH_ = configOutH_; + if (outputW_ == 0) outputW_ = configOutW_; + imageH_ = imageSize(outputH_, + (filterH_ - 1) * dilationH_ + 1, + paddingH_, + strideH_, + /* caffeMode */ true); + + imageW_ = imageSize(outputW_, + (filterW_ - 1) * dilationW_ + 1, + paddingW_, + strideW_, + /* caffeMode */ true); + + const_cast(out_)->setFrameHeight(imageH_); + const_cast(out_)->setFrameWidth(imageW_); + + inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_; + outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_; + return imageH_ * imageW_ * configNumFilters_; +} + +size_t ConvTransProjection::calInputSize() { + return static_cast(configChannels_ * outputH_ * outputW_); +} + +void ConvTransProjection::forward() { + int batchSize = in_->value->getHeight(); + reshape(batchSize); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str()); + + real *inData = in_->value->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + real *outData = out_->value->getData() + g * outputOffset_; + hl_convolution_backward_data(imageDesc_, + outData, + outputDesc_, + inData, + filterDesc_, + wgtData, + convDesc_, + workSpace, + bwdDataLimitBytes_, + bwdDataAlgo_); + } +} + +void ConvTransProjection::backward(const UpdateCallback &callback) { + REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str()); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + real *outGrad = out_->grad->getData() + g * outputOffset_; + if (weight_->getWGrad()) { + real *inData = in_->value->getData() + g * inputOffset_; + real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; + hl_convolution_backward_filter(imageDesc_, + outGrad, + outputDesc_, + inData, + filterDesc_, + weightGrad, + convDesc_, + workSpace, + bwdFilterLimitBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = in_->grad; + if (NULL != preGrad) { + real *inGrad = preGrad->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + hl_convolution_forward(imageDesc_, + outGrad, + outputDesc_, + inGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace, + fwdLimitBytes_, + fwdAlgo_); + } + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.h b/paddle/legacy/gserver/layers/ConvTransProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..9b63dd47352b9f24810d9406b314fbfa15ae13c3 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvTransProjection.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ConvBaseProjection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Convolution transpose projection that does the same calculation as CudnnConvLayer. + */ +class ConvTransProjection : public ConvBaseProjection { + public: + /** + * Constructor. + */ + ConvTransProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : ConvBaseProjection(config, parameter, useGpu) {} + + ~ConvTransProjection() {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + virtual size_t calOutputSize(); + virtual size_t calInputSize(); +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c38ab251f18728425d01479b82630550d29e9b61 --- /dev/null +++ b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for the weighted sum of vectors, + * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND + * TRANSLATE. + * - Input: the size of the first input is weightDim, + * and the size of the second input is weightDim * dataDim. + * - Output: the size of the output is dataDim. + * \f[ + * out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)), + * i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1) + * \f] + * Note that the above computation is for one sample. Multiple samples are + * processed in one batch. + * + * The config file api is linear_comb_layer. + */ +class ConvexCombinationLayer : public Layer { + protected: + /// A matrix pointer pointing to the second input. + MatrixPtr tmpMtx0; + /// A matrix pointer pointing to the first input. + MatrixPtr tmpRow0; + /// A matrix pointer pointing to the output.
+ MatrixPtr tmpRow1; + + public: + explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {} + + ~ConvexCombinationLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(convex_comb, ConvexCombinationLayer); + +bool ConvexCombinationLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(2U, inputLayers_.size()); + size_t dataDim = getSize(); + size_t weightDim = inputLayers_[0]->getSize(); + + CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize()) + << "Dimension mismatch"; + + tmpRow0 = Matrix::create(nullptr, + /* height= */ 1, + weightDim, + /* trans= */ false, + useGpu_); + tmpRow1 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ weightDim, + dataDim, + /* trans= */ false, + useGpu_); + + return true; +} + +void ConvexCombinationLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + size_t weightDim = inV0->getWidth(); + size_t dataDim = getSize(); + + CHECK_EQ(batchSize, inV1->getHeight()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str()); + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim); + tmpRow0->setData(inV0->getData() + i * weightDim); + tmpRow1->setData(outV->getData() + i * dataDim); + + tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0); + } +} + +void ConvexCombinationLayer::backward(const UpdateCallback& callback) { + MatrixPtr outG = getOutputGrad(); + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + size_t batchSize = inV0->getHeight(); + size_t weightDim = inV0->getWidth(); + size_t dataDim = getSize(); + + REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str()); + + if (inG0) { + for (size_t i = 0; i < batchSize; i++) { + tmpRow0->setData(inG0->getData() + i * weightDim); + tmpRow1->setData(outG->getData() + i * dataDim); + tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim); + + tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1); + } + } + + if (inG1) { + for (size_t i = 0; i < batchSize; i++) { + tmpRow0->setData(inV0->getData() + i * weightDim); + tmpRow1->setData(outG->getData() + i * dataDim); + tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim); + + tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CosSimLayer.cpp b/paddle/legacy/gserver/layers/CosSimLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ab8d7cc1f61823890676e8f647f784cfa9a0775e --- /dev/null +++ b/paddle/legacy/gserver/layers/CosSimLayer.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
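ConvexCombinationLayer::forward() above computes, per sample, a 1 x dataDim output row as the product of a 1 x weightDim weight row (first input) and a weightDim x dataDim matrix stored flat in the second input; backward() distributes the output gradient through the same two factors. A plain per-sample sketch of the forward product (illustrative sizes, not from the patch):

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int weightDim = 3, dataDim = 4;
  // First input: one weight per row of the flattened matrix below.
  std::vector<float> in0 = {0.2f, 0.3f, 0.5f};
  // Second input: a weightDim x dataDim matrix stored row-major as a flat vector.
  std::vector<float> in1(weightDim * dataDim, 1.0f);

  // out(j) = sum_i in0(i) * in1(i * dataDim + j)
  std::vector<float> out(dataDim, 0.0f);
  for (int i = 0; i < weightDim; ++i)
    for (int j = 0; j < dataDim; ++j)
      out[j] += in0[i] * in1[i * dataDim + j];

  for (float v : out) std::printf("%g ", v);  // all 1, since the weights sum to 1
  std::printf("\n");
  return 0;
}
```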
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CosSimLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(cos, CosSimLayer); + +bool CosSimLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2LU); + + createFunction(forward_, + "CosSimForward", + FuncConfig().set("scale", (real)config_.cos_scale())); + createFunction(backward_, + "CosSimBackward", + FuncConfig().set("scale", (real)config_.cos_scale())); + + return true; +} + +void CosSimLayer::forward(PassType passType) { + Layer::forward(passType); + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(0)->getHeight(); + int size = getSize(); + CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; + + { + REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str()); + reserveOutput(batchSize, size); + } + + MatrixPtr outV = getOutputValue(); + /* activation */ { + REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str()); + MatrixPtr prevOut1 = getInputValue(0); + MatrixPtr prevOut2 = getInputValue(1); + + CHECK(outV && prevOut1 && prevOut2); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*prevOut1); + inputs.addArg(*prevOut2); + outputs.addArg(*outV, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + } +} + +void CosSimLayer::backward(const UpdateCallback& callback) { + /* activation */ { + REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str()); + CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed"; + + const auto outG = this->getOutputGrad(); + const auto outV = this->getOutputValue(); + const auto inV1 = this->getInputValue(0); + const auto inV2 = this->getInputValue(1); + auto inG1 = this->getInputGrad(0); + auto inG2 = this->getInputGrad(1); + CHECK(outG && outV && inV1 && inV2 && inG1 && inG2); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*outG); + inputs.addArg(*outV); + inputs.addArg(*inV1); + inputs.addArg(*inV2); + outputs.addArg(*inG1, ADD_TO); + outputs.addArg(*inG2, ADD_TO); + + backward_[0]->calc(inputs, outputs); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CosSimLayer.h b/paddle/legacy/gserver/layers/CosSimLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..b08e2c6a35369832732706d64f209f85a5292a6f --- /dev/null +++ b/paddle/legacy/gserver/layers/CosSimLayer.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { +/** + * @brief A layer for calculating cosine similarity between two vector + * \f[ + * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+... + * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}} + * \f] + * + * - Input1: A vector (batchSize * dataDim) * + * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) * + * - Output: A vector (batchSize * 1) + * + * The config file api is cos_sim. + */ +class CosSimLayer : public Layer { + public: + explicit CosSimLayer(const LayerConfig& config) : Layer(config) {} + + ~CosSimLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..03de0be815a1fb5eeb7ffab31b1721dc5951a469 --- /dev/null +++ b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp @@ -0,0 +1,182 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
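The CosSimLayer comment above gives the scaled cosine similarity that the CosSimForward/CosSimBackward functions evaluate in batched form, with the scale taken from config_.cos_scale(). A direct single-pair sketch of the same formula (standalone, not the library's implementation):

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// f(x, y) = scale * (x . y) / (||x|| * ||y||)
float cosSim(const std::vector<float>& x, const std::vector<float>& y, float scale) {
  float dot = 0.0f, nx = 0.0f, ny = 0.0f;
  for (std::size_t i = 0; i < x.size(); ++i) {
    dot += x[i] * y[i];
    nx += x[i] * x[i];
    ny += y[i] * y[i];
  }
  return scale * dot / (std::sqrt(nx) * std::sqrt(ny));
}

int main() {
  std::printf("%g\n", cosSim({1, 0, 1}, {1, 1, 0}, /*scale=*/1.0f));  // 0.5
  return 0;
}
```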
*/ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { +/** + * @brief A layer for computing cosine similarity between a vector + * and each row of a matrix + * out[i] = cos_scale * cos(in1, in2(i,:)); + * @note used in NEURAL TURING MACHINE + * + * Input1: a vector (batchSize * dataDim) + * + * Input2: a matrix in vector form (batchSize * (weightDim*dataDim)) + * + * Output: a vector (batchSize * weightDim) + */ + +class CosSimVecMatLayer : public Layer { + protected: + MatrixPtr tmpMtx0; + MatrixPtr tmpMtx1; + MatrixPtr tmpRow0; + MatrixPtr tmpRow1; + MatrixPtr tmpRow2; + MatrixPtr tmpRow3; + + public: + explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {} + + ~CosSimVecMatLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(cos_vm, CosSimVecMatLayer); + +bool CosSimVecMatLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + size_t dataDim = inputLayers_[0]->getSize(); + size_t numKeys = getSize(); + size_t memoryDim = inputLayers_[1]->getSize(); + + CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch"; + + tmpRow0 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpRow1 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpRow2 = Matrix::create(nullptr, + /* height= */ numKeys, + 1, + /* trans= */ false, + useGpu_); + tmpRow3 = Matrix::create(nullptr, + /* height= */ numKeys, + 1, + /* trans= */ false, + useGpu_); + + tmpMtx0 = Matrix::create(nullptr, + /* height= */ numKeys, + dataDim, + /* trans= */ false, + useGpu_); + tmpMtx1 = Matrix::create(nullptr, + /* height= */ numKeys, + dataDim, + /* trans= */ false, + useGpu_); + + CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1); + + createFunction(forward_, + "CosSimForward", + FuncConfig().set("scale", (real)config_.cos_scale())); + createFunction(backward_, + "CosSimBackward", + FuncConfig().set("scale", (real)config_.cos_scale())); + + return true; +} + +void CosSimVecMatLayer::forward(PassType passType) { + Layer::forward(passType); + CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + size_t numKeys = getSize(); + + CHECK_EQ(batchSize, inV1->getHeight()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, numKeys); + } + + MatrixPtr outV = getOutputValue(); + CHECK(outV && inV0 && inV1); + REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str()); + for (size_t i = 0; i < batchSize; i++) { + tmpRow0->setData(inV0->rowBuf(i)); + tmpMtx0->setData(inV1->rowBuf(i)); + tmpRow2->setData(outV->rowBuf(i)); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*tmpMtx0); + inputs.addArg(*tmpRow0); + outputs.addArg(*tmpRow2, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); + } +} + +void CosSimVecMatLayer::backward(const UpdateCallback& callback) { + CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed"; + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr 
inG1 = getInputGrad(1); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + + size_t batchSize = inV0->getHeight(); + CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG); + REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str()); + + for (size_t i = 0; i < batchSize; i++) { + tmpRow0->setData(inV0->rowBuf(i)); + tmpRow1->setData(inG0->rowBuf(i)); + tmpMtx0->setData(inV1->rowBuf(i)); + tmpMtx1->setData(inG1->rowBuf(i)); + tmpRow2->setData(outV->rowBuf(i)); + tmpRow3->setData(outG->rowBuf(i)); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*tmpRow3); + inputs.addArg(*tmpRow2); + inputs.addArg(*tmpMtx0); + inputs.addArg(*tmpRow0); + outputs.addArg(*tmpMtx1, ADD_TO); + outputs.addArg(*tmpRow1, ADD_TO); + + backward_[0]->calc(inputs, outputs); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CostLayer.cpp b/paddle/legacy/gserver/layers/CostLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..18b5b77bde9dee97cb6971624007307ff06411c7 --- /dev/null +++ b/paddle/legacy/gserver/layers/CostLayer.cpp @@ -0,0 +1,748 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CostLayer.h" +#include <algorithm> +#include <cmath> +#include <memory> +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +bool CostLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + bool ret = Layer::init(layerMap, parameterMap); + coeff_ = config_.coeff(); + if (!ret) return ret; + CHECK_GE(inputLayers_.size(), 2UL); + CHECK_LE(inputLayers_.size(), 3UL); + if (inputLayers_.size() == 3) { + weightLayer_ = inputLayers_[2]; + } + return true; +} + +void CostLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(*getOutputLayer())->getHeight(); + int size = 1; + resetOutput(batchSize, size); + + const MatrixPtr& output = getInputValue(*getOutputLayer()); + Argument label = getInput(*getLabelLayer()); + + /* get the cost value for each sample*/ + forwardImp(*output, label, *getOutputValue()); + if (weightLayer_) { + const MatrixPtr& weight = getInputValue(*weightLayer_); + getOutputValue()->dotMul(*getOutputValue(), *weight); + } +} + +void CostLayer::backward(const UpdateCallback& callback) { + (void)callback; + + const Argument& output = getInput(*getOutputLayer()); + Argument label = getInput(*getLabelLayer()); + + bool support = true; + if (weightLayer_) { + support = output.grad->getAbsSum() == 0; + } + + backwardImp(*output.value, label, *output.grad); + + if (weightLayer_) { + CHECK(support) << "Weighted cost layer '" << getName() + << "' must be the last layer " + "connected to the output layer '" + << getOutputLayer()->getName() << "'"; + output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_)); + } + if (coeff_ != real(1.0f)) { + output.grad->add(coeff_, 0); + } +} + +// +// class MultiClassCrossEntropy +// +bool
MultiClassCrossEntropy::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void MultiClassCrossEntropy::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + target.oneHotCrossEntropy(output, *label.ids); +} + +void MultiClassCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + outputG.oneHotCrossEntropyBp(output, *label.ids); +} + +// +// class MultiClassCrossEntropyWithSelfNorm +// +REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm, + MultiClassCrossEntropyWithSelfNorm); + +bool MultiClassCrossEntropyWithSelfNorm::init( + const LayerMap& layerMap, const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); + output.rowSum(*sftMaxSum_); + sftMaxSum_->log2(); + + target.oneHotCrossEntropy(output, *label.ids); + target.add(*sftMaxSum_); + + sftMaxSum_->square2(); + target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha()); +} + +void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); + output.rowSum(*sftMaxSum_); + + Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_); + sftMaxSum_->reciprocal2(*sumInv_); + + outputG.oneHotCrossEntropyBp(output, *label.ids); + outputG.addColumnVector(*sumInv_); + + sftMaxSum_->log2(); + sumInv_->dotMul(*sumInv_, *sftMaxSum_); + sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha()); + + outputG.addColumnVector(*sumInv_); +} + +// +// class SoftBinaryClassCrossEntropy +// +REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy); + +bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + Matrix::resizeOrCreate( + targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); + + targetPerDim_->softCrossEntropy(output, *label.value); + targetPerDim_->rowSum(target); +} + +void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + outputG.softCrossEntropyBp(output, *label.value); +} + +// +// class SumOfSquaresCostLayer +// + +REGISTER_LAYER(square_error, SumOfSquaresCostLayer); + +bool SumOfSquaresCostLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void SumOfSquaresCostLayer::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + target.sumOfSquares(output, *label.value); +} + +void SumOfSquaresCostLayer::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + outputG.sumOfSquaresBp(output, *label.value); +} + +// +// class SmoothL1CostLayer +// + +REGISTER_LAYER(smooth_l1, SmoothL1CostLayer); + +bool SmoothL1CostLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void SmoothL1CostLayer::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + MatrixPtr targetCpu, outputCpu, labelCpu; + if (useGpu_) { + targetCpu = + Matrix::create(target.getHeight(), target.getWidth(), false, false); + outputCpu = + 
Matrix::create(output.getHeight(), output.getWidth(), false, false); + labelCpu = Matrix::create( + label.value->getHeight(), label.value->getWidth(), false, false); + targetCpu->copyFrom(target); + outputCpu->copyFrom(output); + labelCpu->copyFrom(*label.value); + targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0); + target.copyFrom(*targetCpu); + } else { + target.smoothL1(output, *label.value, 1.0); + } +} + +void SmoothL1CostLayer::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + MatrixPtr outputGCpu, outputCpu, labelCpu; + if (useGpu_) { + outputGCpu = + Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false); + outputCpu = + Matrix::create(output.getHeight(), output.getWidth(), false, false); + labelCpu = Matrix::create( + label.value->getHeight(), label.value->getWidth(), false, false); + outputGCpu->copyFrom(outputG); + outputCpu->copyFrom(output); + labelCpu->copyFrom(*label.value); + outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0); + outputG.copyFrom(*outputGCpu); + } else { + outputG.smoothL1Bp(output, *label.value, 1.0); + } +} + +// +// class RankingCost +// +bool RankingCost::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + posPairCount_ = 0; + negPairCount_ = 0; + + bool ret = Layer::init(layerMap, parameterMap); + if (!ret) return ret; + CHECK_GE(inputLayers_.size(), 3UL); + CHECK_LE(inputLayers_.size(), 4UL); + if (inputLayers_.size() == 4) { + weightLayer_ = inputLayers_[3]; + } + return true; +} + +void RankingCost::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(*getOutputLayer(0))->getHeight(); + int size = 1; + resizeOutput(batchSize, size); + Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_); + MatrixPtr label = getInputValue(*getLabelLayer()); + if (!label) { + // input label is not in value, try ids + IVectorPtr idLabel = getInput(*getLabelLayer()).ids; + CHECK(idLabel) << "label layer has neither value nor ids"; + CHECK_EQ((size_t)batchSize, idLabel->getSize()); + Matrix::resizeOrCreate( + labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_); + labelBuf_->copyFrom(*idLabel); + label = labelBuf_; + } + + MatrixPtr output[] = {getInputValue(*getOutputLayer(0)), + getInputValue(*getOutputLayer(1))}; + MatrixPtr target = this->getOutputValue(); + margin_->sub(*output[0], *output[1]); + + // for validation + size_t height = output[0]->getHeight(); + target->biggerThan(*(output[0]), *(output[1]), *label); + double total = static_cast(height); + if (weightLayer_) { + const MatrixPtr& weight = getInputValue(*weightLayer_); + target->dotMul(*target, *weight); + total = weight->getSum(); + } + double pos = target->getSum(); + posPairCount_ += pos; + negPairCount_ += (total - pos); + + // forward + target->logisticRegressionLoss(*margin_, *label); + if (weightLayer_) { + const MatrixPtr& weight = getInputValue(*weightLayer_); + target->dotMul(*target, *weight); + } +} + +void RankingCost::backward(const UpdateCallback& callback) { + (void)callback; + + MatrixPtr label = getInputValue(*getLabelLayer()); + if (!label) { + // input label is not in value, but in ids + // use labelBuf_ (should already resized and copied during forward) + label = labelBuf_; + } + + Matrix::resizeOrCreate( + marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_); + marginGrad_->zeroMem(); + marginGrad_->logisticRegressionLossBp(*margin_, *label); + if (weightLayer_) { + const MatrixPtr& weight = 
getInputValue(*weightLayer_); + marginGrad_->dotMul(*marginGrad_, *weight); + } + + getInputGrad(0)->add(*marginGrad_); + getInputGrad(1)->sub(*marginGrad_); +} + +void RankingCost::onPassEnd() { + double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_); + LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_ + << " neg= " << negPairCount_; + + posPairCount_ = 0; + negPairCount_ = 0; +} + +// +// class LambdaCost +// +REGISTER_LAYER(lambda_cost, LambdaCost); + +bool LambdaCost::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + truncationSize_ = config_.ndcg_num(); + maxSortSize_ = config_.max_sort_size(); + if (maxSortSize_ != -1) { + CHECK_GE(maxSortSize_, truncationSize_) + << "maxSortSize must be greater than or equal to NDCG size!"; + } + LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_ + << ", Max partial sort size = " << maxSortSize_; + CHECK(!useGpu_) << "LambdaRank supports CPU only!"; + return Layer::init(layerMap, parameterMap); +} + +void LambdaCost::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(*getOutputLayer())->getHeight(); + resizeOutput(batchSize, 1); + + MatrixPtr score = getInputValue(*getScoreLayer()); + MatrixPtr output = getInputValue(*getOutputLayer()); + MatrixPtr target = this->getOutputValue(); + + real* scoreData = score->getData(); + real* outputData = output->getData(); + real* targetData = target->getData(); + + auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; + const int* startPosData = startPos->getData(false); + size_t batchNum = startPos->getSize() - 1; + for (size_t i = 0; i < batchNum; ++i) { + int beginPos = startPosData[i]; + int endPos = startPosData[i + 1]; + real NDCG = calcNDCG( + outputData + beginPos, scoreData + beginPos, endPos - beginPos); + for (int j = beginPos; j < endPos; ++j) { + targetData[j] = NDCG; + } + } +} + +void LambdaCost::backward(const UpdateCallback& callback) { + (void)callback; + MatrixPtr score = getInputValue(*getScoreLayer()); + MatrixPtr output = getInputValue(*getOutputLayer()); + Matrix::resizeOrCreate(marginGrad_, + score->getHeight(), + 1, + /* trans= */ false, + useGpu_); + marginGrad_->zeroMem(); + + real* gradData = marginGrad_->getData(); + real* scoreData = score->getData(); + real* outputData = output->getData(); + + auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; + const int* startPosData = startPos->getData(false); + size_t batchNum = startPos->getSize() - 1; + + for (size_t i = 0; i < batchNum; ++i) { + int beginPos = startPosData[i]; + int endPos = startPosData[i + 1]; + calcGrad(outputData + beginPos, + scoreData + beginPos, + gradData + beginPos, + endPos - beginPos); + } + + getInputGrad(0)->add(*marginGrad_); +} + +void LambdaCost::calcGrad(const real* outputScore, + const real* score, + real* gradData, + int size) { + CHECK_GE(size, truncationSize_) + << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; + int sortSize = maxSortSize_ == -1 ? 
size : std::min(maxSortSize_, size); + + scorePair_.clear(); + for (int i = 0; i < size; ++i) { + scorePair_.push_back(std::make_pair(score[i], i)); + } + if (size <= sortSize) { + std::sort(scorePair_.begin(), + scorePair_.end(), + [](const std::pair<real, int>& a, const std::pair<real, int>& b) { + return a.first > b.first; + }); + } else { + std::partial_sort( + scorePair_.begin(), + scorePair_.begin() + sortSize, + scorePair_.end(), + [](const std::pair<real, int>& a, const std::pair<real, int>& b) { + return a.first > b.first; + }); + } + + real maxDCG = 0; + for (int i = 0; i < truncationSize_; ++i) { + maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2); + } + CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!"; + + for (int i = 0; i < sortSize; ++i) { + for (int j = i + 1; j < size; ++j) { + int index_i = scorePair_[i].second; + int index_j = scorePair_[j].second; + real score_i = score[index_i]; + real score_j = score[index_j]; + real dcgDif = 0; + if (j < sortSize) { + dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) * + (1 / std::log(i + 2) - 1 / std::log(j + 2)); + } else { + dcgDif = + (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2); + } + + real lambda_ij = + -std::abs(dcgDif) / + (1 + std::exp(outputScore[index_i] - outputScore[index_j])); + gradData[index_i] += lambda_ij / maxDCG; + gradData[index_j] -= lambda_ij / maxDCG; + } + } +} + +real LambdaCost::calcNDCG(const real* outputScore, + const real* score, + int size) { + CHECK_GE(size, truncationSize_) + << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; + + outputScorePair_.clear(); + for (int i = 0; i < size; ++i) { + outputScorePair_.push_back(std::make_pair(outputScore[i], i)); + } + std::partial_sort( + outputScorePair_.begin(), + outputScorePair_.begin() + truncationSize_, + outputScorePair_.end(), + [](const std::pair<real, int>& a, const std::pair<real, int>& b) { + return a.first > b.first; + }); + + real DCG = 0; + for (int i = 0; i < truncationSize_; ++i) { + DCG += + (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2); + } + + scoreVec_.resize(size); + std::copy(score, score + size, scoreVec_.begin()); + real maxDCG = 0; + std::partial_sort(scoreVec_.begin(), + scoreVec_.begin() + truncationSize_, + scoreVec_.end(), + std::greater<real>()); + for (int i = 0; i < truncationSize_; ++i) { + maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2); + } + CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!"; + + return DCG / maxDCG; +} + +// +// class MultiBinaryLabelCrossEntropy +// + +REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy); + +bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + MatrixPtr value = nullptr; + if (label.ids) { + CHECK(!label.value); + value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); + } else { + CHECK(label.value); + value = label.value; + } + + if (dynamic_cast<CpuSparseMatrix*>(value.get()) || + dynamic_cast<GpuSparseMatrix*>(value.get())) { + target.multiBinaryLabelCrossEntropy(output, *value); + } else { + Matrix::resizeOrCreate( + targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); + + targetPerDim_->binaryLabelCrossEntropy(output, *value); + targetPerDim_->rowSum(target); + } +} + +void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + MatrixPtr value = nullptr; + if (label.ids) { + CHECK(!value); +
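// The integer ids are expanded below into a one-hot sparse matrix so the sparse cross-entropy kernel can be reused. +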
value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); + } else { + CHECK(label.value); + value = label.value; + } + + if (dynamic_cast<CpuSparseMatrix*>(value.get()) || + dynamic_cast<GpuSparseMatrix*>(value.get())) { + outputG.multiBinaryLabelCrossEntropyBp(output, *value); + } else { + outputG.binaryLabelCrossEntropyBp(output, *value); + } +} + +bool HuberCost::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + CostLayer::init(layerMap, parameterMap); + if (useGpu_) { + tmpCpuInput_.reserve(inputLayers_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + tmpCpuInput_.push_back(Argument()); + } + } + return true; +} + +void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) { + if (useGpu_) { + for (size_t i = 0; i < inputLayers_.size(); i++) { + tmpCpuInput_[i].resizeAndCopyFrom( + getInput(i), false, HPPL_STREAM_DEFAULT); + } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + } +} + +// +// Huber loss for robust regression. +// +REGISTER_LAYER(huber_regression, HuberRegressionLoss); + +bool HuberRegressionLoss::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + HuberCost::init(layerMap, parameterMap); + delta_ = config_.delta(); + return true; +} + +void HuberRegressionLoss::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + HuberCost::forwardImp(output, label, target); + size_t numSamples = target.getHeight(); + size_t dim = output.getWidth(); + CHECK(label.value); + CHECK_EQ((*label.value).getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(dim, (*label.value).getWidth()); + CHECK_EQ(target.getWidth(), (size_t)1); + + real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); + real* lbl = + useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData(); + std::vector<real> cost(numSamples, 0); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = 0; j < dim; ++j) { + int index = i * dim + j; + real a = std::abs(lbl[index] - out[index]); + if (a <= delta_) + cost[i] += a * a / 2; + else + cost[i] += delta_ * (a - delta_ / 2); + } + } + target.copyFrom(cost.data(), numSamples); +} + +void HuberRegressionLoss::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + size_t numSamples = output.getHeight(); + size_t dim = output.getWidth(); + real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); + real* lbl = + useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData(); + real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = 0; j < dim; ++j) { + int index = i * dim + j; + real a = lbl[index] - out[index]; + if (std::abs(a) <= delta_) + grad[index] += -a; + else + grad[index] += a > 0 ? -delta_ : delta_; + } + } + if (useGpu_) outputG.copyFrom(grad, numSamples * dim); +} + +// +// Huber loss for robust two-class classification +// +REGISTER_LAYER(huber_classification, HuberTwoClassification); + +bool HuberTwoClassification::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return HuberCost::init(layerMap, parameterMap); +} + +void HuberTwoClassification::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + HuberCost::forwardImp(output, label, target); + size_t numSamples = target.getHeight(); + CHECK(label.ids); + CHECK_EQ((*label.ids).getSize(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), (size_t)1); + CHECK_EQ(target.getWidth(), (size_t)1); + + real* out = useGpu_ ?
tmpCpuInput_[0].value->getData() : output.getData(); + int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); + std::vector cost(numSamples, 0); + for (size_t i = 0; i < numSamples; ++i) { + int y = 2 * lbl[i] - 1; + real a = out[i] * y; + if (a < -1) + cost[i] = -4 * a; + else if (a < 1) + cost[i] = (1 - a) * (1 - a); + } + target.copyFrom(cost.data(), numSamples); +} + +void HuberTwoClassification::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + size_t numSamples = output.getHeight(); + real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); + int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); + real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData(); + for (size_t i = 0; i < numSamples; ++i) { + int y = 2 * lbl[i] - 1; + real a = out[i] * y; + if (a < -1) + grad[i] += -4 * y; + else if (a < 1) + grad[i] += -2 * (1 - a) * y; + } + if (useGpu_) outputG.copyFrom(grad, numSamples); +} +/** + * This cost layer compute the sum of its input as loss. + * \f[ + * o(i) = \sum_{j=1}^D y_{ij} + * \f] + */ +class SumCostLayer : public Layer { + public: + explicit SumCostLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + bool ret = Layer::init(layerMap, parameterMap); + if (!ret) return ret; + CHECK_EQ(inputLayers_.size(), 1UL); + return true; + } + + void forward(PassType passType) override { + Layer::forward(passType); + const MatrixPtr& input = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + int batchSize = input->getHeight(); + int size = 1; + resizeOutput(batchSize, size); + output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0); + } + + void backward(const UpdateCallback& callback = nullptr) override { + getInputGrad(0)->add((real)1); + } +}; + +REGISTER_LAYER(sum_cost, SumCostLayer); + +} // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/legacy/gserver/layers/CostLayer.h similarity index 100% rename from paddle/gserver/layers/CostLayer.h rename to paddle/legacy/gserver/layers/CostLayer.h diff --git a/paddle/legacy/gserver/layers/CropLayer.cpp b/paddle/legacy/gserver/layers/CropLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d891375ecce0371503ba3034f0584f3b1e553a55 --- /dev/null +++ b/paddle/legacy/gserver/layers/CropLayer.cpp @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "CropLayer.h" +#include "paddle/legacy/utils/Stat.h" +namespace paddle { + +REGISTER_LAYER(crop, CropLayer); + +bool CropLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_LE(static_cast(inputLayers_.size()), 2); + CHECK_GE(static_cast(inputLayers_.size()), 1); + crop_axis_ = config_.axis(); + for (int i = 0; i < config_.offset_size(); i++) { + crop_offsets_.push_back(config_.offset(i)); + } + + // 1. get input_0 shape + auto& input0_img_conf = config_.inputs(0).image_conf(); + inDims_ = TensorShape({0, + input0_img_conf.channels(), + input0_img_conf.has_img_size_y() + ? input0_img_conf.img_size_y() + : input0_img_conf.img_size(), + input0_img_conf.img_size()}); + // 2. get target dims from config + if (config_.inputs_size() == 1) { + targetDims_ = TensorShape({config_.shape(0), + config_.shape(1), + config_.shape(2), + config_.shape(3)}); + } else { + // 2. get input_1 shape + auto& input1_img_conf = config_.inputs(1).image_conf(); + targetDims_ = TensorShape({0, + input1_img_conf.channels(), + input1_img_conf.has_img_size_y() + ? input1_img_conf.img_size_y() + : input1_img_conf.img_size(), + input1_img_conf.img_size()}); + } + + // 3. get final crop corner + int dimSize = 4; + crop_corner_ = {0, 0, 0, 0}; + for (int i = 0; i < dimSize; i++) { + if (i >= crop_axis_) { + if (crop_offsets_.size() > 1) { + crop_corner_[i] = crop_offsets_[i - crop_axis_]; + } else { + crop_corner_[i] = crop_offsets_[0]; + } + } + } + + outDims_ = TensorShape(4); + + createFunction( + forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_)); + createFunction( + backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_)); + + return true; +} + +void CropLayer::setOutDims() { + MatrixPtr input = inputLayers_[1]->getOutputValue(); + size_t batchSize = input->getHeight(); + // get target dims from input_1 + if (config_.inputs_size() == 2) { + targetDims_.setDim(0, batchSize); + int ch = config_.inputs(0).image_conf().channels(); + if (ch != 0) targetDims_.setDim(1, ch); + int h = inputLayers_[1]->getOutput().getFrameHeight(); + if (h != 0) targetDims_.setDim(2, h); + int w = inputLayers_[1]->getOutput().getFrameWidth(); + if (w != 0) targetDims_.setDim(3, w); + } + // get final crop shape from target dims and crop axis + std::vector crop_shape; + int dimSize = 4; + for (int i = 0; i < dimSize; i++) { + if (i >= crop_axis_) { + crop_shape.push_back(targetDims_[i]); + } else { + crop_shape.push_back(inDims_[i]); + } + } + + outDims_.reshape( + {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]}); + output_.setFrameHeight(crop_shape[2]); + output_.setFrameWidth(crop_shape[3]); +} + +void CropLayer::setInDims() { + MatrixPtr input = inputLayers_[0]->getOutputValue(); + size_t batchSize = input->getHeight(); + inDims_.setDim(0, batchSize); + int h = inputLayers_[0]->getOutput().getFrameHeight(); + if (h != 0) inDims_.setDim(2, h); + int w = inputLayers_[0]->getOutput().getFrameWidth(); + if (w != 0) inDims_.setDim(3, w); +} + +void CropLayer::forward(PassType passType) { + Layer::forward(passType); + setInDims(); + setOutDims(); + int size = outDims_[1] * outDims_[2] * outDims_[3]; + resetOutput(outDims_[0], size); + MatrixPtr outV = getOutputValue(); + REGISTER_TIMER_INFO("CropForward", getName().c_str()); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inDims_); + outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO); + 
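// Run the Crop function: it copies the sub-tensor that starts at crop_corner_ from the inDims_-shaped input into the outDims_-shaped output. +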
forward_[0]->calc(inputs, outputs); +} + +void CropLayer::backward(const UpdateCallback& callback) { + (void)callback; + REGISTER_TIMER_INFO("CropBackward", getName().c_str()); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outDims_); + outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); + backward_[0]->calc(inputs, outputs); +} +} // namespace paddle diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/legacy/gserver/layers/CropLayer.h similarity index 100% rename from paddle/gserver/layers/CropLayer.h rename to paddle/legacy/gserver/layers/CropLayer.h diff --git a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0fe100a96c01713f6c8d10d4eff428e7e743b002 --- /dev/null +++ b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "NormLayer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data, + size_t iter, + size_t spatialDim) { + return Matrix::create(data->getData() + iter * channels_ * spatialDim, + channels_, + spatialDim, + false, + useGpu_); +} + +MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data, + size_t iter, + size_t spatialDim) { + return Matrix::create( + data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_); +} + +bool CrossChannelNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK(parameters_[0]); + const NormConfig& conf = config_.inputs(0).norm_conf(); + channels_ = conf.channels(); + scale_.reset(new Weight(channels_, 1, parameters_[0])); + return true; +} + +void CrossChannelNormLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr inV = getInputValue(0); + + size_t batchSize = inV->getHeight(); + size_t dataDim = inV->getWidth(); + CHECK_EQ(getSize(), dataDim); + + reserveOutput(batchSize, dataDim); + MatrixPtr outV = getOutputValue(); + size_t spatialDim = dataDim / channels_; + + Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_); + Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_); + Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_); + + inV->square2(*dataBuffer_); + for (size_t i = 0; i < batchSize; i++) { + const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); + const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); + MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim); + MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); + + // compute norm. 
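+ // For every spatial position, sum the squared activations over all channels, add a small epsilon, and take the square root to obtain the per-position L2 norm across channels.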
+ spatialBuffer_->sumCols(*dataTmp, 1, 0); + // add eps to avoid overflow + spatialBuffer_->add(1e-6); + spatialBuffer_->sqrt2(*spatialBuffer_); + normTmp->copyFrom(*spatialBuffer_); + outVTmp->copyFrom(*inVTmp); + outVTmp->divRowVector(*spatialBuffer_); + // scale the layer. + outVTmp->mulColVector(*scale_->getW()); + } +} + +void CrossChannelNormLayer::backward(const UpdateCallback& callback) { + MatrixPtr inG = getInputGrad(0); + MatrixPtr inV = getInputValue(0); + MatrixPtr outG = getOutputGrad(); + MatrixPtr outV = getOutputValue(); + + size_t batchSize = inG->getHeight(); + size_t dataDim = inG->getWidth(); + size_t spatialDim = dataDim / channels_; + + MatrixPtr inGBuffer; + Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_); + + dataBuffer_->dotMul(*outG, *outV); + Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_); + Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_); + Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_); + scaleDiff_->zeroMem(); + for (size_t i = 0; i < batchSize; i++) { + MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim); + const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim); + const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim); + const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim); + const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim); + + channelBuffer_->sumRows(*dataTmp, 1, 0); + channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW())); + // store a / scale[i] in scaleDiff_ temporary + scaleDiff_->add(*channelBuffer_, 1.); + + sampleBuffer_->dotMul(*inVTmp, *outGTmp); + spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.); + // scale the grad + inGBuffer->copyFrom(*inVTmp); + inGBuffer->mulRowVector(*spatialBuffer_); + // divide by square of norm + spatialBuffer_->dotMul(*normTmp, *normTmp); + inGBuffer->divRowVector(*spatialBuffer_); + // subtract + inGBuffer->add(*outGTmp, -1, 1); + // divide by norm + inGBuffer->divRowVector(*normTmp); + // scale the diff + inGBuffer->mulColVector(*scale_->getW()); + + inGTmp->add(*inGBuffer); + } + // updata scale + if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_); + scale_->getParameterPtr()->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp similarity index 100% rename from paddle/gserver/layers/CrossEntropyOverBeam.cpp rename to paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h similarity index 100% rename from paddle/gserver/layers/CrossEntropyOverBeam.h rename to paddle/legacy/gserver/layers/CrossEntropyOverBeam.h diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..051155e0d2c1b4910c6627a902a4150cbfb15800 --- /dev/null +++ b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CudnnBatchNormLayer.h" +#include "Layer.h" +#include "paddle/legacy/cuda/include/hl_batch_norm.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer); + +bool CudnnBatchNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false; + CHECK(useGpu_) << "CudnnBatchNorm only support GPU"; + + hl_create_tensor_descriptor(&ioDesc_); + hl_create_tensor_descriptor(&bnParamDesc_); + hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1); + + return true; +} + +void CudnnBatchNormLayer::reshape(int batchSize) { + hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_); +} + +void CudnnBatchNormLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInputValue(0)->getHeight(); + calFeatureMapSize(); + reshape(batchSize); + resetOutput(batchSize, getInputValue(0)->getWidth()); + + // for testing in training peroid. + useGlobalStats_ = (passType == PASS_TEST); + if (passType == PASS_TEST && config_.has_use_global_stats()) { + useGlobalStats_ = config_.use_global_stats(); + } + + real* input = getInputValue(0)->getData(); + real* output = getOutputValue()->getData(); + real* gamma = weight_->getW()->getData(); + real* beta = biases_->getW()->getData(); + real* movingMean = movingMean_->getW()->getData(); + real* movingVar = movingVar_->getW()->getData(); + + // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. + eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); + + if (!useGlobalStats_) { + REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); + real* savedMean = savedMean_->getData(); + real* savedInvVar = savedInvVar_->getData(); + hl_batch_norm_forward_training(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + 1.0 - movingAvgFraction_, + movingMean, + movingVar, + eps_, + savedMean, + savedInvVar); + } else { + // used movingMean and movingVar in testing + if (batchSize <= 1024) { + hl_batch_norm_forward_inference(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + movingMean, + movingVar, + eps_); + } else { + // There is a limitation in cudnn library. + // When the batch size is larger than 1024 in cuDNN v5.1, + // the cudnnBatchNormalizationForwardInference will fail. 
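+ // Fall back to Paddle's own CUDA kernel (hl_batch_norm.h), which applies the moving mean/variance directly and does not have this batch-size limit.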
+ hl_batch_norm_cuda_inference(input, + output, + gamma, + beta, + movingMean, + movingVar, + eps_, + batchSize, + channels_, + imageH_ * imageD_, + imageW_); + } + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + real* input = getInputValue(0)->getData(); + real* outGrad = getOutputGrad()->getData(); + real* inGrad = getInputGrad(0)->getData(); + real* gamma = weight_->getW()->getData(); + real* savedMean = savedMean_->getData(); + real* savedInvVar = savedInvVar_->getData(); + + // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON. + eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast(epsilon_)); + + auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) { + Matrix::resizeOrCreate(m, h, w, false, true); + m->zeroMem(); + *p = m->getData(); + }; + + real* gammaGrad = nullptr; + real* betaGrad = nullptr; + if (weight_->getWGrad()) { + gammaGrad = weight_->getWGrad()->getData(); + } else { + create(tmpWGrad_, 1, channels_, &gammaGrad); + } + if (biases_ && biases_->getWGrad()) { + betaGrad = biases_->getWGrad()->getData(); + } else { + create(tmpBiasGrad_, 1, channels_, &betaGrad); + } + + hl_batch_norm_backward(ioDesc_, + input, + ioDesc_, + outGrad, + ioDesc_, + inGrad, + bnParamDesc_, + gamma, + gammaGrad, + betaGrad, + eps_, + savedMean, + savedInvVar); + + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + biases_->getParameterPtr()->incUpdate(callback); + weight_->getParameterPtr()->incUpdate(callback); + } +} + +CudnnBatchNormLayer::~CudnnBatchNormLayer() { + hl_destroy_tensor_descriptor(ioDesc_); + hl_destroy_tensor_descriptor(bnParamDesc_); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..3b33b983b31173ab941df5f2e66eac51aabc6315 --- /dev/null +++ b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "BatchNormBaseLayer.h" +#include "Layer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment. + * @note Cudnn version must >= v4.0, and better to use the latest version + * (v5.1). + * + * The config file api is batch_norm_layer. + */ + +class CudnnBatchNormLayer : public BatchNormBaseLayer { + public: + explicit CudnnBatchNormLayer(const LayerConfig& config) + : BatchNormBaseLayer(config) {} + + ~CudnnBatchNormLayer(); + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + /** + * reshape tensor of ioDesc_. 
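+ * Called at the beginning of every forward pass so the descriptor always matches the current batch size.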
+ */ + void reshape(int batchSize); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + /// Epsilon value used in the batch normalization formula. + /// Same epsilon value should be used in forward and backward functions. + double eps_; + + /// Input/output tensor descriptor desc + hl_tensor_descriptor ioDesc_; + /// Shared tensor descriptor desc for the 6 tenros: + /// bnScale, bnBias, running mean/var, save_mean/var + hl_tensor_descriptor bnParamDesc_; + + /** + * @brief The gradient of weight and bias in cudnn api can not be empty. + * If set is_static for weight or bias, it will not allocate memory for them, + * and the gradient is NULL. In this case, will use two matrix. + */ + MatrixPtr tmpWGrad_, tmpBiasGrad_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9353cca9c83bd90a454b2be56dc08b8eadee0bf7 --- /dev/null +++ b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp @@ -0,0 +1,135 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CudnnConvBaseLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { +REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer); +REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer); + +bool CudnnConvBaseLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; + CHECK(useGpu_) << "CudnnConvLayer only support gpu"; + + CHECK_EQ(inputLayers_.size(), parameters_.size()); + projections_.reserve(inputLayers_.size()); + projConf_.reserve(inputLayers_.size()); + + numFilters_ = config_.num_filters(); + CHECK(config_.shared_biases()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + ProjectionConfig *conf = new ProjectionConfig(); + if (isDeconv_) { + conf->set_type("convt"); + } else { + conf->set_type("conv"); + } + conf->set_num_filters(numFilters_); + ConvConfig *convConf = conf->mutable_conv_conf(); + *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); + conf->set_input_size(getPrev(i)->getSize()); + conf->set_output_size(getSize()); + projConf_.emplace_back(conf); + projections_.emplace_back( + Projection::create(*projConf_[i], parameters_[i], useGpu_)); + + // create a new weight + size_t height, width; + height = filterPixels_[i] * filterChannels_[i]; + width = (!isDeconv_) ? 
numFilters_ : channels_[i]; + CHECK_EQ(parameters_[i]->getSize(), width * height); + Weight *w = new Weight(height, width, parameters_[i]); + weights_.emplace_back(w); + } + + if (biasParameter_.get()) { + if (sharedBiases_) { + CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); + biases_ = + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); + } else { + biases_ = + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); + } + } + if (biases_.get() && sharedBiases_) { + hl_create_tensor_descriptor(&biasDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1); + } + + return true; +} + +void CudnnConvBaseLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInput(0).getBatchSize(); + resetOutput(batchSize, calOutputSize()); + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + projections_[i]->forward(&getInput(i), &getOutput(), passType); + } + + if (biases_) { + REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + int outH = outputH_[0]; + int outW = outputW_[0]; + + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_, + outH, + outW, + numFilters_ * outH * outW, + outH * outW, + outW, + 1); + real *outData = getOutputValue()->getData(); + real *biasData = biases_->getW()->getData(); + hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData); + } + + forwardActivation(); +} + +void CudnnConvBaseLayer::backward(const UpdateCallback &callback) { + backwardActivation(); + + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); + real *biasGrad = biases_->getWGrad()->getData(); + real *outGrad = getOutputGrad()->getData(); + hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); + + biases_->getParameterPtr()->incUpdate(callback); + } + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + projections_[i]->backward(callback); + } +} + +CudnnConvBaseLayer::~CudnnConvBaseLayer() { + if (biases_) { + hl_destroy_tensor_descriptor(biasDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..d050183eb7838bed803995985383e0ee4e9731a1 --- /dev/null +++ b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "ConvBaseLayer.h" +#include "Projection.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A 2-dimension conv layer implemented by cuDNN. It only + * supports GPU mode. We automatic select CudnnConvLayer for GPU + * mode and ExpandConvLayer for CPU mode if you set type of "conv". 
+ * User also can specfiy type of "exconv" or "cudnn_conv" for + * particular type. + * + * The config file api is img_conv_layer. + */ +class CudnnConvBaseLayer : public ConvBaseLayer { + protected: + std::vector> projConf_; + std::vector> projections_; + + hl_tensor_descriptor biasDesc_; + hl_tensor_descriptor outputDesc_; + + public: + explicit CudnnConvBaseLayer(const LayerConfig& config) + : ConvBaseLayer(config) {} + + ~CudnnConvBaseLayer(); + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c790dfd71efbee1a2a0afa69e6c336c4330737d0 --- /dev/null +++ b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "CudnnPoolLayer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +bool CudnnPoolLayer::typeCheck(const std::string &poolType, + hl_pooling_mode_t *mode) { + if (poolType == "cudnn-max-pool") { + if (mode) { + *mode = HL_POOLING_MAX; + } + } else if (poolType == "cudnn-avg-pool") { + if (mode) { + *mode = HL_POOLING_AVERAGE; + } + } else if (poolType == "cudnn-avg-incl-pad-pool") { + if (mode) { + *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING; + } + } else { + return false; + } + + return true; +} + +CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) { + const std::string &pool_type = config.inputs(0).pool_conf().pool_type(); + CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true); +} + +bool CudnnPoolLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + PoolLayer::init(layerMap, parameterMap); + + CHECK(useGpu_) << "CudnnPoolLayer only support gpu"; + + hl_create_tensor_descriptor(&inputDesc_); + hl_create_tensor_descriptor(&outputDesc_); + + windowHeight = sizeY_; + windowWidth = sizeX_; + heightPadding = confPaddingY_; + widthPadding = confPadding_; + strideHeight = strideY_; + strideWidth = stride_; + + hl_create_pooling_descriptor(&poolingDesc_, + mode_, + windowHeight, + windowWidth, + heightPadding, + widthPadding, + strideHeight, + strideWidth); + + return true; +} + +void CudnnPoolLayer::reshape(int batchSize) { + imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imageH_ == 0) { + imageH_ = imgSizeY_; + } + if (imageW_ == 0) { + imageW_ = imgSize_; + } + CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(), + channels_ * imageH_ * imageW_); + outputH_ = outputSize(imageH_, + sizeY_, + confPaddingY_, + strideY_, + /* caffeMode */ false); + outputW_ = + outputSize(imageW_, sizeX_, confPadding_, stride_, 
/* caffeMode */ false); + getOutput().setFrameHeight(outputH_); + getOutput().setFrameWidth(outputW_); + + hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_); + hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_); +} + +void CudnnPoolLayer::forward(PassType passType) { + Layer::forward(passType); + + CHECK(inputLayers_[0]->getOutputValue()->useGpu()); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + reshape(batchSize); + resetOutput(batchSize, outputH_ * outputW_ * channels_); + + real *inputData = getInputValue(0)->getData(); + real *outData = getOutputValue()->getData(); + hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_); +} + +void CudnnPoolLayer::backward(const UpdateCallback &callback) { + (void)callback; + if (NULL == getInputGrad(0)) { + return; + } + + real *inputData = getInputValue(0)->getData(); + real *inputGrad = getInputGrad(0)->getData(); + real *outData = getOutputValue()->getData(); + real *outGrad = getOutputGrad()->getData(); + hl_pooling_backward(inputDesc_, + inputData, + inputGrad, + outputDesc_, + outData, + outGrad, + poolingDesc_); +} + +CudnnPoolLayer::~CudnnPoolLayer() { + hl_destroy_tensor_descriptor(inputDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_pooling_descriptor(poolingDesc_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/legacy/gserver/layers/CudnnPoolLayer.h similarity index 100% rename from paddle/gserver/layers/CudnnPoolLayer.h rename to paddle/legacy/gserver/layers/CudnnPoolLayer.h diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/legacy/gserver/layers/DataLayer.cpp similarity index 100% rename from paddle/gserver/layers/DataLayer.cpp rename to paddle/legacy/gserver/layers/DataLayer.cpp diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/legacy/gserver/layers/DataLayer.h similarity index 100% rename from paddle/gserver/layers/DataLayer.h rename to paddle/legacy/gserver/layers/DataLayer.h diff --git a/paddle/legacy/gserver/layers/DataNormLayer.cpp b/paddle/legacy/gserver/layers/DataNormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6820dfa4d4dcf90b2318a190ad4cc082c26fc180 --- /dev/null +++ b/paddle/legacy/gserver/layers/DataNormLayer.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "DataNormLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(data_norm, DataNormLayer); + +bool DataNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize the weight */ + CHECK(!biasParameter_) << "DataNormLayer does not need bias"; + CHECK(inputLayers_.size() == 1 && inputLayers_[0]->getType() == "data") + << "DataNormLayer accepts one and only one DataLayer as its input layer"; + CHECK_EQ(inputLayers_.size(), parameters_.size()); + CHECK_EQ(inputLayers_[0]->getSize(), getSize()); + CHECK_EQ(parameters_[0]->getSize(), 5 * getSize()); + CHECK(parameters_[0]->isStatic()) + << "The parameter of DataNormLayer must be static"; + + weight_ = std::unique_ptr(new Weight(5, getSize(), parameters_[0])); + min_ = Matrix::create( + nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_); + rangeReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + mean_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + stdReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + decimalReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + min_->setData(weight_->getW()->getData()); + rangeReciprocal_->setData(weight_->getW()->getData() + getSize()); + mean_->setData(weight_->getW()->getData() + 2 * getSize()); + stdReciprocal_->setData(weight_->getW()->getData() + 3 * getSize()); + decimalReciprocal_->setData(weight_->getW()->getData() + 4 * getSize()); + + /* normalization strategy */ + if (config_.data_norm_strategy() == "z-score") { + mode_ = kZScore; + } else if (config_.data_norm_strategy() == "min-max") { + mode_ = kMinMax; + } else if (config_.data_norm_strategy() == "decimal-scaling") { + mode_ = kDecimalScaling; + } else { + LOG(FATAL) << "Unknown data normalization strategy: " + << config_.data_norm_strategy(); + } + + return true; +} + +void DataNormLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + reserveOutput(batchSize, size); + + const MatrixPtr inValue = getInputValue(0); + MatrixPtr outValue = getOutputValue(); + outValue->copyFrom(*inValue); + switch (mode_) { + case kZScore: { + outValue->addBias(*mean_, -1.0); + outValue->colScale(0, *outValue, *stdReciprocal_); + break; + } + case kMinMax: { + outValue->addBias(*min_, -1.0); + outValue->colScale(0, *outValue, *rangeReciprocal_); + break; + } + case kDecimalScaling: { + outValue->colScale(0, *outValue, *decimalReciprocal_); + break; + } + default: + LOG(FATAL) << "should not reach here"; + } +} + +void DataNormLayer::backward(const UpdateCallback& callback) { + // The parameter for DataNormLayer is static, and does not need to be updated + (void)callback; + + /* Calculate the input layers error */ + const MatrixPtr& outGrad = getOutputGrad(); + MatrixPtr inGrad = getInputGrad(0); + if (inGrad) { + switch (mode_) { + case kZScore: { + inGrad->addColScale(0, *outGrad, *stdReciprocal_); + break; + } + case kMinMax: { + inGrad->addColScale(0, *outGrad, *rangeReciprocal_); + break; + } + case kDecimalScaling: { + inGrad->addColScale(0, *outGrad, *decimalReciprocal_); + break; + } + default: { 
LOG(FATAL) << "should not reach here"; } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/DataNormLayer.h b/paddle/legacy/gserver/layers/DataNormLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..7bb8e928248355cb7ae78dc16e467b77a42e02fc --- /dev/null +++ b/paddle/legacy/gserver/layers/DataNormLayer.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * @brief A layer for data normalization + * - Input: One and only one input layer is accepted. The input layer must + * be DataLayer with dense data type. + * - Output: The normalization of the input data + * + * Reference: + * LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine + * + * Three data normalization methoeds are considered + * - z-score: y = (x-mean)/std + * - min-max: y = (x-min)/(max-min) + * - decimal-scaling: y = x/10^j, where j is the smallest integer such that + *max(|y|)<1 + */ + +class DataNormLayer : public Layer { + public: + enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 }; + + explicit DataNormLayer(const LayerConfig& config) : Layer(config) {} + + ~DataNormLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + int mode_; + std::unique_ptr weight_; + MatrixPtr min_; + MatrixPtr rangeReciprocal_; // 1/(max-min) + MatrixPtr mean_; + MatrixPtr stdReciprocal_; // 1/std + MatrixPtr decimalReciprocal_; // 1/10^j +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.cpp b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2cd635564c4cd9f42d27cd58694cff381d1ce224 --- /dev/null +++ b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "DeConv3DLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(deconv3d, DeConv3DLayer); + +bool DeConv3DLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; + // for Deconv, the dimension of Kernel is + // channel * output * depth * height * weigth + // Matrix storage format: (output * depth * height * weigth) x channel + for (int index = 0; index < config_.inputs().size(); ++index) { + M_.push_back(filterChannels_[index]); + K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index])); + + // create a new weight + size_t height, width; + height = filterPixels_[index] * numFilters_; + width = filterChannels_[index]; + CHECK_EQ(parameters_[index]->getSize(), width * height); + Weight *w = new Weight(height, width, parameters_[index]); + weights_.emplace_back(w); + } + if (biasParameter_.get()) { + if (sharedBiases_) { + CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); + biases_ = + std::unique_ptr(new Weight(numFilters_, 1, biasParameter_)); + } else { + biases_ = + std::unique_ptr(new Weight(getSize(), 1, biasParameter_)); + } + } + return true; +} + +size_t DeConv3DLayer::getSize() { + CHECK_NE(inputLayers_.size(), 0UL); + imgSizeW_.clear(); + imgSizeH_.clear(); + imgSizeD_.clear(); + N_.clear(); + NOut_.clear(); + size_t layerSize = 0; + for (size_t i = 0; i < inputLayers_.size(); ++i) { + imgSizeW_.push_back( + imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true)); + imgSizeH_.push_back(imageSize( + outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true)); + imgSizeD_.push_back(imageSize( + outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true)); + NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]); + N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]); + CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize); + layerSize += NOut_[i] * numFilters_; + } + getOutput().setFrameHeight(imgSizeH_[0]); + getOutput().setFrameWidth(imgSizeW_[0]); + getOutput().setFrameDepth(imgSizeD_[0]); + return layerSize; +} + +void DeConv3DLayer::forward(PassType passType) { + Layer::forward(passType); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + int outWidth = getSize(); + resetOutput(batchSize, outWidth); + const MatrixPtr outMat = getOutputValue(); + + REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str()); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + const MatrixPtr &inMat = getInputValue(i); + int M = M_[i]; + int N = N_[i]; + int K = K_[i]; + MatrixPtr wMat = weights_[i]->getW(); + Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); + for (int n = 0; n < batchSize; ++n) { + real *inData = inMat->getData() + n * inMat->getStride(); + for (int g = 0; g < groups_[i]; ++g) { + MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_); + MatrixPtr wMatSub = wMat->subMatrix(g * K, K); + MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K); + colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0); + inData += M * N; + } + colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(), + numFilters_, + imgSizeD_[i], + imgSizeH_[i], + imgSizeW_[i], + filterSizeZ_[i], + filterSizeY_[i], + filterSize_[i], + strideZ_[i], + strideY_[i], + stride_[i], + paddingZ_[i], + paddingY_[i], + padding_[i], + 1.0, + 1.0); + } + } + if (nullptr != this->biasParameter_) { + this->addBias(); + } + forwardActivation(); +} + +void 
DeConv3DLayer::backward(const UpdateCallback &callback) { + backwardActivation(); + int batchSize = getOutputGrad()->getHeight(); + if (biases_ && biases_->getWGrad()) { + bpropBiases(); + biases_->getParameterPtr()->incUpdate(callback); + } + REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str()); + for (size_t i = 0; i < inputLayers_.size(); ++i) { + if (weights_[i]->getWGrad() || this->needGradient_) { + int M = M_[i]; + int N = N_[i]; + int K = K_[i]; + Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); + const MatrixPtr &inMat = getInputValue(i); + for (int n = 0; n < batchSize; ++n) { + colBuf_->vol2Col( + getOutputGrad()->getData() + n * getOutputGrad()->getStride(), + numFilters_, + imgSizeD_[i], + imgSizeH_[i], + imgSizeW_[i], + filterSizeZ_[i], + filterSizeY_[i], + filterSize_[i], + strideZ_[i], + strideY_[i], + stride_[i], + paddingZ_[i], + paddingY_[i], + padding_[i]); + if (weights_[i]->getWGrad()) { + real *inData = inMat->getData() + n * inMat->getStride(); + for (int g = 0; g < groups_[i]; ++g) { + MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K); + MatrixPtr wGradMatSub = + weights_[i]->getWGrad()->subMatrix(g * K, K); + MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_); + wGradMatSub->mul( + *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0); + inData += M * N; + } + } + if (getInputGrad(i)) { + real *preGrad = + getInputGrad(i)->getData() + n * getInputGrad(i)->getStride(); + for (int g = 0; g < groups_[i]; ++g) { + MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K); + MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K); + MatrixPtr inGradMatSub = + Matrix::create(preGrad, M, N, false, useGpu_); + inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0); + preGrad += M * N; + } + } + } + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } +} +void DeConv3DLayer::bpropWeights(int i) {} +void DeConv3DLayer::bpropData(int i) {} + +void DeConv3DLayer::bpropBiases() { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); + const MatrixPtr &outGradMat = getOutputGrad(); + + if (this->sharedBiases_) { + biases->collectSharedBias(*outGradMat, 1.0f); + } else { + biases->collectBias(*outGradMat, 1.0f); + } +} + +void DeConv3DLayer::addBias() { + MatrixPtr outMat = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); + if (this->sharedBiases_) { + outMat->addSharedBias(*(bias), 1.0f); + } else { + outMat->addBias(*(bias), 1.0f); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.h b/paddle/legacy/gserver/layers/DeConv3DLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..9931bccb1284111e299206883847045edaae4ded --- /dev/null +++ b/paddle/legacy/gserver/layers/DeConv3DLayer.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "ConvBaseLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A subclass of deconvolution3D layer. + * This layer expands input and use matrix multiplication to + * calculate deconvolution3D operation. + */ +class DeConv3DLayer : public ConvBaseLayer { + public: + explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {} + ~DeConv3DLayer() {} + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void addBias(); + void backward(const UpdateCallback& callback); + void bpropBiases(); + void bpropData(int i); + void bpropWeights(int i); + size_t getSize(); + + protected: + // Figure out the dimensions for individual gemms. + IntV M_; /// numFilters_ / filter_group_; + IntV N_; /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_ + IntV K_; /// outputD_ * outputH_ * outputW_ + IntV NOut_; + MatrixPtr colBuf_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp similarity index 100% rename from paddle/gserver/layers/DetectionOutputLayer.cpp rename to paddle/legacy/gserver/layers/DetectionOutputLayer.cpp diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/legacy/gserver/layers/DetectionOutputLayer.h similarity index 100% rename from paddle/gserver/layers/DetectionOutputLayer.h rename to paddle/legacy/gserver/layers/DetectionOutputLayer.h diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/legacy/gserver/layers/DetectionUtil.cpp similarity index 100% rename from paddle/gserver/layers/DetectionUtil.cpp rename to paddle/legacy/gserver/layers/DetectionUtil.cpp diff --git a/paddle/legacy/gserver/layers/DetectionUtil.h b/paddle/legacy/gserver/layers/DetectionUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..c1e0bb809ad290613159f558e9b1860476b3b5f2 --- /dev/null +++ b/paddle/legacy/gserver/layers/DetectionUtil.h @@ -0,0 +1,307 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/legacy/math/Matrix.h" + +using std::vector; +using std::pair; +using std::map; + +namespace paddle { + +template +struct BBoxBase { + BBoxBase(T xMin, T yMin, T xMax, T yMax) + : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {} + + BBoxBase() {} + + T getWidth() const { return xMax - xMin; } + + T getHeight() const { return yMax - yMin; } + + T getCenterX() const { return (xMin + xMax) / 2; } + + T getCenterY() const { return (yMin + yMax) / 2; } + + T getArea() const { return getWidth() * getHeight(); } + + // coordinate of bounding box + T xMin; + T yMin; + T xMax; + T yMax; + // whether difficult object (e.g. 
object with heavy occlusion is difficult) + bool isDifficult; +}; + +struct NormalizedBBox : BBoxBase { + NormalizedBBox() : BBoxBase() {} +}; + +enum PermMode { kNCHWToNHWC, kNHWCToNCHW }; + +/** + * @brief First permute input maxtrix then append to output matrix + */ +size_t appendWithPermute(const Matrix& inMatrix, + size_t height, + size_t width, + size_t outTotalSize, + size_t outOffset, + size_t batchSize, + Matrix& outMatrix, + PermMode permMode); + +/** + * @brief First permute input maxtrix then decompose to output + */ +size_t decomposeWithPermute(const Matrix& inMatrix, + size_t height, + size_t width, + size_t totalSize, + size_t offset, + size_t batchSize, + Matrix& outMatrix, + PermMode permMode); + +/** + * @brief Compute jaccard overlap between two bboxes. + * @param bbox1 The first bbox + * @param bbox2 The second bbox + */ +real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); + +/** + * @brief Compute offset parameters between prior bbox and ground truth bbox + * and variances of prior bbox are considered + * @param priorBBox Input prior bbox + * @param priorBBoxVar Variance parameters of prior bbox + * @param gtBBox Groundtruth bbox + * @param outVec Output vector + */ +void encodeBBoxWithVar(const NormalizedBBox& priorBBox, + const vector& priorBBoxVar, + const NormalizedBBox& gtBBox, + vector& outVec); + +/** + * @brief Decode prior bbox with offset parameters + * and variances of prior bbox are considered + * @param priorBBox Prior bbox to be decoded + * @param priorBBoxVar Variance parameters of prior bbox + * @param locPredData Offset parameters + */ +NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox, + const vector& priorBBoxVar, + const vector& locPredData); + +/** + * @brief Extract bboxes from prior matrix, the layout is + * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... + * @param priorData Matrix of prior value + * @param numBBoxes Number of bbox to be extracted + * @param bboxVec Append to the vector + */ +void getBBoxFromPriorData(const real* priorData, + const size_t numBBoxes, + vector& bboxVec); + +/** + * @brief Extract labels, scores and bboxes from detection matrix, the layout is + * imageId | label | score | xmin | ymin | xmax | ymax + * @param detectData Matrix of detection value + * @param numBBoxes Number of bbox to be extracted + * @param labelVec Label of bbox + * @param scoreVec Score of bbox + * @param bboxVec Append to the vector + */ +void getBBoxFromDetectData(const real* detectData, + const size_t numBBoxes, + vector& labelVec, + vector& scoreVec, + vector& bboxVec); + +/** + * @brief Extract variances from prior matrix, the layout is + * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... + * @param priorData Matrix of prior value + * @param num Number to be extracted + * @param varVec Append to the vector + */ +void getBBoxVarFromPriorData(const real* priorData, + const size_t num, + vector>& varVec); + +/** + * @brief Extract bboxes from label matrix, the layout is + * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ... + * @param labelData Matrix of label value + * @param numBBoxes Number to be extracted + * @param bboxVec Append to the vector + */ +void getBBoxFromLabelData(const real* labelData, + const size_t numBBoxes, + vector& bboxVec); + +/** +* @brief Match prior bbox to groundtruth bbox, the strategy is: +1. Find the most overlaped bbox pair (prior and groundtruth) +2. 
For rest of prior bboxes find the most overlaped groundtruth bbox +* @param priorBBoxes prior bbox +* @param gtBBoxes groundtruth bbox +* @param overlapThreshold Low boundary of overlap (judge whether matched) +* @param matchIndices For each prior bbox, groundtruth bbox index if matched +otherwise -1 +* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes +*/ +void matchBBox(const vector& priorBBoxes, + const vector& gtBBoxes, + real overlapThreshold, + vector* matchIndices, + vector* matchOverlaps); + +/** +* @brief Generate positive bboxes and negative bboxes, +|positive bboxes|/|negative bboxes| is negPosRatio +* @param priorValue Prior value +* @param numPriorBBoxes Number of prior bbox +* @param gtValue Groundtruth value +* @param gtStartPosPtr Since groundtruth value stored as sequence type, +this parameter indicates start position of each record +* @param seqNum Number of sequence +* @param maxConfScore Classification score for prior bbox, used to mine +negative examples +* @param batchSize Image number +* @param overlapThreshold Low boundary of overap +* @param negOverlapThreshold Upper boundary of overap (judge negative example) +* @param negPosRatio Control number of negative bboxes +* @param matchIndicesVecPtr Save indices of matched prior bbox +* @param negIndicesVecPtr Save indices of negative prior bbox +*/ +pair generateMatchIndices( + const Matrix& priorValue, + const size_t numPriorBBoxes, + const Matrix& gtValue, + const int* gtStartPosPtr, + const size_t seqNum, + const vector>& maxConfScore, + const size_t batchSize, + const real overlapThreshold, + const real negOverlapThreshold, + const size_t negPosRatio, + vector>* matchIndicesVecPtr, + vector>* negIndicesVecPtr); + +/** + * @brief Get max confidence score for each prior bbox + * @param confData Confidence scores, layout is + * class1 score | class2 score | ... | classN score ... 
+ * @param batchSize Image number + * @param numPriorBBoxes Prior bbox number + * @param numClasses Classes number + * @param backgroundId Background id + * @param maxConfScoreVecPtr Ouput + */ +void getMaxConfidenceScores(const real* confData, + const size_t batchSize, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t backgroundId, + vector>* maxConfScoreVecPtr); + +template +bool sortScorePairDescend(const pair& pair1, + const pair& pair2); + +template <> +bool sortScorePairDescend(const pair& pair1, + const pair& pair2); + +/** + * @brief Do NMS for bboxes to remove duplicated bboxes + * @param bboxes BBoxes to apply NMS + * @param confScoreData Confidence scores + * @param classIdx Class to do NMS + * @param topK Number to keep + * @param confThreshold Low boundary of confidence score + * @param nmsThreshold Threshold of overlap + * @param numPriorBBoxes Total number of prior bboxes + * @param numClasses Total class number + * @param indices Indices of high quality bboxes + */ +void applyNMSFast(const vector& bboxes, + const real* confScoreData, + size_t classIdx, + size_t topK, + real confThreshold, + real nmsThreshold, + size_t numPriorBBoxes, + size_t numClasses, + vector* indices); + +/** + * @brief Get detection results which satify requirements + * @param numPriorBBoxes Prior bbox number + * @param numClasses Class number + * @param backgroundId Background class + * @param batchSize Image number + * @param confThreshold Threshold of class confidence + * @param nmsTopK Used in NMS operation to keep top k bbox + * @param nmsThreshold Used in NMS, threshold of overlap + * @param keepTopK How many bboxes keeped in an image + * @param allDecodedBBoxes Decoded bboxes for all images + * @param allDetectionIndices Save detection bbox indices + */ +size_t getDetectionIndices( + const real* confData, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t backgroundId, + const size_t batchSize, + const real confThreshold, + const size_t nmsTopK, + const real nmsThreshold, + const size_t keepTopK, + const vector>& allDecodedBBoxes, + vector>>* allDetectionIndices); + +/** + * @brief Get detection results + * @param confData Confidence scores + * @param numPriorBBoxes Prior bbox number + * @param numClasses Class number + * @param batchSize Image number + * @param allIndices Indices of predicted bboxes + * @param allDecodedBBoxes BBoxes decoded + * @param out Output matrix + * image number | label | confidence score | xMin | yMin | xMax | yMax + */ +void getDetectionOutput(const real* confData, + const size_t numKept, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t batchSize, + const vector>>& allIndices, + const vector>& allDecodedBBoxes, + Matrix& out); + +NormalizedBBox clipBBox(const NormalizedBBox& bbox); + +} // namespace paddle diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/legacy/gserver/layers/DotMulOperator.cpp similarity index 100% rename from paddle/gserver/layers/DotMulOperator.cpp rename to paddle/legacy/gserver/layers/DotMulOperator.cpp diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/legacy/gserver/layers/DotMulProjection.cpp similarity index 100% rename from paddle/gserver/layers/DotMulProjection.cpp rename to paddle/legacy/gserver/layers/DotMulProjection.cpp diff --git a/paddle/legacy/gserver/layers/DotProdLayer.cpp b/paddle/legacy/gserver/layers/DotProdLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06060d93f76c18d893852a5f5c99c36fe5641b2e --- 
/dev/null +++ b/paddle/legacy/gserver/layers/DotProdLayer.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for computing the dot product of two vectors. + * Input1: vector (batchSize * dim) + * Input2: vector (batchSize * dim) + * Output: a matrix: (batchSize * 1) + */ + +class DotProdLayer : public Layer { + public: + explicit DotProdLayer(const LayerConfig& config) : Layer(config) {} + + ~DotProdLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(dot_prod, DotProdLayer); + +bool DotProdLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + CHECK_EQ(1UL, getSize()) + << "The output dimensionality of this layer should be fixed to 1."; + + return true; +} + +void DotProdLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + CHECK_EQ(inV1->getHeight(), batchSize); + CHECK_EQ(inV0->getWidth(), inV1->getWidth()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, 1); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str()); + outV->sumOfProducts(*inV0, *inV1, 1, 0); + } +} + +void DotProdLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + { + REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str()); + + if (inG0) { + inG0->addRowScale(0, *inV1, *outG); + } + + if (inG1) { + inG1->addRowScale(0, *inV0, *outG); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..38671126c62ba36e22496dcbe1ff3c8d6dcea742 --- /dev/null +++ b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { +/** + * A layer for checking EOS for each sample: + * - output_id = (input_id == conf.eos_id) + * + * The result is stored in output_.ids. + * It is used by recurrent layer group. + */ +class EosIdCheckLayer : public Layer { + public: + explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + bool ret = Layer::init(layerMap, parameterMap); + CHECK_EQ(1UL, inputLayers_.size()); + return ret; + } + + void forward(PassType passType) override { + Layer::forward(passType); + + const Argument& input = getInput(0); + IVector::resizeOrCreate(output_.ids, input.ids->getSize(), useGpu_); + output_.ids->isEqualTo(*input.ids, config_.eos_id()); + } + + void backward(const UpdateCallback& callback) override {} +}; + +REGISTER_LAYER(eos_id, EosIdCheckLayer); + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.cpp b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8a53db380686cea2ad121c948c45a0fa1154381e --- /dev/null +++ b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp @@ -0,0 +1,248 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ExpandConvLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +DEFINE_bool(use_nnpack, + false, + "Whether to use nnpack for convolution calculation."); + +namespace paddle { + +/* + * The calculation of the exconvt(convolution transpose (deconv) operation) + * is a swap of forward and backward of the calculation of exconv. + * */ +REGISTER_LAYER(exconv, ExpandConvLayer); +REGISTER_LAYER(exconvt, ExpandConvLayer); + +inline bool isDepthwiseConv(int channels, int groups) { + return channels == groups; +} + +bool ExpandConvLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + /* Initialize the basic convolutional parent class */ + ConvBaseLayer::init(layerMap, parameterMap); + + int index = 0; + for (auto &inputConfig : config_.inputs()) { + const ConvConfig &conf = inputConfig.conv_conf(); + /* Consistent caffe mode for multiple input */ + caffeMode_ = conf.caffe_mode(); + + // create a new weight + size_t height, width; + height = filterPixels_[index] * filterChannels_[index]; + width = (!isDeconv_) ? 
numFilters_ : channels_[index]; + CHECK_EQ(parameters_[index]->getSize(), width * height); + Weight *w = new Weight(height, width, parameters_[index]); + weights_.emplace_back(w); + index++; + } + + if (biasParameter_.get()) { + if (sharedBiases_) { + CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); + biases_ = std::unique_ptr( + new Weight(1, numFilters_, biasParameter_, 0)); + } else { + biases_ = + std::unique_ptr(new Weight(1, getSize(), biasParameter_, 0)); + } + } + + getOutputSize(); + + size_t numInputs = config_.inputs_size(); + inputShape_.resize(numInputs); + filterShape_.resize(numInputs); + outputShape_.resize(numInputs); + + std::string convType; + std::string convGradInputType; + std::string convGradFilterType; + + for (int i = 0; i < config_.inputs_size(); i++) { + std::vector paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; + std::vector strides = {(size_t)strideY_[i], (size_t)stride_[i]}; + std::vector dilations = {(size_t)dilationY_[i], + (size_t)dilation_[i]}; + + bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1); + + // Convolution Layer uses the GemmConv function by default. + convType = "GemmConv"; + convGradInputType = "GemmConvGradInput"; + convGradFilterType = "GemmConvGradFilter"; + + // If depth wise convolution and useGpu == true + if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { + convType = "DepthwiseConv"; + convGradInputType = "DepthwiseConvGradInput"; + convGradFilterType = "DepthwiseConvGradFilter"; + } + + // If depth wise convolution and useGpu == false and ARM-NEON + if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) { +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + if ((filterSize_[i] == filterSizeY_[i]) && + (filterSize_[i] == 3 || filterSize_[i] == 4) && + (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) && + !useDilation) { + convType = "NeonDepthwiseConv"; + } +#endif + } + + if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) { + createFunction(forward_, + "NNPACKConv", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i]) + .set("algo", std::string("auto"))); + } else { + createFunction(forward_, + !isDeconv_ ? convType : convGradInputType, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("dilations", dilations) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + !isDeconv_ ? convGradInputType : convType, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("dilations", dilations) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + convGradFilterType, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("dilations", dilations) + .set("groups", (size_t)groups_[i])); + } + } + return true; +} + +size_t ExpandConvLayer::getOutputSize() { + CHECK_NE(inputLayers_.size(), 0UL); + size_t layerSize = ConvBaseLayer::calOutputSize(); + return layerSize; +} + +// i is the index of input layers +#define BACKWARD_INPUT(i, inputs, outputs) \ + backward_[2 * i]->calc(inputs, outputs) +#define BACKWARD_FILTER(i, inputs, outputs) \ + backward_[2 * i + 1]->calc(inputs, outputs) + +void ExpandConvLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + resetOutput(batchSize, getOutputSize()); + + // Calculate the shape of the input, output, and filter. 
+ for (size_t i = 0; i < inputLayers_.size(); ++i) { + inputShape_[i] = TensorShape({(size_t)batchSize, + (size_t)channels_[i], + (size_t)imgSizeH_[i], + (size_t)imgSizeW_[i]}); + filterShape_[i] = + TensorShape({(size_t)groups_[i], + !isDeconv_ ? (size_t)numFilters_ / groups_[i] + : (size_t)channels_[i] / groups_[i], + !isDeconv_ ? (size_t)channels_[i] / groups_[i] + : (size_t)numFilters_ / groups_[i], + (size_t)filterSizeY_[i], + (size_t)filterSize_[i]}); + outputShape_[i] = TensorShape({(size_t)batchSize, + (size_t)numFilters_, + (size_t)outputH_[i], + (size_t)outputW_[i]}); + } + + // Calculate the output value. + for (size_t i = 0; i < inputLayers_.size(); ++i) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(i), inputShape_[i]); + inputs.addArg(*weights_[i]->getW(), filterShape_[i]); + outputs.addArg(*getOutputValue(), + outputShape_[i], + !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO); + + forward_[i]->calc(inputs, outputs); + } + + /* add the bias-vector */ + if (biases_.get()) { + output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_); + } + + /* activation */ + forwardActivation(); +} + +void ExpandConvLayer::backward(const UpdateCallback &callback) { + backwardActivation(); + + MatrixPtr outGrad = getOutputGrad(); + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_); + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + // Calculate the input grad and filter grad. + for (size_t i = 0; i < inputLayers_.size(); ++i) { + if (getInputGrad(i)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_[i]); + inputs.addArg(*weights_[i]->getW(), filterShape_[i]); + outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO); + BACKWARD_INPUT(i, inputs, outputs); + } + + if (weights_[i]->getWGrad()) { + BufferArgs inputs; + BufferArgs outputs; + if (!isDeconv_) { + inputs.addArg(*getOutputGrad(), outputShape_[i]); + inputs.addArg(*getInputValue(i), inputShape_[i]); + } else { + inputs.addArg(*getInputValue(i), inputShape_[i]); + inputs.addArg(*getOutputGrad(), outputShape_[i]); + } + outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO); + BACKWARD_FILTER(i, inputs, outputs); + + /* Increasing the number of gradient */ + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.h b/paddle/legacy/gserver/layers/ExpandConvLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..c0eff3ab061949bd583e0deaf121912ed993be76 --- /dev/null +++ b/paddle/legacy/gserver/layers/ExpandConvLayer.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "ConvBaseLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief A subclass of convolution layer. 
+ * This layer expands input and use matrix multiplication to + * calculate convolution operation. + * + * The config file api is img_conv_layer. + */ + +class ExpandConvLayer : public ConvBaseLayer { + public: + explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} + + ~ExpandConvLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + + size_t getOutputSize(); + + protected: + std::vector inputShape_; + std::vector filterShape_; + std::vector outputShape_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ExpandLayer.cpp b/paddle/legacy/gserver/layers/ExpandLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..074fbab8ef9d1453160058031be370e991459fa5 --- /dev/null +++ b/paddle/legacy/gserver/layers/ExpandLayer.cpp @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ExpandLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(expand, ExpandLayer); + +bool ExpandLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_EQ(inputLayers_.size(), 2UL); + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + // which sequence type of input[0] + if (config_.trans_type() == "non-seq") { + type_ = kNonSeq; + } else if (config_.trans_type() == "seq") { + type_ = kSeq; + } else { + LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); + } + setNeedSequenceInfo(false); + return true; +} + +void ExpandLayer::forward(PassType passType) { + Layer::forward(passType); + // Expand layer should have exactly 2 input, one for data, one for size + CHECK_EQ(2U, inputLayers_.size()); + + // using two input: + // * first one for data; + // * second one only for sequence info + const Argument& shapeInput = getInput(1); + const Argument& dataInput = getInput(0); + size_t outputBatchSize = shapeInput.getBatchSize(); + auto startPositions = type_ ? 
shapeInput.subSequenceStartPositions + : shapeInput.sequenceStartPositions; + size_t numSequences = startPositions->getSize() - 1; + const int* starts = startPositions->getData(false); + + CHECK_EQ(starts[numSequences], shapeInput.getBatchSize()); + if (type_) { + // when trans_type = seq, input[1] must hasSubseq + CHECK_EQ(shapeInput.hasSubseq(), 1UL); + CHECK_EQ(dataInput.getNumSequences(), shapeInput.getNumSequences()); + } else { + CHECK_EQ(dataInput.getBatchSize(), shapeInput.getNumSequences()); + } + + // set output sequence info as shape sequence + output_.sequenceStartPositions = shapeInput.sequenceStartPositions; + if (shapeInput.hasSubseq()) { + output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions; + } + + // reserve output: Expand output to batchsize of sequence data. + reserveOutput(outputBatchSize, dataInput.value->getWidth()); + + MatrixPtr inputValue = getInputValue(0); + MatrixPtr outputValue = getOutputValue(); + + ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false); + int* expandStarts = expandStartsPos_->getMutableData(false); + for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { + int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; + for (int j = 0; j < sequenceLength; j++) { + expandStarts[starts[sequenceId] + j] = sequenceId; + } + } + + outputValue->copyByRowIndex(*inputValue, + *expandStartsPos_->getVector(useGpu_)); + + if (biases_.get() != NULL) { + outputValue->addBias(*(biases_->getW()), 1); + } +} + +void ExpandLayer::backward(const UpdateCallback& callback) { + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + if (!getInputGrad(0)) return; + MatrixPtr inputGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions + : getInput(1).sequenceStartPositions; + size_t numSequences = cpuSeqStartPos->getSize() - 1; + const int* starts = cpuSeqStartPos->getData(false); + + CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth()); + CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]); + + AsyncGpuBlock asyncGpuBlock; + + // sum to get the grad + real scale = 1; + for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) { + // TODO(Dangqingqing) optimization for GPU + int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; + if (sequenceLength == 0) { + // empty sequence + continue; + } + MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1); + copyData->collectBias( + *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ExpandLayer.h b/paddle/legacy/gserver/layers/ExpandLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..75a1ec75688cdbc61a117da7d4be47848c30425a --- /dev/null +++ b/paddle/legacy/gserver/layers/ExpandLayer.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * A layer for "Expand Dense data or (sequence data where the length of each + * sequence is one) to sequence data." + * + * It should have exactly 2 input, one for data, one for size: + * - first one for data + * - If ExpandLevel = kNonSeq: dense data + * - If ExpandLevel = kSeq: sequence data where the length of each sequence is + * one + * - second one only for sequence info + * - should be sequence data with or without sub-sequence. + * + * And the output size is the batch size(not instances) of second input. + * + * The config file api is expand_layer. + */ + +class ExpandLayer : public Layer { + protected: + std::unique_ptr biases_; + /// if input[0] is dense data, ExpandLevel=kNonSeq; + /// if input[0] is sequence data, ExpandLevel=kSeq + enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; + /// store the ExpandLevel + int type_; + /// expanded sequenceStartPositions or subSequenceStartPositions + /// of input[1] + ICpuGpuVectorPtr expandStartsPos_; + + public: + explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} + + ~ExpandLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6cf269fa3ffb3f4a2864aea4225d9401930e73b1 --- /dev/null +++ b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "FactorizationMachineLayer.h" +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); + +bool FactorizationMachineLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + factorSize_ = config_.factor_size(); + + /* initialize the latentVectors_ */ + CHECK_EQ(inputLayers_.size(), 1UL); + size_t inputSize = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); + latentVectors_ = std::unique_ptr( + new Weight(inputSize, factorSize_, parameters_[0])); + + return true; +} + +void FactorizationMachineLayer::forward(PassType passType) { + Layer::forward(passType); + + const MatrixPtr& inputV = getInputValue(0); + + size_t batchSize = inputV->getHeight(); + size_t outputSize = getSize(); + size_t inputSize = inputLayers_[0]->getSize(); + reserveOutput(batchSize, outputSize); + + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate( + latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + inputMulFactor_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); + + REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); + inputMulFactor_->mul(*inputV, *latentVectors_->getW()); + inputMulFactor_->square2(*tmpOut_); + outV->sumRows(*tmpOut_, 0.5, 0); + + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(inputSquare_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt(), + inputV->getValueType()); + inputSquare_->copyFrom(*inputV); + (dynamic_cast(inputSquare_.get()))->square2(); + } else { + Matrix::resizeOrCreate( + inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + inputV->square2(*inputSquare_); + } + latentVectors_->getW()->square2(*latentVectorsSquare_); + tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); + outV->sumRows(*tmpOut_, -0.5, 1.0); + + /* activation */ { + REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void FactorizationMachineLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + const MatrixPtr& inputV = getInputValue(0); + const MatrixPtr& oGrad = getOutputGrad(); + + Matrix::resizeOrCreate( + tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), + latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); + + /* Calculate the gradients of the latentVectors_ matrix */ + if (latentVectors_->getWGrad()) { + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(tmpInput_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt()); + + CpuSparseMatrix* sparseInputV = + dynamic_cast(inputV.get()); + CpuSparseMatrix* sparseInputSquare = + dynamic_cast(inputSquare_.get()); + CpuSparseMatrix* sparseTmpInput = + dynamic_cast(tmpInput_.get()); + sparseTmpInput->copyFrom(*sparseInputV); + + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); + latentVectors_->getWGrad()->mul( + *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); + sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); + + Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); + negOnes_->zeroMem(); 
+ negOnes_->add(-1); + tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); + } else { + Matrix::resizeOrCreate( + tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + + tmpInput_->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul( + *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput_->rowScale(0, *inputSquare_, *oGrad); + + tmpSum_->sumCols(*tmpInput_, -1, 0); + } + + latentVectors_->getWGrad()->addRowScale( + 0, *latentVectors_->getW(), *tmpSumTrans); + + /* Increasing the number of gradient */ + latentVectors_->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers gradient */ + MatrixPtr inGrad = getInputGrad(0); + if (inGrad != NULL) { + inGrad->mul( + *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); + tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); + inGrad->addColScale(0, *inputV, *tmpSum_); + inGrad->rowScale(0, *inGrad, *oGrad); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fc015ed727bbd8781bb50a22b8e745d8896837e1 --- /dev/null +++ b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { +/** + * @brief The Factorization Machine models pairwise (order-2) feature + * interactions as inner product of the learned latent vectors corresponding + * to each input feature. + * + * The Factorization Machine can effectively capture feature interactions + * especially when the input is sparse. While in principle FM can model higher + * order feature interaction, in practice usually only order-2 feature + * interactions are considered. The Factorization Machine Layer here only + * computes the order-2 interations with the formula: + * + * \f[ + * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + * \f] + * + * The detailed calculation for forward and backward can be found at this paper: + * + * Factorization machines. + * + * The config file api is factorization_machine. 
+ */ + +class FactorizationMachineLayer : public Layer { + protected: + // The latent vectors, shape: (size, factorSize_) + // Each row of the latentVectors_ matrix is the latent vector + // corresponding to one input feature dimension + std::unique_ptr latentVectors_; + // The hyperparameter that defines the dimensionality of the factorization + size_t factorSize_; + + private: + // Store the square values of the letent vectors matrix + MatrixPtr latentVectorsSquare_; + // Store the square values of input matrix + MatrixPtr inputSquare_; + // The result of input matrix * latent vector matrix that will be used in + // both forward and backward step + MatrixPtr inputMulFactor_; + // Store temporary calculation result + MatrixPtr tmpOut_; + MatrixPtr tmpSum_; + MatrixPtr tmpInput_; + // Negative identity matrix + MatrixPtr negOnes_; + + public: + explicit FactorizationMachineLayer(const LayerConfig& config) + : Layer(config) {} + ~FactorizationMachineLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3fe1433e4b5fd7bd77f8d6bb73378243d391dd5 --- /dev/null +++ b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for expanding a batch of images to feature maps. + * Each data of the input is a 2 dimensional matrix. Each element of the matrix + * is replicated num_filters times to create a feature map with num_filters + * channels. + * - Input: Input one should be dense image data. + * - Output: expanded fature maps. 
+ * \f[ + * y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1) + * \f] + * For example, num_filters = 4: + * @code + * x = [a1,a2; + * b1,b2] + * y = [a1, a2, a1, a2, a1, a2, a1, a2; + * b1, b2, b1, b2, b1, b2, b1, b2;] + * @endcode + */ + +class FeatureMapExpandLayer : public Layer { + private: + int numFilters_; + bool asRowVector_; + + public: + explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {} + + ~FeatureMapExpandLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer); + +bool FeatureMapExpandLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1UL); + numFilters_ = config_.num_filters(); + asRowVector_ = config_.user_arg() != "as_col_vec"; + return true; +} + +void FeatureMapExpandLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr inputV = getInputValue(0); + size_t batchSize = getInput(0).getBatchSize(); + int imgSize = inputV->getWidth(); + resetOutput(batchSize, imgSize * numFilters_); + + MatrixPtr outputV = getOutputValue(); + + { + AsyncGpuBlock asyncGpuBlock; + if (asRowVector_) { + for (size_t i = 0; i < batchSize; i++) { + MatrixPtr outVTmp = + Matrix::create(outputV->getData() + i * imgSize * numFilters_, + numFilters_, + imgSize, + false, + useGpu_); + MatrixPtr inVTmp = Matrix::create( + inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_); + outVTmp->addRowVector(*inVTmp); + } + } else { + for (size_t i = 0; i < batchSize; i++) { + MatrixPtr outVTmp = + Matrix::create(outputV->getData() + i * imgSize * numFilters_, + imgSize, + numFilters_, + false, + useGpu_); + MatrixPtr inVTmp = Matrix::create( + inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_); + outVTmp->addColVector(*inVTmp); + } + } + } + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void FeatureMapExpandLayer::backward(const UpdateCallback& callback) { + MatrixPtr inGrad = getInputGrad(0); + if (NULL == inGrad) { + return; + } + MatrixPtr outGrad = getOutputGrad(); + size_t batchSize = getInput(0).getBatchSize(); + int imgSize = inGrad->getWidth(); + /* Do activation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + { + AsyncGpuBlock asyncGpuBlock; + if (asRowVector_) { + for (size_t i = 0; i < batchSize; i++) { + MatrixPtr outGradTmp = + Matrix::create(outGrad->getData() + i * imgSize * numFilters_, + numFilters_, + imgSize, + false, + useGpu_); + MatrixPtr inGradTmp = Matrix::create( + inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_); + inGradTmp->collectBias(*outGradTmp, 1); + } + } else { + for (size_t i = 0; i < batchSize; i++) { + MatrixPtr outGradTmp = + Matrix::create(outGrad->getData() + i * imgSize * numFilters_, + imgSize, + numFilters_, + false, + useGpu_); + MatrixPtr inGradTmp = Matrix::create( + inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_); + inGradTmp->sumRows(*outGradTmp, 1, 1); + } + } + } +} + +} // namespace paddle. 
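For illustration only (not part of the patch): a minimal standalone sketch of the expansion FeatureMapExpandLayer::forward performs on a single input row in the default row-vector mode; names are hypothetical.

#include <cstddef>
#include <vector>

// Row-vector mode (asRowVector_ == true): the whole input row of width imgSize
// is tiled numFilters times, e.g. [a1, a2] with numFilters = 4 becomes
// [a1, a2, a1, a2, a1, a2, a1, a2]. In the "as_col_vec" mode each element is
// instead repeated numFilters times in place: [a1, a1, a1, a1, a2, a2, a2, a2].
std::vector<double> expandFeatureMapRow(const std::vector<double>& row,
                                        std::size_t numFilters) {
  std::vector<double> out;
  out.reserve(row.size() * numFilters);
  for (std::size_t f = 0; f < numFilters; ++f) {
    out.insert(out.end(), row.begin(), row.end());
  }
  return out;
}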
diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/legacy/gserver/layers/FullMatrixProjection.cpp similarity index 100% rename from paddle/gserver/layers/FullMatrixProjection.cpp rename to paddle/legacy/gserver/layers/FullMatrixProjection.cpp diff --git a/paddle/legacy/gserver/layers/FullMatrixProjection.h b/paddle/legacy/gserver/layers/FullMatrixProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..c33d02a3aeac8e83f613e61320cb6cd63baeae83 --- /dev/null +++ b/paddle/legacy/gserver/layers/FullMatrixProjection.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/legacy/utils/Stat.h" + +#include "Projection.h" + +namespace paddle { + +/** + * FullMatrixProjection performs full matrix multiplication: + * \f[ + * out.row[i] += in.row[i] * weight + * \f] + * + * The config file api is full_matrix_projection. + */ +class FullMatrixProjection : public Projection { + public: + FullMatrixProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu); + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + + protected: + std::unique_ptr weight_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07f4dfbe39c6b9bc233b3c75b4b5891a1ec9b2ec --- /dev/null +++ b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "FullyConnectedLayer.h" +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(fc, FullyConnectedLayer); + +bool FullyConnectedLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize the weightList */ + CHECK(inputLayers_.size() == parameters_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + // Option the parameters + size_t height = inputLayers_[i]->getSize(); + size_t width = getSize(); + + // create a new weight + if (parameters_[i]->isSparse()) { + CHECK_LE(parameters_[i]->getSize(), width * height); + } else { + CHECK_EQ(parameters_[i]->getSize(), width * height); + } + Weight* w = new Weight(height, width, parameters_[i]); + + // append the new weight to the list + weights_.emplace_back(w); + } + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + return true; +} + +void FullyConnectedLayer::prefetch() { + for (size_t i = 0; i != inputLayers_.size(); ++i) { + auto* sparseParam = + dynamic_cast(weights_[i]->getW().get()); + if (sparseParam) { + MatrixPtr input = getInputValue(i); + sparseParam->addRows(input); + } + } +} + +void FullyConnectedLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, size); + } + + MatrixPtr outV = getOutputValue(); + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + auto input = getInput(i); + CHECK(input.value) << "The input of 'fc' layer must be matrix"; + REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); + i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0) + : outV->mul(*input.value, *weights_[i]->getW(), 1, 1); + } + + /* add the bias-vector */ + if (biases_.get() != NULL) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void FullyConnectedLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + bool syncFlag = hl_get_sync_flag(); + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + /* Calculate the W-gradient for the current layer */ + if (weights_[i]->getWGrad()) { + MatrixPtr input_T = getInputValue(i)->getTranspose(); + MatrixPtr oGrad = getOutputGrad(); + { + REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); + weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1); + } + } + + // If callback does not change value, backprop error asynchronously so that + // we can do the callback concurrently. 
+ hl_set_sync_flag(false); + + /* Calculate the input layers error */ + MatrixPtr preGrad = getInputGrad(i); + if (NULL != preGrad) { + MatrixPtr weights_T = weights_[i]->getW()->getTranspose(); + REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); + preGrad->mul(*getOutputGrad(), *weights_T, 1, 1); + } + + hl_set_sync_flag(syncFlag); + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.h b/paddle/legacy/gserver/layers/FullyConnectedLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..7e29cac0437a8ae735ffb71e5ee901edd79fa7f3 --- /dev/null +++ b/paddle/legacy/gserver/layers/FullyConnectedLayer.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { +/** + * A layer has full connections to all neurons in the previous layer. + * It computes an inner product with a set of learned weights, and + * (optionally) adds biases. + * + * The config file api is fc_layer. + */ + +class FullyConnectedLayer : public Layer { + protected: + WeightList weights_; + std::unique_ptr biases_; + + public: + explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {} + ~FullyConnectedLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + Weight& getWeight(int idx) { return *weights_[idx]; } + + void prefetch() override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bdcd445cb47de346a8ca496fdaecf7d1f841f51e --- /dev/null +++ b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp @@ -0,0 +1,414 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "GatedRecurrentLayer.h" +#include "Layer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(gated_recurrent, GatedRecurrentLayer); + +bool GatedRecurrentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize()); + CHECK_EQ(getSize() * 3, biasParameter_->getSize()); + weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); + gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0)); + stateWeight_.reset(new Weight( + getSize(), getSize(), parameters_[0], 2 * getSize() * getSize())); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize() * 3, biasParameter_)); + } + + reversed_ = config_.reversed(); + activationGate_.reset(ActivationFunction::create(config_.active_gate_type())); + + GruCompute::init(config_); + useBatch_ = true; + + return true; +} + +void GatedRecurrentLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed gated " + "recurrent layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->zeroMem(); + + // TODO(hedaoyuan): support prev_batch_state + CHECK(!FLAGS_prev_batch_state) << "Not supported"; + + useBatch_ = false; +} + +void GatedRecurrentLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 1) + << "one matrix is expected for GatedRecurrentLayer state"; + prevOutput_->copyFrom(*(state->value[0])); +} + +LayerStatePtr GatedRecurrentLayer::getState() { + LayerStatePtr res = std::make_shared(); + res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + return res; +} + +void GatedRecurrentLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("GruFwTimer", getName().c_str()); + Layer::forward(passType); + + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize() * 3, input.value->getWidth()); + const int* starts = input.sequenceStartPositions->getData(false); + // batchSize = length of total frames in a batch (NOT size of mini-batch) + CHECK_EQ(starts[numSequences], batchSize); + + Matrix::resizeOrCreate(gate_.value, + /* height= */ batchSize, + getSize() * 3, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(resetOutput_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + + if (useBatch_) { + forwardBatch(batchSize, numSequences, starts, input.value); + } else { + forwardSequence(batchSize, numSequences, starts, input.value); + } +} + +void GatedRecurrentLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("GruBwTimer", getName().c_str()); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + const int* starts = input.sequenceStartPositions->getData(false); + size_t numSequences = input.getNumSequences(); + + Matrix::resizeOrCreate(gate_.grad, + /* height= */ batchSize, + getSize() * 3, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(resetOutput_.grad, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + + if (useBatch_) { + backwardBatch(batchSize, input.grad); + } else { + backwardSequence(batchSize, 
numSequences, starts, input.grad); + } + + if (bias_) { + bias_->getParameterPtr()->incUpdate(callback); + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +void GatedRecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str()); + gate_.value->assign(*inputValue); + if (bias_) { + gate_.value->addBias(*(bias_->getW()), 1); + } + + hl_gru_value gruValue; + gruValue.gateWeight = (gateWeight_->getW())->getData(); + gruValue.stateWeight = (stateWeight_->getW())->getData(); + gruValue.gateValue = gate_.value->getData(); + gruValue.resetOutputValue = resetOutput_.value->getData(); + gruValue.outputValue = output_.value->getData(); + gruValue.prevOutValue = nullptr; + + if (reversed_) { + gruValue.gateValue += (batchSize - 1) * getSize() * 3; + gruValue.resetOutputValue += (batchSize - 1) * getSize(); + gruValue.outputValue += (batchSize - 1) * getSize(); + } + + auto nextFrame = [&gruValue](bool reversed, int frameSize) { + gruValue.prevOutValue = gruValue.outputValue; + if (!reversed) { + gruValue.gateValue += frameSize * 3; + gruValue.resetOutputValue += frameSize; + gruValue.outputValue += frameSize; + } else { + gruValue.gateValue -= frameSize * 3; + gruValue.resetOutputValue -= frameSize; + gruValue.outputValue -= frameSize; + } + }; + + if (!reversed_) { + if (prevOutput_) { + gruValue.prevOutValue = prevOutput_->getData(); + } + } + AsyncGpuBlock asyncGpuBlock; + for (size_t n = 0; n < numSequences; ++n) { + int length; + if (!reversed_) { + length = starts[n + 1] - starts[n]; + } else { + length = starts[numSequences - n] - starts[numSequences - n - 1]; + } + for (int l = 0; l < length; ++l) { + if (useGpu_) { + GruCompute::forward<1>(gruValue, getSize()); + } else { + GruCompute::forward<0>(gruValue, getSize()); + } + + nextFrame(reversed_, getSize()); + } + if (!reversed_) { + if (!prevOutput_) gruValue.prevOutValue = nullptr; + } else { + gruValue.prevOutValue = nullptr; + } + } + + if (!reversed_) { + if (prevOutput_) { + prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1)); + } + } +} + +void GatedRecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str()); + + hl_gru_value gruValue; + gruValue.gateWeight = (gateWeight_->getW())->getData(); + gruValue.stateWeight = (stateWeight_->getW())->getData(); + gruValue.gateValue = gate_.value->getData(); + gruValue.resetOutputValue = resetOutput_.value->getData(); + gruValue.outputValue = output_.value->getData(); + + hl_gru_grad gruGrad; + gruGrad.gateWeightGrad = + (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); + gruGrad.stateWeightGrad = + (stateWeight_->getWGrad() ? 
stateWeight_->getWGrad()->getData() + : nullptr); + gruGrad.gateGrad = gate_.grad->getData(); + gruGrad.resetOutputGrad = resetOutput_.grad->getData(); + gruGrad.outputGrad = output_.grad->getData(); + + if (!reversed_) { + gruValue.gateValue += (batchSize - 1) * getSize() * 3; + gruValue.resetOutputValue += (batchSize - 1) * getSize(); + gruValue.outputValue += (batchSize - 1) * getSize(); + gruGrad.gateGrad += (batchSize - 1) * getSize() * 3; + gruGrad.resetOutputGrad += (batchSize - 1) * getSize(); + gruGrad.outputGrad += (batchSize - 1) * getSize(); + gruValue.prevOutValue = gruValue.outputValue - getSize(); + gruGrad.prevOutGrad = gruGrad.outputGrad - getSize(); + } else { + gruValue.prevOutValue = gruValue.outputValue + getSize(); + gruGrad.prevOutGrad = gruGrad.outputGrad + getSize(); + } + + auto nextFrame = [&gruValue, &gruGrad](bool reversed, int frameSize) { + if (reversed) { + gruValue.gateValue += frameSize * 3; + gruValue.resetOutputValue += frameSize; + gruValue.outputValue += frameSize; + gruGrad.gateGrad += frameSize * 3; + gruGrad.resetOutputGrad += frameSize; + gruGrad.outputGrad += frameSize; + gruValue.prevOutValue = gruValue.outputValue + frameSize; + gruGrad.prevOutGrad = gruGrad.outputGrad + frameSize; + } else { + gruValue.gateValue -= frameSize * 3; + gruValue.resetOutputValue -= frameSize; + gruValue.outputValue -= frameSize; + gruGrad.gateGrad -= frameSize * 3; + gruGrad.resetOutputGrad -= frameSize; + gruGrad.outputGrad -= frameSize; + gruValue.prevOutValue = gruValue.outputValue - frameSize; + gruGrad.prevOutGrad = gruGrad.outputGrad - frameSize; + } + }; + + { + AsyncGpuBlock asyncGpuBlock; + for (size_t n = 0; n < numSequences; ++n) { + int length; + if (reversed_) { + length = starts[n + 1] - starts[n]; + } else { + length = starts[numSequences - n] - starts[numSequences - n - 1]; + } + for (int l = 0; l < length; ++l) { + if (l == length - 1) { + gruValue.prevOutValue = nullptr; + gruGrad.prevOutGrad = nullptr; + } + if (useGpu_) { + GruCompute::backward<1>(gruValue, gruGrad, getSize()); + } else { + GruCompute::backward<0>(gruValue, gruGrad, getSize()); + } + nextFrame(reversed_, getSize()); + } + } + } + + if (inputGrad) { + inputGrad->add(*gate_.grad); + } + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*gate_.grad, 1); + } +} + +void GatedRecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str()); + hl_gru_value gruValue; + gruValue.gateWeight = (gateWeight_->getW())->getData(); + gruValue.stateWeight = (stateWeight_->getW())->getData(); + + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); + + batchValue_->resizeOrCreate(*output_.value); + batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); + if (bias_) { + gate_.value->addBias(*(bias_->getW()), 1); + } + + { + int numBatch = batchValue_->getNumBatch(); + int curBatchSize = 0; + AsyncGpuBlock asyncGpuBlock; + for (int n = 0; n < numBatch; n++) { + MatrixPtr outputValueTmp = batchValue_->getBatchValue(n); + gruValue.outputValue = outputValueTmp->getData(); + gruValue.gateValue = + (batchValue_->getBatchValue(*gate_.value, n))->getData(); + gruValue.resetOutputValue = + (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); + + curBatchSize = outputValueTmp->getHeight(); + gruValue.prevOutValue = + (n == 0 + ? 
nullptr + : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData()); + + { + if (useGpu_) { + GruCompute::forward<1>(gruValue, getSize(), curBatchSize); + } else { + GruCompute::forward<0>(gruValue, getSize(), curBatchSize); + } + } + } + } + { batchValue_->copyBackSeq(*output_.value); } +} + +void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str()); + hl_gru_value gruValue; + gruValue.gateWeight = (gateWeight_->getW())->getData(); + gruValue.stateWeight = (stateWeight_->getW())->getData(); + + hl_gru_grad gruGrad; + gruGrad.gateWeightGrad = + (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); + gruGrad.stateWeightGrad = + (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() + : nullptr); + + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + { batchGrad_->copyFromSeq(*output_.grad); } + + { + int numBatch = batchGrad_->getNumBatch(); + int batchSize = 0; + AsyncGpuBlock asyncGpuBlock; + for (int n = (int)numBatch - 1; n >= 0; n--) { + gruValue.gateValue = + (batchGrad_->getBatchValue(*gate_.value, n))->getData(); + gruValue.resetOutputValue = + (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); + + MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); + gruGrad.outputGrad = outputGradTmp->getData(); + gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData(); + gruGrad.resetOutputGrad = + (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData(); + + { + batchSize = outputGradTmp->getHeight(); + gruValue.prevOutValue = + (n == 0 + ? nullptr + : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); + gruGrad.prevOutGrad = + (n == 0 ? nullptr + : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); + + if (useGpu_) { + GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); + } else { + GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); + } + } + } + } + + if (inputGrad) { + batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); + } + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..8bbf01ce200c9922f49508b0499aa9422745f474 --- /dev/null +++ b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "GruCompute.h" +#include "Layer.h" +#include "SequenceToBatch.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Please refer to "Junyoung Chung, Empirical Evaluation + * of Gated Recurrent Neural Networks on Sequence Modeling". + * + * GatedRecurrentLayer takes 1 input layer with size * 3. 
+ * Input layer is divided into 3 equal parts: (xz_t, xr_t, xi_t). + * parameter and biasParameter are also divided into 3 equal parts: + * - parameter consists of (U_z, U_r, U) + * - biasParameter consists of (bias_z, bias_r, bias_o) + * + * \f[ + * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\ + * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\ + * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\ + * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\ + * \f] + * + * @note + * - dot denotes "element-wise multiplication". + * - actNode is defined by config active_type + * - actGate is defined by config active_gate_type + * + * The config file api is grumemory. + */ + +class GatedRecurrentLayer : public Layer, public GruCompute { + public: + explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + + protected: + void forwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); + void backwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputGrad); + + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); + void backwardBatch(int batchSize, MatrixPtr inputGrad); + + protected: + std::unique_ptr<Weight> weight_; + std::unique_ptr<Weight> gateWeight_; + std::unique_ptr<Weight> stateWeight_; + std::unique_ptr<Weight> bias_; + + Argument gate_; + Argument resetOutput_; + + bool reversed_; + bool useBatch_; + std::unique_ptr<SequenceToBatch> batchValue_; + std::unique_ptr<SequenceToBatch> batchGrad_; + std::unique_ptr<ActivationFunction> activationGate_; + + MatrixPtr prevOutput_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/legacy/gserver/layers/GetOutputLayer.cpp similarity index 100% rename from paddle/gserver/layers/GetOutputLayer.cpp rename to paddle/legacy/gserver/layers/GetOutputLayer.cpp diff --git a/paddle/legacy/gserver/layers/GruCompute.cpp b/paddle/legacy/gserver/layers/GruCompute.cpp new file mode 100644 index 0000000000000000000000000000000000000000..adad6285b7d5acd8780444ffeab6627531683cb7 --- /dev/null +++ b/paddle/legacy/gserver/layers/GruCompute.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
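An illustrative aside, not part of the patch: a standalone single-timestep sketch of the recurrence documented above, which GatedRecurrentLayer, GruCompute and GruStepLayer share. It uses plain std::vector, a row-vector convention for h_{t-1} * U, and sigmoid/tanh standing in for actGate/actNode (the real activations come from active_gate_type / active_type in the config); all names are hypothetical.

#include <cmath>
#include <cstddef>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;  // row-major, Mat[i][j]

static double sigmoid(double v) { return 1.0 / (1.0 + std::exp(-v)); }

// h * U with h treated as a row vector and U an n x n matrix.
static Vec rowTimesMat(const Vec& h, const Mat& U) {
  const std::size_t n = h.size();
  Vec out(n, 0.0);
  for (std::size_t j = 0; j < n; ++j)
    for (std::size_t i = 0; i < n; ++i) out[j] += h[i] * U[i][j];
  return out;
}

// One step:  z = sigmoid(xz + h*Uz + bz),  r = sigmoid(xr + h*Ur + br),
//            c = tanh(xi + (r .* h)*U + bo),  hNext = (1-z).*h + z.*c
Vec gruStep(const Vec& xz, const Vec& xr, const Vec& xi, const Vec& hPrev,
            const Mat& Uz, const Mat& Ur, const Mat& U,
            const Vec& bz, const Vec& br, const Vec& bo) {
  const std::size_t n = hPrev.size();
  const Vec uzH = rowTimesMat(hPrev, Uz);
  const Vec urH = rowTimesMat(hPrev, Ur);
  Vec z(n), r(n), rh(n);
  for (std::size_t i = 0; i < n; ++i) {
    z[i] = sigmoid(xz[i] + uzH[i] + bz[i]);
    r[i] = sigmoid(xr[i] + urH[i] + br[i]);
    rh[i] = r[i] * hPrev[i];
  }
  const Vec uC = rowTimesMat(rh, U);
  Vec hNext(n);
  for (std::size_t i = 0; i < n; ++i) {
    const double c = std::tanh(xi[i] + uC[i] + bo[i]);
    hNext[i] = (1.0 - z[i]) * hPrev[i] + z[i] * c;
  }
  return hNext;
}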
*/ + +#include "GruCompute.h" +#include "hl_recurrent_apply.cuh" +#include "paddle/legacy/function/GruFunctor.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +void GruCompute::init(LayerConfig &config) { + activeNode_ = hlActiveType(config.active_type()); + activeGate_ = hlActiveType(config.active_gate_type()); +} + +template <> +void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { + GruFunctor::compute(hppl::forward::gru_resetOutput(), + hppl::forward::gru_finalOutput(), + value, + frameSize, + batchSize, + activeNode_, + activeGate_); +} + +template <> +void GruCompute::backward<0>(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize) { + GruGradFunctor::compute( + hppl::backward::gru_stateGrad(), + hppl::backward::gru_resetGrad(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/legacy/gserver/layers/GruCompute.cu similarity index 100% rename from paddle/gserver/layers/GruCompute.cu rename to paddle/legacy/gserver/layers/GruCompute.cu diff --git a/paddle/legacy/gserver/layers/GruCompute.h b/paddle/legacy/gserver/layers/GruCompute.h new file mode 100644 index 0000000000000000000000000000000000000000..6feea7aca81b8618071893581a4e16d8ad38101c --- /dev/null +++ b/paddle/legacy/gserver/layers/GruCompute.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ModelConfig.pb.h" +#include "hl_gpu.h" +#include "paddle/legacy/utils/Common.h" + +namespace paddle { + +class GruCompute { + public: + void init(LayerConfig &config); + + template + void forward(hl_gru_value value, int frameSize, int batchSize = 1); + + template + void backward(hl_gru_value value, + hl_gru_grad grad, + int frameSize, + int batchSize = 1); + + public: + hl_activation_mode_t activeNode_; + hl_activation_mode_t activeGate_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/GruStepLayer.cpp b/paddle/legacy/gserver/layers/GruStepLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2480e42d68b87ee406efc2b220b9ad6bf5cacbd6 --- /dev/null +++ b/paddle/legacy/gserver/layers/GruStepLayer.cpp @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "GruCompute.h" +#include "Layer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief GruStepLayer is like GatedRecurrentLayer, but used in recurrent + * layer group. GruStepLayer takes 2 input layer. + * - input[0] with size * 3 and diveded into 3 equal parts: (xz_t, xr_t, xi_t). + * - input[1] with size: {prev_out}. + * + * parameter and biasParameter is also diveded into 3 equal parts: + * - parameter consists of (U_z, U_r, U) + * - baisParameter consists of (bias_z, bias_r, bias_o) + * + * \f[ + * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\ + * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r) \\ + * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o) + * \\ + * output: h_t = dot((1-z_t), prev_out) + dot(z_t, prev_out) + * \f] + * + * @note + * - dot denotes "element-wise multiplication". + * - actNode is defined by config active_type + * - actGate is defined by config actvie_gate_type + * + * The config file api if gru_step_layer. + */ +class GruStepLayer : public Layer, public GruCompute { + protected: + Argument gate_; + Argument resetOutput_; + std::unique_ptr weight_; + std::unique_ptr bias_; + + public: + explicit GruStepLayer(const LayerConfig& config) : Layer(config) {} + + ~GruStepLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(gru_step, GruStepLayer); + +bool GruStepLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(2U, inputLayers_.size()); + + CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize()); + weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); + + if (biasParameter_.get() != NULL) { + CHECK_EQ(getSize() * 3, biasParameter_->getSize()); + bias_.reset(new Weight(1, getSize() * 3, biasParameter_)); + } + + GruCompute::init(config_); + return true; +} + +void GruStepLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("GruStepFwTime", getName().c_str()); + Layer::forward(passType); + + const Argument& input = getInput(0); + const Argument& prevOutput = getInput(1); + CHECK_EQ(getSize() * 3, input.value->getWidth()); + CHECK_EQ(getSize(), prevOutput.value->getWidth()); + + int batchSize = input.getBatchSize(); + resetOutput(batchSize, getSize()); + resetSpecifyOutput(gate_, + batchSize, + getSize() * 3, + /* isValueClean */ false, + /* isGradClean */ false); + resetSpecifyOutput(resetOutput_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ false); + gate_.value->assign(*input.value); + if (bias_) { + gate_.value->addBias(*(bias_->getW()), 1); + } + + hl_gru_value gruValue; + gruValue.gateWeight = weight_->getW()->getData(); + gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; + gruValue.gateValue = gate_.value->getData(); + gruValue.resetOutputValue = resetOutput_.value->getData(); + gruValue.outputValue = output_.value->getData(); + gruValue.prevOutValue = prevOutput.value->getData(); + + if (useGpu_) { + GruCompute::forward<1>(gruValue, getSize(), batchSize); + } else { + GruCompute::forward<0>(gruValue, getSize(), batchSize); + } +} + +void GruStepLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("GruStepBwTime", getName().c_str()); + + const Argument& input = getInput(0); + const Argument& 
prevOutput = getInput(1); + int batchSize = input.getBatchSize(); + + hl_gru_value gruValue; + gruValue.gateWeight = weight_->getW()->getData(); + gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; + gruValue.gateValue = gate_.value->getData(); + gruValue.resetOutputValue = resetOutput_.value->getData(); + gruValue.outputValue = output_.value->getData(); + gruValue.prevOutValue = prevOutput.value->getData(); + + hl_gru_grad gruGrad; + gruGrad.gateWeightGrad = + (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr); + gruGrad.stateWeightGrad = + (weight_->getWGrad() + ? weight_->getWGrad()->getData() + getSize() * getSize() * 2 + : nullptr); + + gruGrad.gateGrad = gate_.grad->getData(); + gruGrad.resetOutputGrad = resetOutput_.grad->getData(); + gruGrad.outputGrad = output_.grad->getData(); + if (prevOutput.grad) { + gruGrad.prevOutGrad = prevOutput.grad->getData(); + } else { + gruGrad.prevOutGrad = nullptr; + } + + if (useGpu_) { + GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); + } else { + GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); + } + + if (input.grad) { + input.grad->add(*gate_.grad); + } + + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*gate_.grad, 1); + } + + if (bias_) { + bias_->getParameterPtr()->incUpdate(callback); + } + weight_->getParameterPtr()->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34495994096a87640bdeef777feb5cd783cd4598 --- /dev/null +++ b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -0,0 +1,240 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "HierarchicalSigmoidLayer.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +REGISTER_LAYER(hsigmoid, HierarchicalSigmoidLayer); + +bool HierarchicalSigmoidLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK(config_.has_num_classes()) << "num_classes must be specifed in config"; + numClasses_ = config_.num_classes(); + CHECK_GE(numClasses_, (size_t)2); + codeLength_ = findLastSet(numClasses_ - 1); + + size_t height = numClasses_ - 1; + + /* initialize the weightList */ + // The last input layer is for label + CHECK(!parameters_.back()); + for (size_t i = 0; i < inputLayers_.size() - 1; i++) { + size_t width = inputLayers_[i]->getSize(); + // create a new weight + CHECK_EQ(parameters_[i]->getSize(), width * height); + Weight* w = new Weight(height, width, parameters_[i]); + + // append the new weight to the list + weights_.emplace_back(w); + } + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + CHECK_EQ(biasParameter_->getSize(), numClasses_ - 1); + biases_.reset(new Weight(1, numClasses_ - 1, biasParameter_)); + } + + return true; +} + +void HierarchicalSigmoidLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(0)->getHeight(); + int size = getSize(); + reserveOutput(batchSize, size); + Matrix::resizeOrCreate(preOutput_.value, + batchSize, + codeLength_, + /* trans */ false, + false); + Matrix::resizeOrCreate(preOutput_.grad, + batchSize, + codeLength_, + /* trans */ false, + false); + IVectorPtr label = getInput(*getLabelLayer()).ids; + preOutput_.value->zeroMem(); + + if (useGpu_) { + Matrix::resizeOrCreate(cpuOutput_, + output_.value->getHeight(), + output_.value->getWidth(), + /* trans */ false, + false); + IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + cpuOutput_->copyFrom(*output_.value); + } else { + cpuOutput_ = output_.value; + cpuLabel_ = label; + } + /* add the bias-vector */ + if (biases_.get() != NULL) { + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_->getW()); + } else { + cpuBias_ = biases_->getW(); + } + preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_); + } + for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { + MatrixPtr input = getInputValue(i); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInput_ = input; + cpuWeight_ = weights_[i]->getW(); + } + preOutput_.value->mulByBitCode( + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_); + } + // keep consistent with the clipping in the following softrelu + preOutput_.value->clip(-40.0, 40.0); + preOutput_.value->sumByBitCode(numClasses_, + *cpuLabel_, + *cpuOutput_, + -1); // scaleSum + preOutput_.value->softrelu(*preOutput_.value); + MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false); + preOutput_.value->rowSum(*sum); + cpuOutput_->add(*sum); + if (useGpu_) { + output_.value->copyFrom(*cpuOutput_); + } else { + output_.value = cpuOutput_; + } +} + +void 
HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { + IVectorPtr label = getInput(*getLabelLayer()).ids; + if (useGpu_) { + IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + } else { + cpuLabel_ = label; + } + preOutput_.grad->one(); + preOutput_.grad->softreluDerivative(*preOutput_.value); + preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_); + + if (biases_ && biases_->getWGrad()) { + MatrixPtr biases_grad = biases_->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_grad); + } else { + cpuBias_ = biases_grad; + } + preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_); + if (useGpu_) { + biases_grad->copyFrom(*cpuBias_); + } else { + biases_grad = cpuBias_; + } + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { + /* Calculate the W-gradient for the current layer */ + MatrixPtr input = getInputValue(i); + if (weights_[i]->getWGrad()) { + MatrixPtr weights_grad = weights_[i]->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeightGrad_, + weights_grad->getHeight(), + weights_grad->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeightGrad_->copyFrom(*weights_grad); + } else { + cpuInput_ = input; + cpuWeightGrad_ = weights_grad; + } + preOutput_.grad->mulByBitCodeBackwardWeight( + numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_); + if (useGpu_) { + weights_grad->copyFrom(*cpuWeightGrad_); + } else { + weights_grad = cpuWeightGrad_; + } + /* Increasing the number of gradient */ + weights_[i]->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers error */ + MatrixPtr inputGrad = getInputGrad(i); + if (inputGrad) { + if (useGpu_) { + Matrix::resizeOrCreate(cpuInputGrad_, + inputGrad->getHeight(), + inputGrad->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInputGrad_->copyFrom(*inputGrad); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInputGrad_ = inputGrad; + cpuWeight_ = weights_[i]->getW(); + } + preOutput_.grad->mulByBitCodeBackwardError( + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_); + if (useGpu_) { + inputGrad->copyFrom(*cpuInputGrad_); + } else { + inputGrad = cpuInputGrad_; + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h similarity index 100% rename from paddle/gserver/layers/HierarchicalSigmoidLayer.h rename to paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h diff --git a/paddle/legacy/gserver/layers/IdentityProjection.cpp b/paddle/legacy/gserver/layers/IdentityProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f707642e09b86721a88142ab8b745bb3492e820c --- /dev/null +++ b/paddle/legacy/gserver/layers/IdentityProjection.cpp @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Projection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * IdentityProjection performs addition: + * \f[ + * out.row[i] += in.row[i] + * \f] + * + * The config file api is identity_projection. + */ +class IdentityProjection : public Projection { + public: + IdentityProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu); + virtual void forward(); + virtual void backward(const UpdateCallback& callback); +}; + +REGISTER_PROJECTION(identity, IdentityProjection); + +/** + * Constructed function. + * @note IdentityProjection should not have any parameter. + */ +IdentityProjection::IdentityProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(!parameter) << "'identity' projection should not have any parameter"; +} + +void IdentityProjection::forward() { out_->value->add(*in_->value); } + +void IdentityProjection::backward(const UpdateCallback& callback) { + if (in_->grad) { + in_->grad->add(*out_->grad); + } +} + +/** + * IdentityOffsetProjection likes IdentityProjection, but layer size may be + * smaller + * than input size. It selects dimensions [offset, offset+layer_size) from input + * to + * perform addition: + * \f[ + * out.row[i] += in.row[i + \textrm{offset}] + * \f] + * + * The config file api is identity_projection. + */ +class IdentityOffsetProjection : public Projection { + public: + IdentityOffsetProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu); + virtual void forward(); + virtual void backward(const UpdateCallback& callback); +}; + +REGISTER_PROJECTION(identity_offset, IdentityOffsetProjection); + +/** + * Constructed function. + * @note IdentityOffsetProjection should not have any parameter. + */ +IdentityOffsetProjection::IdentityOffsetProjection( + const ProjectionConfig& config, const ParameterPtr& parameter, bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(!parameter) << "'identity_offset' projection " + "should not have any parameter"; + CHECK_LE(config.output_size() + config.offset(), config.input_size()); +} + +void IdentityOffsetProjection::forward() { + out_->value->addAtOffset(*in_->value, config_.offset()); +} + +void IdentityOffsetProjection::backward(const UpdateCallback& callback) { + if (in_->grad) { + in_->grad->addAtOffset(*out_->grad, config_.offset()); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/InterpolationLayer.cpp b/paddle/legacy/gserver/layers/InterpolationLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed2294e8a397edfee6ad3c1f52235970d6ad48a9 --- /dev/null +++ b/paddle/legacy/gserver/layers/InterpolationLayer.cpp @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for linear interpolation with two inputs, + * which is used in NEURAL TURING MACHINE. + * \f[ + * y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i] + * \f] + * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, + * \f$w\f$ is (batchSize x 1) weight vector, + * and \f$y\f$ is (batchSize x dataDim) output. + * + * The config file api is interpolation_layer. + */ + +class InterpolationLayer : public Layer { + protected: + /// weightLast = 1 - weight + MatrixPtr weightLast_; + MatrixPtr tmpMatrix; + + public: + explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {} + + ~InterpolationLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(interpolation, InterpolationLayer); + +bool InterpolationLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(3U, inputLayers_.size()); + + return true; +} + +void InterpolationLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr weightV = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inV2 = getInputValue(2); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + CHECK_EQ(dataDim, getSize()); + CHECK_EQ(dataDim, inV2->getWidth()); + CHECK_EQ(batchSize, inV1->getHeight()); + CHECK_EQ(batchSize, inV2->getHeight()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_); + weightLast_->one(); + weightLast_->sub(*weightV); + + REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str()); + // outV = inV1 * weight + inV2 * weightLast + outV->addRowScale(0, *inV1, *weightV); + outV->addRowScale(0, *inV2, *weightLast_); +} + +void InterpolationLayer::backward(const UpdateCallback& callback) { + MatrixPtr outG = getOutputGrad(); + MatrixPtr weightV = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inV2 = getInputValue(2); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + MatrixPtr inG2 = getInputGrad(2); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str()); + + if (inG0) { + Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_); + + // inG0 += outG .* (inV1 - inV2) + tmpMatrix->sub(*inV1, *inV2); + inG0->rowDotMul(0, *outG, *tmpMatrix); + } + + if (inG1) { + // inG1 += outG * weight + inG1->addRowScale(0, *outG, *weightV); + } + + if (inG2) { + // inG2 += outG * weightLast + inG2->addRowScale(0, *outG, *weightLast_); + } +} + +} // namespace paddle diff --git 
a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp similarity index 100% rename from paddle/gserver/layers/KmaxSeqScoreLayer.cpp rename to paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.cpp b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3e627e57047b790b4f74089a352f06b55e48664 --- /dev/null +++ b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "L2DistanceLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(l2_distance, L2DistanceLayer); + +bool L2DistanceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and " + << "only two inputs."; + CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer " + << "is fixed to be 1."; + + return true; +} + +void L2DistanceLayer::forward(PassType passType) { + Layer::forward(passType); + + const auto inV1 = getInputValue(0); + const auto inV2 = getInputValue(1); + + CHECK(inV1 && inV2); + CHECK_EQ(inV1->getHeight(), inV2->getHeight()) + << "The height of two inputs of this layer must be the same."; + CHECK_EQ(inV1->getWidth(), inV2->getWidth()) + << "The width of two inputs of this layer must be the same."; + + int batchSize = inV1->getHeight(); + int output_dim = getSize(); + { + REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str()); + reserveOutput(batchSize, output_dim); + auto outV = getOutputValue(); + CHECK(outV) << "The output matrix should not be null."; + + Matrix::resizeOrCreate( + inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_); + + inputSub_->assign(*inV1); + inputSub_->sub(*inV2); + outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0); + outV->sqrt2(*outV); + } +} + +void L2DistanceLayer::backward(const UpdateCallback& callback) { + const auto outG = getOutputGrad(); + const auto outV = getOutputValue(); + CHECK(outG && outV); + + auto inGrad1 = getInputGrad(0); + auto inGrad2 = getInputGrad(1); + + { + REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str()); + + if (inGrad1 || inGrad2) { + outV->scalarDiv(*outV, 1.); + outV->dotMul(*outG, *outV); + } + + if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV); + + if (inGrad2) { + inputSub_->mulScalar(-1.); + inGrad2->addRowScale(0, *inputSub_, *outV); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.h b/paddle/legacy/gserver/layers/L2DistanceLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..aa8aabd9ca5702e3ebdccbe7bb4f98fa087dd238 --- /dev/null +++ b/paddle/legacy/gserver/layers/L2DistanceLayer.h @@ -0,0 +1,52 @@ +/* 
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief The layer calculates the l2 distance between two input vectors. + * \f[ + * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)^2} + * \f] + * + * - Input1: A vector (batchSize * dataDim) + * - Input2: A vector (batchSize * dataDim) + * - Output: A vector (batchSize * 1) + * + * The configuration api is: l2_distance_layer. + */ + +class L2DistanceLayer : public Layer { + public: + explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {} + ~L2DistanceLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + private: + // Store the result of subtracting Input2 from Input1 in forward computation, + // which will be reused in backward computation. + MatrixPtr inputSub_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/Layer.cpp b/paddle/legacy/gserver/layers/Layer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..890d33552dd31a8fd348a36d44fb0824ac9b32b5 --- /dev/null +++ b/paddle/legacy/gserver/layers/Layer.cpp @@ -0,0 +1,410 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/legacy/utils/Util.h" + +#include "CostLayer.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/utils/Error.h" +#include "paddle/legacy/utils/Logging.h" + +#ifndef PADDLE_MOBILE_INFERENCE +#include "ValidationLayer.h" +#endif + +DEFINE_bool(log_error_clipping, false, "enable log error clipping or not"); + +namespace paddle { +
+Layer::Layer(const LayerConfig& config, bool useGpu) + : config_(config), + useGpu_(useGpu), + deviceId_(CPU_DEVICE), + needSequenceInfo_(true) {} +
+bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + if (useGpu_ && FLAGS_parallel_nn) { + /* gpu environment is specified by device property */ + deviceId_ = config_.device(); + if (deviceId_ < 0) { + useGpu_ = false; + } + } + + output_.deviceId = deviceId_; + + for (auto& inputConfig : config_.inputs()) { + std::string inputName = inputConfig.input_layer_name(); + LayerPtr inputLayer; + CHECK(mapGet(inputName, layerMap, &inputLayer)) + << "Cannot find input layer " << inputName << " for layer " + << getName(); + this->addPrev(inputLayer); + + inputLayer->addOutputArgument(deviceId_); +
+ if (inputConfig.has_input_parameter_name()) { + ParameterPtr parameter; + CHECK( + mapGet(inputConfig.input_parameter_name(), parameterMap, &parameter)) + << "Cannot find input parameter " + << inputConfig.input_parameter_name() << " for layer " << getName(); + parameter->incShared(); + CHECK_EQ(parameter->getDeviceId(), getDeviceId()); + parameters_.push_back(parameter); + } else { + parameters_.push_back(nullptr); + } + + if (inputConfig.has_input_layer_argument()) { + inputArgument_.push_back(inputConfig.input_layer_argument()); + } else { + inputArgument_.push_back(""); + } + } +
+ if (config_.has_bias_parameter_name()) { + CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_)) + << "Cannot find bias parameter " << config_.bias_parameter_name() + << " for layer " << getName(); + biasParameter_->incShared(); + CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId()); + } + + /* specify the activation function according to the configuration */ + std::string action_type = config_.active_type(); + activation_.reset(ActivationFunction::create(action_type)); + CHECK(activation_); + + initNeedFlags(); + markInBackward_.assign(inputLayers_.size(), false); + + return true; +} +
+ClassRegistrar<Layer, LayerConfig> Layer::registrar_; +
+LayerPtr Layer::create(const LayerConfig& config) { + std::string type = config.type(); + +#ifndef PADDLE_MOBILE_INFERENCE + // NOTE: As following types have illegal character '-', + // they can not use REGISTER_LAYER to registrar. + // Besides, to fit with old training models, + // they can not use '_' instead.
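+  // All other layer types are created through registrar_ below:
+  // REGISTER_LAYER(type, Class) in Layer.h installs an InitFunction that
+  // registers Class under "type", and createByType() then constructs the
+  // layer from config.type(). A minimal, illustrative sketch of driving the
+  // factory by hand (the field values are made up):
+  //
+  //   LayerConfig cfg;
+  //   cfg.set_type("l2_distance");
+  //   cfg.set_name("dist");
+  //   cfg.set_size(1);
+  //   LayerPtr layer = Layer::create(cfg);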
+ if (type == "multi-class-cross-entropy") + return LayerPtr(new MultiClassCrossEntropy(config)); + else if (type == "rank-cost") + return LayerPtr(new RankingCost(config)); + else if (type == "auc-validation") + return LayerPtr(new AucValidation(config)); + else if (type == "pnpair-validation") + return LayerPtr(new PnpairValidation(config)); +#endif + + return LayerPtr(registrar_.createByType(config.type(), config)); +} + +void Layer::resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean) { + SetDevice device(output.deviceId); + + Matrix::resizeOrCreate( + output.value, height, width, /* trans */ false, useGpu(output.deviceId)); + if (isValueClean) { + output.value->zeroMem(); + } + + if (passType_ != PASS_TEST && needGradient()) { + Matrix::resizeOrCreate( + output.grad, height, width, /* trans */ false, useGpu(output.deviceId)); + if (isGradClean) { + output.grad->zeroMem(); + } + } +} + +void Layer::resizeOutput(size_t height, size_t width) { + resetSpecifyOutput(output_, height, width, false, false); + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false); + } +} + +void Layer::reserveOutput(size_t height, size_t width) { + resetSpecifyOutput(output_, height, width, false, true); + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true); + } +} + +void Layer::resetOutput(size_t height, size_t width) { + resetSpecifyOutput(output_, height, width, true, true); + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true); + } +} + +void Layer::addOutputArgument(int deviceId) { + if (deviceId == deviceId_) { + output_.countIncrement(); + return; + } else { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == deviceId) { + outputOtherDevice_[i].countIncrement(); + return; + } + } + } + + Argument argu; + argu.deviceId = deviceId; + outputOtherDevice_.push_back(argu); + outputOtherDevice_.back().countIncrement(); +} + +void Layer::copyOutputToOtherDevice() { + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + SetDevice device(outputOtherDevice_[i].deviceId); + // If outputOtherDevice_[i].value is a CpuMatrix, + // the copyFrom is a synchronous interface. + // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent + // calculations are all on HPPL_STREAM_DEFAULT, + // copyFrom can be an asynchronous interface. 
+ outputOtherDevice_[i].value->copyFrom(*getOutputValue(), + HPPL_STREAM_DEFAULT); + outputOtherDevice_[i].sequenceStartPositions = + output_.sequenceStartPositions; + outputOtherDevice_[i].subSequenceStartPositions = + output_.subSequenceStartPositions; + outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; + + outputOtherDevice_[i].notifyValueReady(); + } +} + +void Layer::waitInputValue() { + for (size_t i = 0; i != inputLayers_.size(); i++) { + if (inputLayers_[i]->getDeviceId() != deviceId_) { + getInput(i).waitValueReady(); + } + } +} + +void Layer::waitAndMergeOutputGrad() { + if (!output_.grad || !outputOtherDevice_.size()) { + return; + } + + for (size_t i = 0; i != outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].waitGradReady(); + } + + /* merge output grad */ + size_t i = 0; + if (!output_.getAllCount()) { + output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1); + hl_stream_synchronize(HPPL_STREAM_1); + + i++; + if (outputOtherDevice_.size() == 1) return; + } + + Matrix::resizeOrCreate(tmpGrad_, + output_.grad->getHeight(), + output_.grad->getWidth(), + /* trans */ false, + useGpu(output_.deviceId)); + + for (; i != outputOtherDevice_.size(); i++) { + tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1); + hl_stream_synchronize(HPPL_STREAM_1); + output_.grad->add(*tmpGrad_); + } +} + +void Layer::markAllInputGrad() { + for (size_t i = 0; i != inputLayers_.size(); ++i) { + if (!markInBackward_[i]) { + inputLayers_[i]->getOutput(deviceId_).notifyGradReady(); + } + markInBackward_[i] = false; + } +} + +void Layer::markInputGrad(int inputIndex) { + inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady(); + markInBackward_[inputIndex] = true; +} + +void Layer::zeroGrad() { + CHECK(output_.grad.get() != NULL); + output_.grad->zeroMem(); +} + +void Layer::initNeedFlags() { + auto initFlag = [this]( + bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) { + flag = false; + if (biasParameter_ && biasParameter_->hasType(type)) { + flag = true; + } + if (!flag) { + for (auto& para : parameters_) { + if (para && para->hasType(type)) { + flag = true; + break; + } + } + } + if (!flag) { + for (auto& layer : inputLayers_) { + if ((layer.get()->*flagQueryFunc)()) { + flag = true; + } + } + } + }; + initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT); +} + +void Layer::showOutputStats() { + MatrixPtr out = getOutputValue(); + if (!out) return; + if (!out->getElementCnt()) { + LOG(INFO) << "The number of output of " << config_.name() + << " is 0, skip to show the statistics"; + return; + } + MatrixPtr outSquare; + if (dynamic_cast(out.get())) { + GpuSparseMatrix* tmp = dynamic_cast(out.get()); + outSquare = std::make_shared(tmp->getHeight(), + tmp->getWidth(), + tmp->getElementCnt(), + tmp->getValueType(), + tmp->getFormat()); + } else { + outSquare = out->clone(); + } + outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + + real mean = outSquare->getSum() / out->getElementCnt(); + real min; + real max; + if (dynamic_cast(outSquare.get())) { + auto tmpMat = dynamic_cast(outSquare.get()); + min = tmpMat->getMin(); + max = tmpMat->getMax(); + tmpMat->square2(); + LOG(INFO) << "show statistics of [none zero values] in sparse matrix"; + } else { + min = outSquare->getMin(); + max = outSquare->getMax(); + outSquare->square2(); + } + real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean; + std = std > 0 ? 
std : 0; + LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean + << ", " + << "std=" << std << ", " + << "min=" << min << ", " + << "max=" << max; +} + +void Layer::forwardActivation() { + /* activation */ + auto status = activation_->forward(output_); + status.check(); + + /* dropout */ + if (config_.drop_rate() > 0) { + forwardDropOut(); + CHECK_NE(activation_->getName(), "softmax") + << "Softmax activation cannot be used with Dropout"; + } + + if (FLAGS_show_layer_stat) { + showOutputStats(); + } +} + +void Layer::backwardActivation() { + /* Do error clipping */ + if (config_.error_clipping_threshold() > 0.0f) { + if (FLAGS_log_error_clipping) { + VectorPtr outGradVec = Vector::create( + output_.grad->getData(), output_.grad->getElementCnt(), useGpu_); + real maxAbsGrad = outGradVec->getAbsMax(); + if (maxAbsGrad > config_.error_clipping_threshold()) { + real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize(); + LOG(INFO) << " layer=" << config_.name() << " need clipping," + << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad; + } + } + output_.grad->clip(-config_.error_clipping_threshold(), + config_.error_clipping_threshold()); + } + + /* Do dropout for delta*/ + if (config_.drop_rate() > 0 && passType_ != PASS_TEST) { + MatrixPtr oGrad = getOutputGrad(); + oGrad->dotMul(*oGrad, *dropOutMask_); + } + + auto status = activation_->backward(output_); + status.check(); +} + +void Layer::forwardDropOut() { + auto& outV = getOutputValue(); + + if (passType_ == PASS_TRAIN) { + // new dropOutMask_ if dropOutMask_ is null ptr + Matrix::resizeOrCreate(dropOutMask_, + outV->getHeight(), + outV->getWidth(), + false, + useGpu(deviceId_)); + dropOutMask_->randomizeUniform(); // generate a uniform random matrix + dropOutMask_->biggerThanScalar(config_.drop_rate()); // random mask + outV->dotMul(*outV, *dropOutMask_); // dropout + } else if (passType_ == PASS_GC) { + // only initialize once + if (!dropOutMask_) { + dropOutMask_ = Matrix::create( + outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_)); + // We use cpu matrix to generate mask so that the mask + // will be same for both gpu version and cpu version. + // This will help unittest to make sure they have same result. + MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth()); + tmpMask->randomizeUniform(); // generate a uniform random matrix + tmpMask->biggerThanScalar(config_.drop_rate()); // random mask + dropOutMask_->copyFrom(*tmpMask); + } + outV->dotMul(*outV, *dropOutMask_); + } else { // passType == PASS_TEST + outV->mulScalar(1.0 - config_.drop_rate()); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/Layer.h b/paddle/legacy/gserver/layers/Layer.h new file mode 100644 index 0000000000000000000000000000000000000000..a7ff76decea9a448acfcdef1c81a68b5a823cc56 --- /dev/null +++ b/paddle/legacy/gserver/layers/Layer.h @@ -0,0 +1,512 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include <functional> +#include <memory> +#include "ModelConfig.pb.h" +#include "paddle/legacy/function/Function.h" +#include "paddle/legacy/gserver/activations/ActivationFunction.h" +#include "paddle/legacy/math/CpuSparseMatrix.h" +#include "paddle/legacy/parameter/Argument.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/Weight.h" +#include "paddle/legacy/utils/ClassRegistrar.h" +#include "paddle/legacy/utils/Util.h" +
+/// Macro for registering a layer type. +/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer); +#define REGISTER_LAYER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name( \ + []() { Layer::registrar_.registerClass<__class_name>(#__type_name); }) + +#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \ + static InitFunction __reg_type_##__type_name( \ + []() { Layer::registrar_.registerClass(#__type_name, createFunction); }) +
+namespace paddle { + +class Layer; +typedef std::shared_ptr<Layer> LayerPtr; +typedef std::map<std::string, LayerPtr> LayerMap; +class NeuralNetwork; + +/// layer state, used for RNN and LSTM layers +struct LayerState { + std::vector<MatrixPtr> value; +}; +typedef std::shared_ptr<LayerState> LayerStatePtr; + +/// Paddle device ID, MKLDNN is -2, CPU is -1 +enum PADDLE_DEVICE_ID { + MKLDNN_DEVICE = -2, + CPU_DEVICE = -1, +}; +
+/** + * @brief Base class for layer. + * Define necessary variables and functions for every layer. + */ +class Layer { + protected: + /// Layer config + LayerConfig config_; + /// whether to use GPU + bool useGpu_; + /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ... + int deviceId_; + /// Input layers + std::vector<LayerPtr> inputLayers_; + /// Argument of input layers + std::vector<std::string> inputArgument_; + + /// Parameter for each input layer. + /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter. + std::vector<ParameterPtr> parameters_; + + /// nullptr if bias is not needed. + ParameterPtr biasParameter_; +
+ /// Output + Argument output_; + /// Several outputs stored on different devices, used in 'parallel_nn' case, + /// and record them by deviceId_. + /// Also used in 'use_mkldnn' case. + std::vector<Argument> outputOtherDevice_; + /// If there are several outputs, map them by each name. + /// MKLDNNLayer use it only to merge output grad + std::map<std::string, Argument*> outputMap_; + /// Used to merge grad on different devices. + MatrixPtr tmpGrad_; + + std::unique_ptr<ActivationFunction> activation_; + + /// Current passType, PASS_TRAIN or PASS_TEST + PassType passType_; + + /// Random 0-1 matrix for dropOut + MatrixPtr dropOutMask_; + + /// Whether the layer need to compute gradient + bool needGradient_; + /// Whether the layer need to compute re-sequence information + bool needSequenceInfo_; + + /// Mark input grad in(true) or out(false) of backward function. + std::vector<bool> markInBackward_; + + /// Layer forward function + std::vector<std::shared_ptr<FunctionBase>> forward_; + /// Layer backward function + std::vector<std::shared_ptr<FunctionBase>> backward_; +
+ public: + /** + * Wait until all input value ready. + * Called before Layer::forward() function. + */ + virtual void waitInputValue(); + + /** + * Copy layer's output_ to other device. + * If output layer is in other device, called after Layer::forward() function. + */ + virtual void copyOutputToOtherDevice(); + + /** + * Wait until all output grad ready and merge them to output_.grad. + * Called before Layer::backward() function. + */ + virtual void waitAndMergeOutputGrad(); + + /** + * Notify previous layer the output grad ready. + * Called after Layer::backward() function.
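+ * + * Together with waitInputValue(), copyOutputToOtherDevice() and
+ * waitAndMergeOutputGrad(), one step of a layer in 'parallel_nn' mode
+ * roughly follows this order (an illustrative driver, not an API of this
+ * class): + * @code
+ * layer->waitInputValue();
+ * layer->forward(passType);
+ * layer->copyOutputToOtherDevice();
+ * // ... later, during the backward sweep ...
+ * layer->waitAndMergeOutputGrad();
+ * layer->backward(callback);
+ * layer->markAllInputGrad();
+ * @endcode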
+ */ + virtual void markAllInputGrad(); + + protected: + /** + * Create layer function. Function is called in forward or backward. + * \param function, Layer::forward_ or Layer::backward_ + * \param name, function name + * \param config, initialization configuration for the function + */ + void createFunction(std::vector>& function, + const std::string& name, + const FuncConfig& config) { + if (useGpu_) { + function.emplace_back( + FunctionBase::funcRegistrar_.createByType(name + "-GPU")); + } else { + function.emplace_back( + FunctionBase::funcRegistrar_.createByType(name + "-CPU")); + } + auto& func = function.back(); + func->init(config); + } + + /** + * Notify specified layer the output grad ready. + * Called in the backward function. + * If do mark input grad in the backward function, you should to ensure + * that all input grad will be marked in the backward function. + */ + void markInputGrad(int inputIndex); + + /** + * Get the argument of input layer. + */ + const Argument& getInput(size_t inputIndex) const { + return inputLayers_[inputIndex]->getOutput(deviceId_); + } + + /** + * Get the argument of input layer. + */ + const Argument& getInput(const Layer& inputLayer) const { + return inputLayer.getOutput(deviceId_); + } + + /** + * Get the argument of input layer with deviceId. + */ + const Argument& getInput(size_t inputIndex, int deviceId) const { + return inputLayers_[inputIndex]->getOutput(deviceId); + } + + /** + * Get the forward-input value. + */ + const MatrixPtr& getInputValue(int inputIndex) { + return inputLayers_[inputIndex]->getOutput(deviceId_).value; + } + + /** + * Get the forward-input value. + */ + const MatrixPtr& getInputValue(const Layer& inputLayer) { + return inputLayer.getOutput(deviceId_).value; + } + + /** + * Get the forward-input value with deviceId. + */ + const MatrixPtr& getInputValue(int inputIndex, int deviceId) { + return inputLayers_[inputIndex]->getOutput(deviceId).value; + } + + /** + * Get the forward-input grad. + */ + const MatrixPtr& getInputGrad(int inputIndex) { + return inputLayers_[inputIndex]->getOutput(deviceId_).grad; + } + + /** + * Get the forward-input grad. + */ + const MatrixPtr& getInputGrad(const Layer& inputLayer) { + return inputLayer.getOutput(deviceId_).grad; + } + + /** + * Get the forward-input grad. + */ + const MatrixPtr& getInputGrad(int inputIndex, int deviceId) { + return inputLayers_[inputIndex]->getOutput(deviceId).grad; + } + + /** + * Get the forward-input label. + */ + const IVectorPtr& getInputLabel(const Layer& inputLayer) { + return inputLayer.getOutput(deviceId_).ids; + } + + /** + * Change the size of output (value, grad). + * Reset to value zero if isValueClean = true, + * Reset to grad zero if isGradClean = true. + */ + void resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean); + + /** + * Add output argument to other devices. + */ + void addOutputArgument(int deviceId); + + public: + explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu); + virtual ~Layer() {} + + /// Register a Layer + static ClassRegistrar registrar_; + + /** + * Get the flag whether layer need to compute gradient. + */ + bool needGradient() const { return needGradient_; } + + /** + * Set the flag whether layer need to compute gradient. + */ + void setNeedGradient(bool need) { needGradient_ = need; } + + /** + * Set the flag whether layer need to re-compute sequence information, + * which includes sequenceStartPositions or subSequenceStartPositions. 
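+ * + * For example, a layer whose output is not frame-aligned with its first
+ * input (a sequence pooling layer, say) would typically call
+ * setNeedSequenceInfo(false) in its init() and fill in the positions itself.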
+ */ + void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; } + + /** + * Get layer's name. + */ + const std::string& getName() const { return config_.name(); } + + /** + * Get layer's type. + */ + const std::string& getType() const { return config_.type(); } + + /** + * Get layer's size. + */ + size_t getSize() const { return config_.size(); } + + /** + * Get layer's deviceId. + */ + int getDeviceId() const { return deviceId_; } + + /** + * Add the inputLayer. + */ + void addPrev(LayerPtr l) { inputLayers_.push_back(l); } + + /** + * Get the size of inputLayer[i]. + */ + const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; } + + /** + * Get the forward-output value. + */ + const MatrixPtr& getOutputValue() { return output_.value; } + + /** + * Get the forward-output label. + */ + const IVectorPtr& getOutputLabel() { return output_.ids; } + + /** + * Get the backward-Loss value. + */ + const MatrixPtr& getOutputGrad() { return output_.grad; } + /** + * If layer has multi-output, set output into outputMap_. + */ + void setOutput(const std::string& name, Argument* output) { + outputMap_[name] = output; + } + + /** + * Get the output map size, if layer has multi-output. + */ + size_t getOutputMapSize() { return outputMap_.size(); } + + /** + * Get the output based on layer's name. + */ + Argument& getOutput(const std::string& str = "") { + if (str == "") { + return output_; + } else { + auto output = outputMap_.find(str); + if (output != outputMap_.end()) { + return *output->second; + } else { + LOG(FATAL) << "No specific output " << str; + return *((Argument*)nullptr); + } + } + } + + /** + * Get the output based on deviceId. + */ + const Argument& getOutput(int deviceId) const { + if (deviceId == getDeviceId()) { + return output_; + } else { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == deviceId) { + return outputOtherDevice_[i]; + } + } + + LOG(FATAL) << "No specific device output "; + return *((Argument*)nullptr); + } + } + + /** + * Get layer's parameters. + */ + const std::vector& getParameters() { return parameters_; } + + /** + * Get layer's bias-parameters. + */ + const ParameterPtr& getBiasParameter() { return biasParameter_; } + + /** + * Create pointer of layer. + */ + static LayerPtr create(const LayerConfig& config); + + /** + * Resize the output matrix size. + */ + void resizeOutput(size_t height, size_t width); + + /** + * Resize the output matrix size, + * and reset value to zero. + */ + void reserveOutput(size_t height, size_t width); + + /** + * Resize the output matrix size, + * and reset value and grad to zero. + */ + void resetOutput(size_t height, size_t width); + + /** + * Clear the gradient of output. + */ + void zeroGrad(); + + /** + * Intialization. + * For example, adding input layers from layerMap and parameterMap. + */ + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + /** + * Intialization for sub network if there has sub network. + * @param rootNetwork root network + * @param config model config + * @param parameterTypes parameter's type + * @param useGpu whether to use gpu or not + */ + virtual void initSubNetwork(NeuralNetwork* rootNetwork, + const ModelConfig& config, + const std::vector& parameterTypes, + bool useGpu) {} + + /** + * @brief Access SubNetwork Object. + * If subnetwork exists, then invoke callback with subnetwrk. + * @param callback if sub-network is exist, the callback is invoked. 
+ */ + virtual void accessSubNetwork( + const std::function& callback) {} + + /** + * If use sparse row matrix as parameter, + * prefetch feature ids in input label. + */ + virtual void prefetch() {} + + /** + * Forward propagation. + * All inherited implementation should call Layer::foward() function. + */ + virtual void forward(PassType passType) { + passType_ = passType; + if (!inputLayers_.empty() && needSequenceInfo_) { + const Argument& input = getInput(0); + output_.sequenceStartPositions = input.sequenceStartPositions; + output_.subSequenceStartPositions = input.subSequenceStartPositions; + output_.cpuSequenceDims = input.cpuSequenceDims; + } + } + + /** + * Reset the internal state variables. + * Allocate them if they have not been allocated. + * This function need to called before Layer::forward() for generating + * sequence. + * + * This is used for sequence generation. When generating sequence, the + * calculation at current timestamp depends on the state from previous + * timestamp. The model needs to keep the information about the previous + * timestamp in the state variables. Layers such as RecurrentLayer, + * LstmLayer and ContextLayer have state variables. + */ + virtual void resetState() {} + + /** + * Set layer state. + */ + virtual void setState(LayerStatePtr state) {} + + /** + * Get layer state. + * @return A copy of internal state. + */ + virtual LayerStatePtr getState() { return nullptr; } + + /** + * Show output state. + */ + void showOutputStats(); + + /** + * Backward propagation. + * Should only be called after Layer::forward() function. + */ + virtual void backward(const UpdateCallback& callback = nullptr) = 0; + + /** + * One pass is finished. + */ + virtual void onPassEnd() {} + + protected: + /** + * Forward of activation function. + */ + void forwardActivation(); + /** + * Backward of activation function. + */ + void backwardActivation(); + /** + * Forward of dropOut. + */ + void forwardDropOut(); + /** + * Initilize the needGradient_ flag. + */ + void initNeedFlags(); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/legacy/gserver/layers/LinearChainCRF.cpp similarity index 100% rename from paddle/gserver/layers/LinearChainCRF.cpp rename to paddle/legacy/gserver/layers/LinearChainCRF.cpp diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.h b/paddle/legacy/gserver/layers/LinearChainCRF.h new file mode 100644 index 0000000000000000000000000000000000000000..65e23905435da24a1a7554c30e33d303b05aef69 --- /dev/null +++ b/paddle/legacy/gserver/layers/LinearChainCRF.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +class LinearChainCRF { + public: + /** + * The size of para must be \f$(numClasses + 2) * numClasses\f$. + * The first numClasses values of para are for starting weights (\f$a\f$). 
+ * The next numClasses values of para are for ending weights (\f$b\f$), + * The remaning values are for transition weights (\f$w\f$). + * + * The probability of a state sequence s of length \f$L\f$ is defined as: + * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} + * + \sum_{l=1}^L x_{s_l} + * + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ + * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over + * all possible + * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF. + */ + LinearChainCRF(int numClasses, real* para); + + /** + * Calculate the negative log likelihood of s given x. + * The size of x must be length * numClasses. Each consecutive numClasses + * values are the features for one time step. + */ + real forward(real* x, int* s, int length); + + /** + * Calculate the gradient with respect to x, a, b, and w. + * backward() can only be called after a corresponding call to forward() with + * the same x, s and length. + * The gradient with respect to a, b, and w will not be calculated if + * needWGrad is false. + * @note Please call getWGrad() and getXGrad() to get the gradient with + * respect to (a, b, w) and x respectively. + */ + void backward(real* x, int* s, int length, bool needWGrad); + + /** + * Find the most probable sequence given x. The result will be stored in s. + */ + void decode(real* x, int* s, int length); + + /* + * Return the gradient with respect to (a, b, w). It can only be called after + * a corresponding call to backward(). + */ + MatrixPtr getWGrad() { return matWGrad_; } + + /* + * Return the gradient with respect to x. It can only be called after a + * corresponding call to backward(). + */ + MatrixPtr getXGrad() { return matGrad_; } + + protected: + int numClasses_; + MatrixPtr a_; + MatrixPtr b_; + MatrixPtr w_; + MatrixPtr matWGrad_; + MatrixPtr da_; + MatrixPtr db_; + MatrixPtr dw_; + MatrixPtr ones_; + + MatrixPtr expX_; + MatrixPtr matGrad_; + MatrixPtr alpha_; + MatrixPtr beta_; + MatrixPtr maxX_; + MatrixPtr expW_; + + // track_(k,i) = j means that the best sequence at time k for class i comes + // from the sequence at time k-1 for class j + IVectorPtr track_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/legacy/gserver/layers/LinearChainCTC.cpp similarity index 100% rename from paddle/gserver/layers/LinearChainCTC.cpp rename to paddle/legacy/gserver/layers/LinearChainCTC.cpp diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.h b/paddle/legacy/gserver/layers/LinearChainCTC.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c4c7bfe0cdb1bbcafbf5b847ea592eef02794a --- /dev/null +++ b/paddle/legacy/gserver/layers/LinearChainCTC.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +class LinearChainCTC { + public: + LinearChainCTC(int numClasses, bool normByTimes); + + // Calculate the negative log probability as loss + real forward(real* softmaxSeq, + int softmaxSeqLen, + int* labelSeq, + int labelSeqLen); + + // calculate the gradient + void backward(real* softmaxSeq, + real* softmaxSeqGrad, + int* labelSeq, + int labelSeqLen); + + protected: + int numClasses_, blank_, totalSegments_, totalTime_; + bool normByTimes_; + bool isInvalid_; + + MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_; + + real logProb_; + + void segmentRange(int& start, int& end, int time); +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/LstmCompute.cpp b/paddle/legacy/gserver/layers/LstmCompute.cpp new file mode 100644 index 0000000000000000000000000000000000000000..70f08e1d4efd2223e7ddec1b104e4ee63fc34de5 --- /dev/null +++ b/paddle/legacy/gserver/layers/LstmCompute.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "LstmCompute.h" +#include "hl_recurrent_apply.cuh" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +void LstmCompute::init(LayerConfig &config) { + activeNode_ = hlActiveType(config.active_type()); + activeGate_ = hlActiveType(config.active_gate_type()); + activeState_ = hlActiveType(config.active_state_type()); +} + +template <> +void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) { + hl_cpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + activeNode_, + activeGate_, + activeState_); +} + +template <> +void LstmCompute::backwardOneSequence<0>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize) { + hl_cpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + activeNode_, + activeGate_, + activeState_); +} + +template <> +void LstmCompute::forwardBatch<0>(hl_lstm_value value, + int frameSize, + int batchSize) { + for (int b = 0; b < batchSize; b++) { + forwardOneSequence<0>(value, frameSize); + + value.gateValue += frameSize * 4; + value.stateValue += frameSize; + value.stateActiveValue += frameSize; + value.outputValue += frameSize; + if (value.prevStateValue) { + value.prevStateValue += frameSize; + } + } +} + +template <> +void LstmCompute::backwardBatch<0>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { + for (int b = 0; b < batchSize; b++) { + backwardOneSequence<0>(value, grad, frameSize); + + value.gateValue += frameSize * 4; + value.stateValue += frameSize; + value.stateActiveValue += frameSize; + value.outputValue += frameSize; + if (value.prevStateValue) { + value.prevStateValue += frameSize; + } + + grad.gateGrad += frameSize * 4; + grad.stateGrad += frameSize; + grad.stateActiveGrad += frameSize; + grad.outputGrad += frameSize; + if (grad.prevStateGrad) { + grad.prevStateGrad += frameSize; + } + } +} + +} // namespace paddle diff --git 
a/paddle/gserver/layers/LstmCompute.cu b/paddle/legacy/gserver/layers/LstmCompute.cu similarity index 100% rename from paddle/gserver/layers/LstmCompute.cu rename to paddle/legacy/gserver/layers/LstmCompute.cu diff --git a/paddle/legacy/gserver/layers/LstmCompute.h b/paddle/legacy/gserver/layers/LstmCompute.h new file mode 100644 index 0000000000000000000000000000000000000000..ac40c35ef1b0a11e61b5d1b11476ffe7daff6d5e --- /dev/null +++ b/paddle/legacy/gserver/layers/LstmCompute.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ModelConfig.pb.h" +#include "hl_gpu.h" +#include "paddle/legacy/utils/Common.h" + +namespace paddle { + +class LstmCompute { + public: + void init(LayerConfig &config); + + /** + * LstmLayer batch compute API (forwardBatch, backwardBatch). + * If use batch compute api, lstm value(and grad) need to be batch structure. + * Compute order: + * forwardBatch: for 0 <= id < numBatch + * backwardBatch: for numBatch > id >= 0 + */ + template + void forwardBatch(hl_lstm_value value, int frameSize, int batchSize); + + template + void backwardBatch(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize); + + /** + * LstmLayer sequence compute API (forwardOneSequence, backwardOneSequence). + * Compute order(for each sequence): + * forwardOneSequence: + * if (!reversed) for 0 <= seqId < seqLength + * if (reversed) for seqLength > seqId >= 0 + * backwardOneSequence: + * if (!reversed) for seqLength > seqId >= 0 + * if (reversed) for 0 <= seqId < seqLength + */ + template + void forwardOneSequence(hl_lstm_value value, int frameSize); + template + void backwardOneSequence(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize); + + public: + hl_activation_mode_t activeNode_; + hl_activation_mode_t activeGate_; + hl_activation_mode_t activeState_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/LstmLayer.cpp b/paddle/legacy/gserver/layers/LstmLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..43a55d8d490faf0049d47bbca6ae1947d13e6be8 --- /dev/null +++ b/paddle/legacy/gserver/layers/LstmLayer.cpp @@ -0,0 +1,805 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "LstmLayer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Stat.h" + +DECLARE_bool(prev_batch_state); + +namespace paddle { + +REGISTER_LAYER(lstmemory, LstmLayer); + +bool LstmLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize()); + CHECK_EQ(getSize() * 7, biasParameter_->getSize()); + weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize() * 7, biasParameter_)); + if (bias_->getW()) { + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + localBias_->setData(bias_->getW()->getData()); + checkIg_->setData(bias_->getW()->getData() + getSize() * 4); + checkFg_->setData(bias_->getW()->getData() + getSize() * 5); + checkOg_->setData(bias_->getW()->getData() + getSize() * 6); + } + + if (bias_->getWGrad()) { + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + localBiasGrad_->setData(bias_->getWGrad()->getData()); + checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4); + checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5); + checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6); + } + } else { + LOG(FATAL) << "Bias should be here."; + } + reversed_ = config_.reversed(); + + // create IdentityActivation for using drop_rate + activation_.reset(ActivationFunction::create("")); + + LstmCompute::init(config_); + useBatch_ = true; + useSeqParallel_ = false; + if (useGpu_ && (getSize() == 32 || getSize() == 64)) { + useSeqParallel_ = true; + } + + return true; +} + +void LstmLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->resize(0, getSize()); + prevState_->resize(0, getSize()); + if (FLAGS_prev_batch_state) { + useBatch_ = true; + } else { + useBatch_ = false; + } +} + +void LstmLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state"; + prevOutput_->resize(state->value[0]->getHeight(), + state->value[0]->getWidth()); + prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth()); + prevOutput_->copyFrom(*(state->value[0])); + prevState_->copyFrom(*(state->value[1])); +} + +LayerStatePtr LstmLayer::getState() { + LayerStatePtr res = std::make_shared(); + if (prevOutput_->getHeight() && prevOutput_->getWidth()) { + 
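+    // Hand back deep copies (clone + copyFrom below) so the caller can stash
+    // this state and later restore it through setState() without aliasing
+    // the layer's internal prevOutput_/prevState_ buffers.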
res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + res->value.push_back(prevState_->clone(0, 0, useGpu_)); + res->value[1]->copyFrom(*prevState_); + } else { + MatrixPtr output = + Matrix::create(1, getSize(), /* trans= */ false, useGpu_); + MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_); + output->resize(0, getSize()); + state->resize(0, getSize()); + res->value.push_back(output); + res->value.push_back(state); + } + return res; +} + +void LstmLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str()); + Layer::forward(passType); + + const Argument &input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize() * 4, input.value->getWidth()); + size_t numSequences = input.getNumSequences(); + const int *starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + Matrix::resizeOrCreate(gate_.value, + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); + if (prevOutput_) { + size_t prevNumSeq = useBatch_ ? numSequences : 1; + if (prevOutput_->getHeight() == 0) { + prevOutput_->resize(prevNumSeq, getSize()); + prevState_->resize(prevNumSeq, getSize()); + prevOutput_->zeroMem(); + prevState_->zeroMem(); + } else { + CHECK_EQ(prevOutput_->getHeight(), prevNumSeq) + << "the number of sequences must be the same"; + } + Matrix::resizeOrCreate(totalState_, + prevState_->getHeight() + batchSize, + getSize(), + /*trans*/ false, + useGpu_); + state_.value = Matrix::create(nullptr, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + state_.value->setData(totalState_->getData() + + prevState_->getHeight() * getSize()); + } else { + Matrix::resizeOrCreate(state_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + } + Matrix::resizeOrCreate(preOutput_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + + if (!useBatch_) { + forwardSequence(batchSize, numSequences, starts, input.value); + } else { + if (!useSeqParallel_) { + forwardBatch(batchSize, numSequences, starts, input.value); + } else { + const int *starts = input.sequenceStartPositions->getData(useGpu_); + forwardSeqParallel(batchSize, numSequences, starts, input.value); + } + } + /* activation */ { forwardActivation(); } +} + +void LstmLayer::backward(const UpdateCallback &callback) { + REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str()); + /* Do derivation */ { backwardActivation(); } + + const Argument &input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + + Matrix::resizeOrCreate(gate_.grad, + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(state_.grad, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(preOutput_.grad, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); + state_.grad->zero(); + + const int *starts = input.sequenceStartPositions->getData(false); + if (!useBatch_) { + backwardSequence(batchSize, numSequences, starts, input.grad); + } else { + if (!useSeqParallel_) { + backwardBatch(batchSize, numSequences, starts, input.grad); + } else { + const int *starts = input.sequenceStartPositions->getData(useGpu_); + backwardSeqParallel(batchSize, numSequences, 
starts, input.grad); + } + } + + if (bias_) { + bias_->getParameterPtr()->incUpdate(callback); + } + weight_->getParameterPtr()->incUpdate(callback); +} + +void LstmLayer::forwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str()); + gate_.value->assign(*inputValue); + if (bias_) { + gate_.value->addBias(*localBias_, 1); + } + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + lstmValue.gateValue = gate_.value->getData(); + lstmValue.stateValue = state_.value->getData(); + lstmValue.stateActiveValue = preOutput_.value->getData(); + lstmValue.outputValue = output_.value->getData(); + lstmValue.prevStateValue = nullptr; + if (reversed_) { + lstmValue.gateValue += (batchSize - 1) * getSize() * 4; + lstmValue.stateValue += (batchSize - 1) * getSize(); + lstmValue.stateActiveValue += (batchSize - 1) * getSize(); + lstmValue.outputValue += (batchSize - 1) * getSize(); + } + + auto nextFrame = [&lstmValue](bool reversed, int frameSize) { + lstmValue.prevStateValue = lstmValue.stateValue; + if (!reversed) { + lstmValue.gateValue += frameSize * 4; + lstmValue.stateValue += frameSize; + lstmValue.stateActiveValue += frameSize; + lstmValue.outputValue += frameSize; + } else { + lstmValue.gateValue -= frameSize * 4; + lstmValue.stateValue -= frameSize; + lstmValue.stateActiveValue -= frameSize; + lstmValue.outputValue -= frameSize; + } + }; + + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + if (!reversed_) { + if (prevState_) { + lstmValue.prevStateValue = prevState_->getData(); + } + if (prevOutput_) { + frameGate->setData(lstmValue.gateValue); + frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1); + } + } + AsyncGpuBlock asyncGpuBlock; + for (size_t n = 0; n < numSequences; ++n) { + int length; + if (!reversed_) { + length = starts[n + 1] - starts[n]; + } else { + length = starts[numSequences - n] - starts[numSequences - n - 1]; + } + for (int l = 0; l < length; ++l) { + if (useGpu_) { + LstmCompute::forwardOneSequence<1>(lstmValue, getSize()); + } else { + LstmCompute::forwardOneSequence<0>(lstmValue, getSize()); + } + + if (l != length - 1) { + frameOutput->setData(lstmValue.outputValue); + nextFrame(reversed_, getSize()); + frameGate->setData(lstmValue.gateValue); + frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); + } + } + if (n != numSequences - 1) { + frameOutput->setData(lstmValue.outputValue); + nextFrame(reversed_, getSize()); + frameGate->setData(lstmValue.gateValue); + if (!reversed_) { + if (!prevState_) lstmValue.prevStateValue = nullptr; + if (prevOutput_) { + frameGate->mul(*frameOutput, *weight_->getW(), 1, 1); + } + } else { + lstmValue.prevStateValue = nullptr; + } + } + } + + if (!reversed_) { + if (prevState_) { + prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1)); + } + if (prevOutput_) { + prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1)); + } + } +} + +void LstmLayer::backwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str()); + MatrixPtr weightT = weight_->getW()->getTranspose(); + + hl_lstm_value lstmValue; + hl_lstm_grad lstmGrad; + 
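+  // lstmValue/lstmGrad are plain pointer bundles into gate_, state_,
+  // preOutput_ and output_. Each time step owns getSize() * 4 gate entries
+  // (input, input gate, forget gate, output gate) and getSize() entries in
+  // each of the other buffers; the nextFrame lambda below slides these
+  // pointers one frame at a time so that backwardOneSequence always sees the
+  // current and the previous frame.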
lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + lstmValue.gateValue = gate_.value->getData(); + lstmValue.stateValue = state_.value->getData(); + lstmValue.stateActiveValue = preOutput_.value->getData(); + lstmValue.outputValue = nullptr; + + if (bias_->getWGrad()) { + lstmGrad.checkIgGrad = checkIgGrad_->getData(); + lstmGrad.checkFgGrad = checkFgGrad_->getData(); + lstmGrad.checkOgGrad = checkOgGrad_->getData(); + } else { + lstmGrad.checkIgGrad = nullptr; + lstmGrad.checkFgGrad = nullptr; + lstmGrad.checkOgGrad = nullptr; + } + lstmGrad.gateGrad = gate_.grad->getData(); + lstmGrad.stateGrad = state_.grad->getData(); + lstmGrad.stateActiveGrad = nullptr; + lstmGrad.outputGrad = output_.grad->getData(); + + if (!reversed_) { + lstmValue.gateValue += (batchSize - 1) * getSize() * 4; + lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4; + lstmValue.stateValue += (batchSize - 1) * getSize(); + lstmGrad.stateGrad += (batchSize - 1) * getSize(); + lstmValue.stateActiveValue += (batchSize - 1) * getSize(); + lstmGrad.outputGrad += (batchSize - 1) * getSize(); + lstmValue.prevStateValue = lstmValue.stateValue - getSize(); + lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize(); + } else { + lstmValue.prevStateValue = lstmValue.stateValue + getSize(); + lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize(); + } + + auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) { + if (reversed) { + lstmValue.gateValue += frameSize * 4; + lstmGrad.gateGrad += frameSize * 4; + lstmValue.stateValue += frameSize; + lstmGrad.stateGrad += frameSize; + lstmValue.stateActiveValue += frameSize; + lstmGrad.outputGrad += frameSize; + lstmValue.prevStateValue = lstmValue.stateValue + frameSize; + lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize; + } else { + lstmValue.gateValue -= frameSize * 4; + lstmGrad.gateGrad -= frameSize * 4; + lstmValue.stateValue -= frameSize; + lstmGrad.stateGrad -= frameSize; + lstmValue.stateActiveValue -= frameSize; + lstmGrad.outputGrad -= frameSize; + lstmValue.prevStateValue = lstmValue.stateValue - frameSize; + lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize; + } + }; + + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + { + AsyncGpuBlock asyncGpuBlock; + for (size_t n = 0; n < numSequences; ++n) { + int length; + int start; + if (reversed_) { + length = starts[n + 1] - starts[n]; + start = starts[n]; + } else { + length = starts[numSequences - n] - starts[numSequences - n - 1]; + start = starts[numSequences - n - 1]; + } + for (int l = 0; l < length; ++l) { + if (l == length - 1) { + lstmValue.prevStateValue = nullptr; + lstmGrad.prevStateGrad = nullptr; + } + if (useGpu_) { + LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize()); + } else { + LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize()); + } + if (l != length - 1) { + frameGate->setData(lstmGrad.gateGrad); + nextFrame(reversed_, getSize()); + frameOutput->setData(lstmGrad.outputGrad); + frameOutput->mul(*frameGate, *weightT, 1, 1); + } else { + nextFrame(reversed_, getSize()); + } + } + + if (weight_->getWGrad()) { + if (!reversed_) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *gate_.grad->subMatrix(start + 1, length - 1), + 1, + 1); + } else { + 
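+        // Reversed sequence: frame l is driven by the output of frame l + 1,
+        // so the output/gate sub-matrix offsets are swapped relative to the
+        // branch above.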
weight_->getWGrad()->mul( + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *gate_.grad->subMatrix(start, length - 1), + 1, + 1); + } + } + } + } + + if (inputGrad) { + inputGrad->add(*gate_.grad); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, 1); + } +} + +void LstmLayer::forwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str()); + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + batchValue_->resizeOrCreateBatch( + batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false); + + batchValue_->resizeOrCreate(*output_.value); + batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); + if (bias_) { + gate_.value->addBias(*localBias_, 1); + } + + { + int numBatch = batchValue_->getNumBatch(); + int batchSize = 0; + AsyncGpuBlock asyncGpuBlock; + if (prevState_) { + lstmValue.prevStateValue = totalState_->getData(); + } else { + lstmValue.prevStateValue = nullptr; + } + for (int n = 0; n < numBatch; n++) { + MatrixPtr outputValue = batchValue_->getBatchValue(n); + MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n); + batchSize = outputValue->getHeight(); + + if (n != 0) { + MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); + gateValue->mul(*batch1, *weight_->getW(), 1, 1); + } else if (prevOutput_) { + Matrix::resizeOrCreate(prevBatchOutput2_, + gateValue->getHeight(), + getSize(), + false, + useGpu_); + batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); + gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1); + + batchValue_->prevOutput2Batch(*prevState_, + *totalState_->subMatrix(0, numSequences)); + } + + lstmValue.gateValue = gateValue->getData(); + lstmValue.outputValue = outputValue->getData(); + lstmValue.stateValue = + batchValue_->getBatchValue(*state_.value, n)->getData(); + lstmValue.stateActiveValue = + batchValue_->getBatchValue(*preOutput_.value, n)->getData(); + { + if (useGpu_) { + LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize); + } else { + LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize); + } + } + lstmValue.prevStateValue = lstmValue.stateValue; + } + } + { + REGISTER_TIMER_INFO("batchToSeq", getName().c_str()); + batchValue_->copyBackSeq(*output_.value); + } + if (prevOutput_) { + getPrevBatchOutput(numSequences); + getPrevBatchState(numSequences); + } +} + +void LstmLayer::getPrevBatchOutput(size_t numSequences) { + prevOutput_->resize(numSequences, getSize()); + batchValue_->getSeqOutputFromBatch(*prevOutput_, + *batchValue_->getBatchValue()); +} + +void LstmLayer::getPrevBatchState(size_t numSequences) { + prevState_->resize(numSequences, getSize()); + batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value); +} + +void LstmLayer::backwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str()); + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + + hl_lstm_grad lstmGrad; + lstmGrad.stateActiveGrad = preOutput_.grad->getData(); + + if (bias_->getWGrad()) { + lstmGrad.checkIgGrad = checkIgGrad_->getData(); + lstmGrad.checkFgGrad = 
checkFgGrad_->getData(); + lstmGrad.checkOgGrad = checkOgGrad_->getData(); + } else { + lstmGrad.checkIgGrad = nullptr; + lstmGrad.checkFgGrad = nullptr; + lstmGrad.checkOgGrad = nullptr; + } + + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + { + REGISTER_TIMER_INFO("seqToBatch", getName().c_str()); + batchGrad_->copyFromSeq(*output_.grad); + } + + { + MatrixPtr weightT = weight_->getW()->getTranspose(); + int numBatch = batchGrad_->getNumBatch(); + int batchSize = 0; + AsyncGpuBlock asyncGpuBlock; + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr outputGrad = batchGrad_->getBatchValue(n); + MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n); + + lstmValue.gateValue = + batchGrad_->getBatchValue(*gate_.value, n)->getData(); + lstmValue.stateValue = + batchGrad_->getBatchValue(*state_.value, n)->getData(); + lstmValue.stateActiveValue = + batchGrad_->getBatchValue(*preOutput_.value, n)->getData(); + lstmGrad.stateGrad = + batchGrad_->getBatchValue(*state_.grad, n)->getData(); + lstmGrad.gateGrad = gateGrad->getData(); + lstmGrad.outputGrad = outputGrad->getData(); + { + batchSize = outputGrad->getHeight(); + if (n != 0) { + lstmValue.prevStateValue = + batchGrad_->getBatchValue(*state_.value, n - 1)->getData(); + lstmGrad.prevStateGrad = + batchGrad_->getBatchValue(*state_.grad, n - 1)->getData(); + } else { + if (prevState_) { + lstmValue.prevStateValue = totalState_->getData(); + lstmGrad.prevStateGrad = nullptr; + } else { + lstmValue.prevStateValue = nullptr; + lstmGrad.prevStateGrad = nullptr; + } + } + if (useGpu_) { + LstmCompute::backwardBatch<1>( + lstmValue, lstmGrad, getSize(), batchSize); + } else { + LstmCompute::backwardBatch<0>( + lstmValue, lstmGrad, getSize(), batchSize); + } + } + + if (n != 0) { + MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize); + tmp->mul(*gateGrad, *weightT, 1, 1); + } + + if (n != 0 && weight_->getWGrad()) { + /* backward weight */ + MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); + weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1); + } else if (prevOutput_ && weight_->getWGrad()) { + weight_->getWGrad()->mul( + *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1); + } + } + } + + if (inputGrad) { + batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1); + } +} + +void LstmLayer::forwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { + REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str()); + gate_.value->assign(*inputValue); + if (bias_) { + gate_.value->addBias(*localBias_, /* scale */ 1); + } + + real *gateValue = gate_.value->getData(); + real *stateValue = state_.value->getData(); + real *outputValue = output_.value->getData(); + real *preOutputValue = preOutput_.value->getData(); + real *checkIg = checkIg_->getData(); + real *checkFg = checkFg_->getData(); + real *checkOg = checkOg_->getData(); + real *weight = weight_->getW()->getData(); + hl_lstm_parallel_forward(gateValue, + stateValue, + preOutputValue, + outputValue, + checkIg, + checkFg, + checkOg, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); +} + +void LstmLayer::backwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { + REGISTER_TIMER_INFO("LstmBwSeqParallelTime", 
getName().c_str()); + real *gateValue = gate_.value->getData(); + real *gateGrad = gate_.grad->getData(); + real *stateValue = state_.value->getData(); + real *stateGrad = state_.grad->getData(); + real *preOutputValue = preOutput_.value->getData(); + real *preOutputGrad = preOutput_.grad->getData(); + real *checkIg = checkIg_->getData(); + real *checkFg = checkFg_->getData(); + real *checkOg = checkOg_->getData(); + real *outputGrad = output_.grad->getData(); + real *weight = weight_->getW()->getData(); + + real *checkIgGrad; + real *checkFgGrad; + real *checkOgGrad; + if (bias_->getWGrad()) { + checkIgGrad = checkIgGrad_->getData(); + checkFgGrad = checkFgGrad_->getData(); + checkOgGrad = checkOgGrad_->getData(); + } else { + checkIgGrad = nullptr; + checkFgGrad = nullptr; + checkOgGrad = nullptr; + } + + hl_lstm_parallel_backward_data(gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + outputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); + + if (inputGrad) { + inputGrad->add(*gate_.grad); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, 1); + } + + real *outputValue = output_.value->getData(); + if (weight_->getWGrad()) { + real *weightGrad = weight_->getWGrad()->getData(); + hl_lstm_parallel_backward_weight(weightGrad, + outputValue, + gateGrad, + starts, + getSize(), + batchSize, + numSequences, + reversed_); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/LstmLayer.h b/paddle/legacy/gserver/layers/LstmLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..8c8b382f505d791fb1ef4265dcfe95046aa832fb --- /dev/null +++ b/paddle/legacy/gserver/layers/LstmLayer.h @@ -0,0 +1,221 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "LstmCompute.h" +#include "SequenceToBatch.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" +namespace paddle { + +/** + * @brief LstmLayer takes 1 input layer with size * 4. 
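+ * For example, a layer of size 256 must be fed 4 * 256 = 1024 input
+ * columns; each 256-wide slice supplies one of the four parts listed below.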
+ * Input layer is divided into 4 equal parts:
+ * (input_s, input_ig, input_fg, input_og)
+ *
+ * For each sequence [start, end] it performs the following computation:
+ * @code
+ * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
+ * state_{i} = actInput(input_s_{i} + bias_s +
+ *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
+ *             actGate(forgetGate_{i}) * state_{i-1}
+ * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
+ *             state_{i-1} * inputCheck
+ * outputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
+ *              state_{i} * outputCheck
+ * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
+ *              state_{i-1} * forgetCheck
+ * @endcode
+ *
+ * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
+ * - biasParameter consists of
+ *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
+ *
+ * - actInput is defined by config active_type.
+ * - actState is defined by config active_state_type.
+ * - actGate is defined by config active_gate_type.
+ *
+ * There are two ways to compute, namely one sequence by one sequence or
+ * one batch by one batch. By default, if pre_batch_state is not set to true,
+ * it computes batch by batch.
+ *
+ * The formula in the paper is as follows:
+ * \f[
+ * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
+ * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
+ * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\
+ * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
+ * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
+ * h_t = o_t tanh(c_t)
+ * \f]
+ *
+ * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+ * operations on the input sequence are NOT included in LstmLayer, so
+ * users should use fc_layer or mixed_layer before lstm_layer.
+ *
+ * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+ * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+ */
+
+class LstmLayer : public Layer, public LstmCompute {
+ public:
+  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
+
+  bool init(const LayerMap &layerMap,
+            const ParameterMap &parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback &callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+ protected:
+  /**
+   * @brief Compute lstm forward one sequence by one sequence.
+   * @param batchSize The batchSize is not equal to the batch_size in
+   * the config file. It is the total number of words of all samples
+   * in this forward batch.
+   * @param numSequences The sample number. It is equal to the batch_size
+   * in the config file.
+   * @param starts The start position of each sample.
+   * @param inputValue The input values.
+   */
+  void forwardSequence(int batchSize,
+                       size_t numSequences,
+                       const int *starts,
+                       MatrixPtr inputValue);
+  /**
+   * Compute lstm backward one sequence by one sequence.
+   */
+  void backwardSequence(int batchSize,
+                        size_t numSequences,
+                        const int *starts,
+                        MatrixPtr inputGrad);
+
+  /**
+   * Compute lstm forward one batch by one batch. The batch value is
+   * reorganized by the SequenceToBatch class. The batch output value is
+   * converted back into sequence values after forward finishes. Here, one
+   * batch contains one word of each sample. If the sample lengths are not
+   * equal, the batch does not pad zeros and simply contains fewer words.
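+   * For example, three sequences of lengths 4, 3 and 1 are packed into
+   * 4 batches of sizes 3, 2, 2 and 1:
+   * @code
+   * seq0: w00 w01 w02 w03        batch0 = {w00, w10, w20}
+   * seq1: w10 w11 w12            batch1 = {w01, w11}
+   * seq2: w20                    batch2 = {w02, w12}
+   *                              batch3 = {w03}
+   * @endcode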
+   * The total number of batches equals the maximum sequence length. For
+   * details, refer to the SequenceToBatch class. In GPU mode, a GPU kernel
+   * is launched for each iteration of the loop.
+   *
+   * @code
+   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
+   *   compute one batch.
+   * }
+   * @endcode
+   */
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int *starts,
+                    MatrixPtr inputValue);
+  /**
+   * Compute lstm backward one batch by one batch.
+   */
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int *starts,
+                     MatrixPtr inputGrad);
+
+  /**
+   * This function only supports GPU. It does not need to reorganize the
+   * input into batch values. It launches one kernel that computes the
+   * forward propagation of all sequences in parallel.
+   */
+  void forwardSeqParallel(int batchSize,
+                          size_t numSequences,
+                          const int *starts,
+                          MatrixPtr inputValue);
+  /**
+   * Backward propagation corresponding to forwardSeqParallel.
+   */
+  void backwardSeqParallel(int batchSize,
+                           size_t numSequences,
+                           const int *starts,
+                           MatrixPtr inputGrad);
+  /**
+   * This function is used for sequence generation and gets the output after
+   * forwardBatch.
+   */
+  void getPrevBatchOutput(size_t numSequences);
+  /**
+   * This function is used for sequence generation and gets the state after
+   * forwardBatch.
+   */
+  void getPrevBatchState(size_t numSequences);
+
+ protected:
+  /// Learned parameters, shape: (size, 4*size).
+  /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+  std::unique_ptr<Weight> weight_;
+  /// Learned bias parameter, shape: (1, 7 * size).
+  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
+  /// W_{co}\f$.
+  std::unique_ptr<Weight> bias_;
+  /// The real bias, pointing to \f$b_i, b_f, b_c, b_o\f$.
+  MatrixPtr localBias_;
+  /// The peephole connection for the input gate.
+  MatrixPtr checkIg_;
+  /// The peephole connection for the forget gate.
+  MatrixPtr checkFg_;
+  /// The peephole connection for the output gate.
+  MatrixPtr checkOg_;
+  /// The gradient of the real bias.
+  MatrixPtr localBiasGrad_;
+  /// The gradient of the peephole connection for input gates.
+  MatrixPtr checkIgGrad_;
+  /// The gradient of the peephole connection for forget gates.
+  MatrixPtr checkFgGrad_;
+  /// The gradient of the peephole connection for output gates.
+  MatrixPtr checkOgGrad_;
+
+  /// Stores the cell state of the previous time step, namely \f$c_{t-1}\f$.
+  Argument state_;
+  /// Stores the hidden state of the previous time step, namely \f$h_{t-1}\f$.
+  Argument preOutput_;
+  /// Stores the value and gradient of the four gates, namely
+  /// \f$i_t, f_t, o_t, c_t\f$.
+  Argument gate_;
+  /// Whether it is a reversed lstm.
+  bool reversed_;
+  /// Whether to use the batch method to compute.
+  bool useBatch_;
+  /// Whether to use the sequence parallel method to compute.
+  bool useSeqParallel_;
+  /// batchValue_ is used by the batch calculation method. It stores the
+  /// batch value of the reorganized input.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// The gradient of batchValue_.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+
+  /// Used in generation and stores the state of the previous time step.
+  MatrixPtr prevState_;
+  /// Used in generation and stores the output of the previous time step.
+  MatrixPtr prevOutput_;
+  MatrixPtr prevBatchOutput2_;
+  /// The total state.
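+  /// In generation mode forwardBatch packs prevState_ into it and uses it as
+  /// the previous cell state of the first batch.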
+ MatrixPtr totalState_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/LstmStepLayer.cpp b/paddle/legacy/gserver/layers/LstmStepLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f02f8ad62fe4d4cb4bb580923200b398c8483a99 --- /dev/null +++ b/paddle/legacy/gserver/layers/LstmStepLayer.cpp @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "LstmCompute.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/* + * LstmStepLayer used in recurrent layer group. + */ +class LstmStepLayer : public Layer, public LstmCompute { + protected: + Argument state_; + Argument gate_; + Argument stateActive_; + MatrixPtr checkIg_, checkFg_, checkOg_; + MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_; + std::unique_ptr weight_; + + public: + explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {} + + ~LstmStepLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(lstm_step, LstmStepLayer); + +bool LstmStepLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(2U, inputLayers_.size()); + + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + + if (biasParameter_.get() != NULL) { + CHECK_EQ(getSize() * 3, biasParameter_->getSize()); + weight_.reset(new Weight(1, getSize() * 3, biasParameter_)); + if (weight_->getW()) { + real* data = weight_->getW()->getData(); + checkIg_->setData(data); + checkFg_->setData(data + getSize()); + checkOg_->setData(data + getSize() * 2); + } + + if (weight_->getWGrad()) { + real* data = weight_->getWGrad()->getData(); + checkIgGrad_->setData(data); + checkFgGrad_->setData(data + getSize()); + checkOgGrad_->setData(data + getSize() * 2); + } + } + + setOutput("state", &state_); + LstmCompute::init(config_); + return true; +} + +void LstmStepLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("LstmRecurrentFwTime", getName().c_str()); + Layer::forward(passType); + + const Argument& input = getInput(0); + const Argument& prevState = getInput(1); + CHECK_EQ(getSize() * 4, input.value->getWidth()); + CHECK_EQ(getSize(), prevState.value->getWidth()); + int batchSize = 
input.getBatchSize(); + reserveOutput(batchSize, getSize()); + resetSpecifyOutput(state_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ true); + resetSpecifyOutput(gate_, + batchSize, + getSize() * 4, + /* isValueClean */ false, + /* isGradClean */ false); + resetSpecifyOutput(stateActive_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ false); + gate_.value->assign(*input.value); + + hl_lstm_value lstmValue; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + lstmValue.gateValue = gate_.value->getData(); + lstmValue.stateValue = state_.value->getData(); + lstmValue.prevStateValue = prevState.value->getData(); + lstmValue.stateActiveValue = stateActive_.value->getData(); + lstmValue.outputValue = output_.value->getData(); + + if (useGpu_) { + LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize); + } else { + LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize); + } +} + +void LstmStepLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("LstmRecurrentBwTime", getName().c_str()); + const Argument& input = getInput(0); + const Argument& prevState = getInput(1); + int batchSize = input.getBatchSize(); + + hl_lstm_value lstmValue; + hl_lstm_grad lstmGrad; + lstmValue.checkIg = checkIg_->getData(); + lstmValue.checkFg = checkFg_->getData(); + lstmValue.checkOg = checkOg_->getData(); + lstmValue.gateValue = gate_.value->getData(); + lstmValue.prevStateValue = prevState.value->getData(); + lstmValue.stateValue = state_.value->getData(); + lstmValue.stateActiveValue = stateActive_.value->getData(); + + lstmGrad.gateGrad = gate_.grad->getData(); + if (prevState.grad) { + lstmGrad.prevStateGrad = prevState.grad->getData(); + } else { + lstmGrad.prevStateGrad = nullptr; + } + lstmGrad.stateGrad = state_.grad->getData(); + lstmGrad.stateActiveGrad = stateActive_.grad->getData(); + lstmGrad.outputGrad = output_.grad->getData(); + lstmGrad.checkIgGrad = checkIgGrad_->getData(); + lstmGrad.checkFgGrad = checkFgGrad_->getData(); + lstmGrad.checkOgGrad = checkOgGrad_->getData(); + + if (useGpu_) { + LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize); + } else { + LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize); + } + + if (input.grad) { + input.grad->add(*gate_.grad); + } + + if (weight_) { + weight_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MDLstmLayer.cpp b/paddle/legacy/gserver/layers/MDLstmLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4838183e8ccb213aa249fddf5102026198e98d3c --- /dev/null +++ b/paddle/legacy/gserver/layers/MDLstmLayer.cpp @@ -0,0 +1,769 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "LstmLayer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +class CoordIterator { + public: + std::vector dims_; + std::vector directions_; + std::vector curPos_; + bool end_; + + void step(size_t d, bool reversed) { + if (directions_[d] ^ reversed) { + if (curPos_[d] == dims_[d] - 1) { + curPos_[d] = 0; + if (d) { + step(d - 1, reversed); + } else { + end_ = true; + } + } else { + curPos_[d]++; + } + } else { + if (curPos_[d] == 0) { + curPos_[d] = dims_[d] - 1; + if (d) { + step(d - 1, reversed); + } else { + end_ = true; + } + } else { + curPos_[d]--; + } + } + } + + public: + CoordIterator(std::vector dim, std::vector directions) + : dims_(dim), directions_(directions), end_(false) { + CHECK_EQ(dims_.size(), directions_.size()); + for (size_t i = 0; i < dims_.size(); i++) { + curPos_.push_back(-1); + } + } + CoordIterator& operator++() { + step(dims_.size() - 1, false); + return *this; + } + + CoordIterator& operator--() { + step(dims_.size() - 1, true); + return *this; + } + + std::vector& curPos() { return curPos_; } + + int offset() { + int offset = curPos_[0]; + for (size_t i = 1; i < dims_.size(); i++) { + offset = offset * dims_[i] + curPos_[i]; + } + return offset; + } + + int offset(const std::vector& pos) { + int offset = pos[0]; + for (size_t i = 1; i < dims_.size(); i++) { + offset = offset * dims_[i] + pos[i]; + } + return offset; + } + + std::vector& begin() { + for (size_t i = 0; i < dims_.size(); i++) { + curPos_[i] = directions_[i] ? 0 : dims_[i] - 1; + } + end_ = false; + return curPos_; + } + + std::vector& rbegin() { + for (size_t i = 0; i < dims_.size(); i++) { + curPos_[i] = directions_[i] ? dims_[i] - 1 : 0; + } + end_ = false; + return curPos_; + } + + bool end() { return end_; } + + bool getPrePos(const std::vector& delays, + int idx, + std::vector& prePos) { + bool isAvial = true; + prePos.clear(); + prePos.reserve(directions_.size()); + for (size_t i = 0; i < directions_.size(); i++) { + if (int(i) == idx) { + prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1)); + if (prePos[i] < 0) { + prePos[i] = 0; + isAvial = false; + } + if (prePos[i] >= dims_[i]) { + prePos[i] = dims_[i] - 1; + isAvial = false; + } + } else { + prePos.push_back(curPos_[i]); + } + } + return isAvial; + } + + bool getNextPos(const std::vector& delays, + int idx, + std::vector& nextPos) { + bool isAvial = true; + nextPos.clear(); + nextPos.reserve(directions_.size()); + for (size_t i = 0; i < directions_.size(); i++) { + if (int(i) == idx) { + nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1)); + if (nextPos[i] < 0) { + nextPos[i] = 0; + isAvial = false; + } + if (nextPos[i] >= dims_[i]) { + nextPos[i] = dims_[i] - 1; + isAvial = false; + } + } else { + nextPos.push_back(curPos_[i]); + } + } + return isAvial; + } +}; +/* + * MDLstmLayer takes 1 input layer with size * (3+numDims). + * For each sequence [start, end] it performs the following computation: + * out_i = actState(state_i) * actGate(outputGate_i) + * + * For example the image with 2 dims, we take the scanning order from left-top + * to right-bottom, then the 2 previous states of the current pixels are the + * ones located at left and top. And each of them has a independent forget gate. 
+ * + * state_i = actInput(input_i) * actGate(inputGate_i) + + * \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j) + * + * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) + + * \sum{j}(state_prev_i_j * inputCheck_j) + * + * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) + + * state_i * outputCheck + * + * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j * + * recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j) + * + * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize + * */ + +class MDLstmLayer : public LstmLayer { + public: + explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + protected: + void forwardOneSequence(int start, CoordIterator& coordIter); + void backwardOneSequence(int start, CoordIterator& coordIter); + void forwardGate2OutputSequence(int start, CoordIterator& coordIter); + void backwardGate2OutputSequence(int start, CoordIterator& coordIter); + + protected: + std::vector frameInputGate_; + std::vector frameForgetGate_; + std::vector frameOutputGate_; + std::vector frameInputNode_; + std::vector frameGate_; + std::vector frameState_; + std::vector framePreOutput_; + std::vector frameOutput_; + + // Activation + std::unique_ptr activationGate_; + std::unique_ptr activationState_; + + int numDims_; + size_t numBlocks_; + std::vector directions_; + std::vector delays_; + std::vector> dimsV_; +}; + +REGISTER_LAYER(mdlstmemory, MDLstmLayer); + +bool MDLstmLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + + numBlocks_ = getSize(); + numDims_ = config_.directions_size(); + CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize()); + + // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_), + // peepOg(1), then size of localBias_ is 3+numDims_ + CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize()); + weight_.reset( + new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_)); + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + + localBias_->setData(bias_->getW()->getData()); + checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_)); + checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + 
numDims_)); + checkOg_->setData(bias_->getW()->getData() + + numBlocks_ * (4 + 2 * numDims_)); + + if (bias_->getWGrad()) { + localBiasGrad_->setData(bias_->getWGrad()->getData()); + checkIgGrad_->setData(bias_->getWGrad()->getData() + + numBlocks_ * (3 + numDims_)); + checkFgGrad_->setData(bias_->getWGrad()->getData() + + numBlocks_ * (4 + numDims_)); + checkOgGrad_->setData(bias_->getWGrad()->getData() + + numBlocks_ * (4 + 2 * numDims_)); + } + } else { + LOG(FATAL) << "Bias should be here."; + } + for (int i = 0; i < numDims_; i++) { + directions_.push_back(config_.directions(i)); + } + for (int i = 0; i < numDims_; i++) { + delays_.push_back(-1); + } + activationGate_.reset(ActivationFunction::create(config_.active_gate_type())); + activationState_.reset( + ActivationFunction::create(config_.active_state_type())); + + return true; +} + +void MDLstmLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + int numSequences = input.getNumSequences(); + resetOutput(batchSize, numBlocks_); + CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth()); + const int* starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + int* dimsData = input.cpuSequenceDims->getData(); + CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences); + + for (int i = 0; i < numSequences; i++) { + std::vector dims; + for (int j = 0; j < numDims_; j++) { + dims.push_back(dimsData[i * numDims_ + j]); + } + dimsV_.push_back(dims); + } + + frameInputGate_.reserve(batchSize); + frameForgetGate_.reserve(batchSize); + frameOutputGate_.reserve(batchSize); + frameInputNode_.reserve(batchSize); + frameGate_.reserve(batchSize); + frameState_.reserve(batchSize); + framePreOutput_.reserve(batchSize); + frameOutput_.reserve(batchSize); + + Matrix::resizeOrCreate(gate_.value, + /* height= */ batchSize, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + + for (int i = frameGate_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + frameGate_.push_back(arg); + } + for (int i = frameInputGate_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + frameInputGate_.push_back(arg); + } + for (int i = frameForgetGate_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + frameForgetGate_.push_back(arg); + } + for (int i = frameOutputGate_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + frameOutputGate_.push_back(arg); + } + for (int i = frameInputNode_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= 
*/ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + frameInputNode_.push_back(arg); + } + for (int i = frameState_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create( + /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); + frameState_.push_back(arg); + } + for (int i = framePreOutput_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create( + /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); + framePreOutput_.push_back(arg); + } + for (int i = frameOutput_.size(); i < batchSize; i++) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + frameOutput_.push_back(arg); + } + + for (int i = 0; i < batchSize; i++) { + frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_); + frameGate_[i].value->setData(gate_.value->getData() + + i * numBlocks_ * (3 + numDims_)); + frameInputNode_[i].value->setData(gate_.value->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * 0); + frameInputGate_[i].value->setData(gate_.value->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * 1); + frameForgetGate_[i].value->setData(gate_.value->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * 2); + frameOutputGate_[i].value->setData(gate_.value->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * (2 + numDims_)); + } + + AsyncGpuBlock asyncGpuBlock; + gate_.value->assign(*input.value); + + if (bias_) { + gate_.value->addBias(*localBias_, 1); + } + + for (int i = 0; i < numSequences; i++) { + CoordIterator coordIter(dimsV_[i], directions_); + forwardOneSequence(starts[i], coordIter); + } +} + +void MDLstmLayer::forwardGate2OutputSequence(int start, + CoordIterator& coordIter) { + int idxCurr = start + coordIter.offset(); + std::vector preOffsetV; + preOffsetV.reserve(numDims_); + for (int i = 0; i < numDims_; i++) { + std::vector prePos; + if (coordIter.getPrePos(delays_, i, prePos)) { + preOffsetV[i] = coordIter.offset(prePos); + } else { + preOffsetV[i] = -1; + } + } + + for (int i = 0; i < numDims_; i++) { + if (preOffsetV[i] >= 0) { + frameInputGate_[idxCurr].value->addDotMul( + *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0); + + MatrixPtr fgGateOneDim = Matrix::create( + frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + MatrixPtr checkFgOneDim = + Matrix::create(checkFg_->getData() + i * numBlocks_, + 1.0, + numBlocks_, + false, + useGpu_); + fgGateOneDim->addDotMul( + *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0); + } + } + auto status = activationGate_->forward(frameInputGate_[idxCurr]); + status.check(); + status = activationGate_->forward(frameForgetGate_[idxCurr]); + status.check(); + status = activation_->forward(frameInputNode_[idxCurr]); + status.check(); + + frameState_[idxCurr].value->zeroMem(); + for (int i = 0; i < numDims_; i++) { + if (preOffsetV[i] >= 0) { + MatrixPtr fgGateOneDim = Matrix::create( + frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + frameState_[idxCurr].value->addDotMul( + *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0); + } + } + frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value, + *frameInputGate_[idxCurr].value, + 1.0, + 1.0); + + 
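+  // Output-gate peephole: unlike the input and forget gates above, which
+  // peek at the previous states (state_prev_i_j), the output gate peeks at
+  // the cell state just computed for this position (checkOg_ * state_i).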
frameOutputGate_[idxCurr].value->addDotMul( + *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0); + status = activationGate_->forward(frameOutputGate_[idxCurr]); + status.check(); + + framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value)); + status = activationState_->forward(framePreOutput_[idxCurr]); + status.check(); + + frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value, + *frameOutputGate_[idxCurr].value); +} + +void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) { + for (coordIter.begin(); !coordIter.end(); ++coordIter) { + int offset = coordIter.offset(); + for (int i = 0; i < numDims_; i++) { + std::vector prePos; + if (coordIter.getPrePos(delays_, i, prePos)) { + int preOffset = coordIter.offset(prePos); + frameGate_[start + offset].value->mul( + *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0); + } + } + forwardGate2OutputSequence(start, coordIter); + } +} + +void MDLstmLayer::backward(const UpdateCallback& callback) { + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + const int* starts = input.sequenceStartPositions->getData(false); + size_t numSequences = input.getNumSequences(); + + Matrix::resizeOrCreate(gate_.grad, + /* height= */ batchSize, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + + for (int i = 0; i < batchSize; i++) { + if (frameState_[i].grad == NULL) + frameState_[i].grad = Matrix::create( + /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); + } + for (int i = 0; i < batchSize; i++) { + if (framePreOutput_[i].grad == NULL) + framePreOutput_[i].grad = Matrix::create( + /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_); + } + + for (int i = 0; i < batchSize; i++) { + frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_); + frameGate_[i].grad->setData(gate_.grad->getData() + + i * numBlocks_ * (3 + numDims_)); + frameInputNode_[i].grad->setData(gate_.grad->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * 0); + frameInputGate_[i].grad->setData(gate_.grad->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * 1); + frameForgetGate_[i].grad->setData(gate_.grad->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * 2); + frameOutputGate_[i].grad->setData(gate_.grad->getData() + + i * numBlocks_ * (3 + numDims_) + + numBlocks_ * (2 + numDims_)); + } + + { + AsyncGpuBlock asyncGpuBlock; + + for (size_t i = 0; i < numSequences; i++) { + CoordIterator coordIter(dimsV_[i], directions_); + backwardOneSequence(starts[i], coordIter); + } + } + + if (input.grad) { + input.grad->add(*gate_.grad); + } + if (bias_ && bias_->getWGrad()) { + localBiasGrad_->collectBias(*gate_.grad, 1); + bias_->getParameterPtr()->incUpdate(callback); + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +void MDLstmLayer::backwardGate2OutputSequence(int start, + CoordIterator& coordIter) { + int idxCurr = start + coordIter.offset(); + std::vector preOffsetV; + std::vector nextOffsetV; + preOffsetV.reserve(numDims_); + nextOffsetV.reserve(numDims_); + for (int i = 0; i < numDims_; i++) { + std::vector prePos; + if (coordIter.getPrePos(delays_, i, prePos)) { + preOffsetV[i] = coordIter.offset(prePos); + } else { + preOffsetV[i] = -1; + } + std::vector nextPos; + if (coordIter.getNextPos(delays_, i, nextPos)) { + nextOffsetV[i] = coordIter.offset(nextPos); + } else { + nextOffsetV[i] = -1; + } + } + + framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad, + 
*frameOutputGate_[idxCurr].value); + activationState_->backward(framePreOutput_[idxCurr]).check(); + frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad)); + + frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad, + *framePreOutput_[idxCurr].value); + activationGate_->backward(frameOutputGate_[idxCurr]).check(); + + frameState_[idxCurr].grad->addDotMul( + *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0); + for (int i = 0; i < numDims_; i++) { + if (nextOffsetV[i] >= 0) { + frameState_[idxCurr].grad->addDotMul( + *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0); + + MatrixPtr fgGateOneDimGrad = Matrix::create( + frameForgetGate_[start + nextOffsetV[i]].grad->getData() + + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + MatrixPtr fgGateOneDimVal = Matrix::create( + frameForgetGate_[start + nextOffsetV[i]].value->getData() + + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + MatrixPtr checkFgOneDim = Matrix::create( + checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_); + + frameState_[idxCurr].grad->addDotMul( + *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0); + frameState_[idxCurr].grad->addDotMul( + *frameState_[start + nextOffsetV[i]].grad, + *fgGateOneDimVal, + 1.0, + 1.0); + } + } + + frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad, + *frameInputGate_[idxCurr].value); + frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad, + *frameInputNode_[idxCurr].value); + + frameForgetGate_[idxCurr].grad->zeroMem(); + for (int i = 0; i < numDims_; i++) { + if (preOffsetV[i] >= 0) { + MatrixPtr fgGateOneDimGrad = Matrix::create( + frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad, + *frameState_[start + preOffsetV[i]].value, + 1.0, + 1.0); + } + } + + activationGate_->backward(frameInputGate_[idxCurr]).check(); + activationGate_->backward(frameForgetGate_[idxCurr]).check(); + activation_->backward(frameInputNode_[idxCurr]).check(); + + if (bias_->getWGrad()) { + for (int i = 0; i < numDims_; i++) { + if (preOffsetV[i] >= 0) { + checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad, + *frameState_[start + preOffsetV[i]].value, + 1.0, + 1.0); + + MatrixPtr fgGateOneDimGrad = Matrix::create( + frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + MatrixPtr checkFgOneDimGrad = + Matrix::create(checkFgGrad_->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); + checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad, + *frameState_[start + preOffsetV[i]].value, + 1.0, + 1.0); + } + } + checkOgGrad_->addDotMul( + *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0); + } +} + +void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) { + MatrixPtr weightT = weight_->getW()->getTranspose(); + for (coordIter.rbegin(); !coordIter.end(); --coordIter) { + int offset = coordIter.offset(); + backwardGate2OutputSequence(start, coordIter); + for (int i = 0; i < numDims_; i++) { + std::vector prePos; + if (coordIter.getPrePos(delays_, i, prePos)) { + int preOffset = coordIter.offset(prePos); + frameOutput_[start + preOffset].grad->mul( + *frameGate_[start + offset].grad, *weightT, 1.0, 1.0); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *frameOutput_[start + preOffset].value->getTranspose(), + *frameGate_[start + offset].grad, + 1.0, + 1.0); + } + } + } + } +} + +} // namespace paddle diff 
--git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp similarity index 100% rename from paddle/gserver/layers/MKLDNNAddtoLayer.cpp rename to paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNAddtoLayer.h rename to paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/legacy/gserver/layers/MKLDNNBase.h similarity index 100% rename from paddle/gserver/layers/MKLDNNBase.h rename to paddle/legacy/gserver/layers/MKLDNNBase.h diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp similarity index 100% rename from paddle/gserver/layers/MKLDNNBatchNormLayer.cpp rename to paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNBatchNormLayer.h rename to paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp similarity index 100% rename from paddle/gserver/layers/MKLDNNConcatLayer.cpp rename to paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNConcatLayer.h rename to paddle/legacy/gserver/layers/MKLDNNConcatLayer.h diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b47bf14821fed4057227c80bb77e584649ab3145 --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp @@ -0,0 +1,388 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MKLDNNConvLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer); + +bool MKLDNNConvLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet"; + CHECK_EQ(inputLayers_.size(), parameters_.size()); + CHECK(config_.shared_biases()) << "Only support shared biases yet"; + + oc_ = config_.num_filters(); + const ConvConfig& conf = config_.inputs(0).conv_conf(); + ic_ = conf.channels(); + fw_ = conf.filter_size(); + fh_ = conf.filter_size_y(); + pw_ = conf.padding(); + ph_ = conf.padding_y(); + dw_ = conf.dilation(); + dh_ = conf.dilation_y(); + sw_ = conf.stride(); + sh_ = conf.stride_y(); + gp_ = conf.groups(); + oh_ = conf.output_y(); + ow_ = conf.output_x(); + ih_ = conf.img_size_y(); + iw_ = conf.img_size(); + caffeMode_ = conf.caffe_mode(); + CHECK(caffeMode_) << "Only support caffe mode yet"; + CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet"; + // check group setting + CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc"; + CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic"; + + // create weight + size_t height = oc_ / gp_; + size_t width = ic_ * fh_ * fw_; + CHECK_EQ(parameters_[0]->getSize(), height * width); + weight_ = + std::unique_ptr(new Weight(height, width, parameters_[0], 0)); + + // create biases + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); + } + return true; +} + +void MKLDNNConvLayer::convertWeightsFromPaddle() { + if (hasInitedWgt_) { + return; + } + + CHECK(wgtVal_) << "should have been initialized"; + // the paddle weight format is oihw or goihw + auto targetDim = wgtVal_->getDims(); + auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw; + wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); + hasInitedWgt_ = true; +} + +void MKLDNNConvLayer::convertWeightsToPaddle() { + CHECK(wgtVal_) << "should have been initialized"; + auto targetDim = wgtVal_->getDims(); + auto dstFmt = (gp_ == 1) ? 
memory::format::oihw : memory::format::goihw; + wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); +} + +void MKLDNNConvLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + reshapeInput(bs, ih, iw); + + // cal output sizes + // oc can not be changed + int fh = (fh_ - 1) * dh_ + 1; + int fw = (fw_ - 1) * dw_ + 1; + oh = outputSize(ih, fh, ph_, sh_, caffeMode_); + ow = outputSize(iw, fw, pw_, sw_, caffeMode_); + + reshapeOutput(oh, ow); + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNConvLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdPD(fwdPD_); + + resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out); +} + +void MKLDNNConvLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr bwdWgtPD; + std::shared_ptr bwdDataPD; + + resetBwdWgtPD(bwdWgtPD); + + resetBwdDataPD(bwdDataPD); + + resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); + + resetBwdPipeline( + pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); +} + +void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { + weight_->getParameterPtr()->incUpdate(callback); + if (biases_ && biases_->getWGrad()) { + biases_->getParameterPtr()->incUpdate(callback); + } +} + +void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt, + memory::dims& bias, + memory::dims& stride, + memory::dims& dilation, + memory::dims& padL, + memory::dims& padR) { + wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_} + : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_}; + bias = memory::dims{oc_}; + stride = memory::dims{sh_, sw_}; + padL = memory::dims{ph_, pw_}; + padR = getPaddingR(); + // note: mkldnn dilation start from 0 + dilation = memory::dims{dh_ - 1, dw_ - 1}; +} + +void MKLDNNConvLayer::resetFwdPD( + std::shared_ptr& pd) { + // dims for conv + memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; + memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; + memory::dims wgtDims, biasDims, strides, dilations, padL, padR; + loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); + + prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring + : prop_kind::forward_training; + algorithm algo = algorithm::convolution_direct; + padding_kind padKind = padding_kind::zero; + conv_fwd::desc fwdDesc = + biases_ && biases_->getW() + ? 
conv_fwd::desc(pk, + algo, + MKLDNNMatrix::createMemoryDesc(inDims), + MKLDNNMatrix::createMemoryDesc(wgtDims), + MKLDNNMatrix::createMemoryDesc(biasDims), + MKLDNNMatrix::createMemoryDesc(outDims), + strides, + dilations, + padL, + padR, + padKind) + : conv_fwd::desc(pk, + algo, + MKLDNNMatrix::createMemoryDesc(inDims), + MKLDNNMatrix::createMemoryDesc(wgtDims), + MKLDNNMatrix::createMemoryDesc(outDims), + strides, + dilations, + padL, + padR, + padKind); + pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_)); +} + +void MKLDNNConvLayer::resetFwdBuffers( + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + CHECK(pd); + resetInValue( + in, std::make_shared(pd->src_primitive_desc())); + + resetOutValue(out, pd->dst_primitive_desc()); + + resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); + + if (biases_ && biases_->getW()) { + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); + } else { + bias = nullptr; + } +} + +void MKLDNNConvLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + if (bias) { + fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); + } else { + fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); + } + pipeline.push_back(*fwd_); +} + +void MKLDNNConvLayer::resetBwdWgtPD( + std::shared_ptr& pd) { + memory::dims wgtDims, biasDims, strides, dilations, padL, padR; + loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); + + // create backward weight using input, output and weight value memory desc + CHECK(inVals_[0]) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; + CHECK(wgtVal_) << "Should have weight value"; + algorithm algo = algorithm::convolution_direct; + padding_kind padKind = padding_kind::zero; + auto bwdWgtDesc = biasVal_ != nullptr + ? 
conv_bwdWgt::desc(algo, + inVals_[0]->getMemoryDesc(), + wgtVal_->getMemoryDesc(), + biasVal_->getMemoryDesc(), + outVal_->getMemoryDesc(), + strides, + padL, + padR, + padKind) + : conv_bwdWgt::desc(algo, + inVals_[0]->getMemoryDesc(), + wgtVal_->getMemoryDesc(), + outVal_->getMemoryDesc(), + strides, + padL, + padR, + padKind); + pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_)); + CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc()); + CHECK_PRIMITIVE_DESC_EQ( + outVal_, + pd->diff_dst_primitive_desc(), + "primitive desc of out value and grad should be equal"); + CHECK_PRIMITIVE_DESC_EQ( + wgtVal_, + pd->diff_weights_primitive_desc(), + "primitive desc of weight value and grad should be equal"); +} + +void MKLDNNConvLayer::resetBwdDataPD( + std::shared_ptr& pd) { + pd = nullptr; + if (inputLayers_[0]->getOutput().grad == nullptr) { + return; + } + + memory::dims wgtDims, biasDims, strides, dilations, padL, padR; + loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); + CHECK(inVals_[0]) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; + // create backward data using input and output value memory desc + // but using weight memory desc with any format + auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, + inVals_[0]->getMemoryDesc(), + MKLDNNMatrix::createMemoryDesc(wgtDims), + outVal_->getMemoryDesc(), + strides, + padL, + padR, + padding_kind::zero); + pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_)); + CHECK_PRIMITIVE_DESC_EQ( + inVals_[0], + pd->diff_src_primitive_desc(), + "primitive desc of in value and grad should be equal"); + CHECK_PRIMITIVE_DESC_EQ( + outVal_, + pd->diff_dst_primitive_desc(), + "primitive desc of out value and grad should be equal"); +} + +void MKLDNNConvLayer::resetBwdBuffers( + std::shared_ptr& wgtPD, + std::shared_ptr& dataPD, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + CHECK(wgtPD); + resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); + + resetWithMatrix( + wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); + CHECK_PRIMITIVE_DESC_EQ( + wgtVal_, + wgt->getPrimitiveDesc(), + "primitive desc of weight grad and value should be equal"); + + bias = nullptr; + if (biases_ && biases_->getWGrad()) { + resetWithMatrix( + bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); + CHECK(bias); + CHECK_PRIMITIVE_DESC_EQ( + biasVal_, + bias->getPrimitiveDesc(), + "primitive desc of bias grad and value should be equal"); + } + + if (dataPD == nullptr) { + return; + } + resetInGrad(in, dataPD->diff_src_primitive_desc()); + resetWgtValBwdData(dataPD, wgtValBwdData_); +} + +void MKLDNNConvLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& wgtPD, + std::shared_ptr& dataPD, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0]); + // add bwdWgt handle + if (bias) { + bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias)); + } else { + bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt)); + } + pipeline.push_back(*bwdWgt_); + + if (dataPD == nullptr) { + return; + } + if (cvtWgtVal_) { + pipeline.push_back(*cvtWgtVal_); + } + // add bwdData handle + CHECK(wgtValBwdData_) << "Should have weight memory"; + bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); + pipeline.push_back(*bwdData_); +} + +void MKLDNNConvLayer::resetWgtValBwdData( + 
std::shared_ptr& dataPD, + MKLDNNMatrixPtr& wgt) { + if (dataPD == nullptr) { + return; + } + + // create new weight value for backward data, and create reorder if necessary + // since the primitive_desc would be different with wgtVal_ + CHECK(wgtVal_) << "should have weight value"; + if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) { + wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc()); + cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_); + CHECK(cvtWgtVal_); + } else { + wgtValBwdData_ = wgtVal_; + } + VLOG(MKLDNN_FMTS) << "weight value format for backward data: " + << wgtValBwdData_->getFormat(); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/legacy/gserver/layers/MKLDNNConvLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNConvLayer.h rename to paddle/legacy/gserver/layers/MKLDNNConvLayer.h diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f3747c7db84ef53fdcfa3741525a754fab63bca5 --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp @@ -0,0 +1,262 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNFcLayer.h" +#include "paddle/legacy/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer); + +bool MKLDNNFcLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet"; + CHECK_EQ(inputLayers_.size(), parameters_.size()); + CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet"; + + // output size, cat not be changed + oc_ = getSize(); + oh_ = 1; + ow_ = 1; + ih_ = 1; + iw_ = 1; + + // input size can not change in FC + iLayerSize_ = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_); + + // create weight + weight_ = + std::unique_ptr(new Weight(oc_, iLayerSize_, parameters_[0], 0)); + + // create biases + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, oc_, biasParameter_, 0)); + } + return true; +} + +void MKLDNNFcLayer::convertWeightsFromPaddle() { + if (hasInitedWgt_) { + return; + } + + CHECK(wgtVal_) << "should have been initialized"; + auto targetDim = wgtVal_->getDims(); + auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo; + wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); + hasInitedWgt_ = true; +} + +void MKLDNNFcLayer::convertWeightsToPaddle() { + CHECK(wgtVal_) << "should have been initialized"; + auto targetDim = wgtVal_->getDims(); + auto dstFmt = targetDim.size() == 2 ? 
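+  // (Mirrors convertWeightsFromPaddle(): purely 2-D weight tensors use the
+  // io layout, while weights that still carry spatial dimensions use ihwo.)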
format::io : format::ihwo; + wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); +} + +void MKLDNNFcLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + reshapeInput(bs, ih, iw); + + CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); + ic = iLayerSize_ / (ih * iw); + CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible"; + CHECK_EQ(size_t(oc), getSize()); + + reshapeOutput(oh, ow); + resizeOutput(bs, oc); +} + +void MKLDNNFcLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out); + + resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out); +} + +void MKLDNNFcLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr bwdWgtPD; + std::shared_ptr bwdDataPD; + + resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out); + + resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out); + + resetBwdDataPD(bwdDataPD, inputs[0], out); + + resetBwdPipeline( + pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out); +} + +void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { + weight_->getParameterPtr()->incUpdate(callback); + if (biases_ && biases_->getWGrad()) { + biases_->getParameterPtr()->incUpdate(callback); + } +} + +void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + resetInValue(in); + CHECK(in); + in->downSpatial(); + + auto outPD = + MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); + resetOutValue(out, outPD); + + format wgtFmt = format::oihw; + if (in->getFormat() == format::nChw8c) { + wgtFmt = format::oIhw8i; + } else if (in->getFormat() == format::nChw16c) { + wgtFmt = format::oIhw16i; + } + auto wgtPD = + MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_); + resetWithMatrix(wgt, weight_->getW(), wgtPD); + wgt->downSpatial(); + + if (biases_ && biases_->getW()) { + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); + } else { + bias = nullptr; + } +} + +void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr wgt, + MKLDNNMatrixPtr bias, + MKLDNNMatrixPtr out) { + CHECK(in); + CHECK(wgt); + CHECK(out); + prop_kind pk = prop_kind::forward; + fc_fwd::desc fwdDesc = bias != nullptr ? 
fc_fwd::desc(pk, + in->getMemoryDesc(), + wgt->getMemoryDesc(), + bias->getMemoryDesc(), + out->getMemoryDesc()) + : fc_fwd::desc(pk, + in->getMemoryDesc(), + wgt->getMemoryDesc(), + out->getMemoryDesc()); + pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_)); +} + +void MKLDNNFcLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + if (bias) { + fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out)); + } else { + fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out)); + } + pipeline.push_back(*fwd_); +} + +void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); + + CHECK(wgtVal_); + resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); + + if (biasVal_) { + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); + } else { + bias = nullptr; + } +} + +void MKLDNNFcLayer::resetBwdWgtPD( + std::shared_ptr& pd, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0]); + fc_bwdWgt::desc bwdWgtDesc = + bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(), + wgt->getMemoryDesc(), + bias->getMemoryDesc(), + out->getMemoryDesc()) + : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(), + wgt->getMemoryDesc(), + out->getMemoryDesc()); + pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_)); +} + +void MKLDNNFcLayer::resetBwdDataPD( + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + CHECK(wgtVal_); + fc_bwdData::desc bwdDataDesc = fc_bwdData::desc( + in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc()); + pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_)); +} + +void MKLDNNFcLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& bwdWgtPD, + std::shared_ptr& bwdDataPD, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& wgt, + MKLDNNMatrixPtr& bias, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0]); + if (bias) { + bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias)); + } else { + bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt)); + } + pipeline.push_back(*bwdWgt_); + + if (bwdDataPD == nullptr) { + return; + } + CHECK(wgtVal_) << "Should have weight memory"; + bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/legacy/gserver/layers/MKLDNNFcLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNFcLayer.h rename to paddle/legacy/gserver/layers/MKLDNNFcLayer.h diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..739482348f71bf144551cd1d881f1f1d7d69201f --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp @@ -0,0 +1,163 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLRNLayer.h" +#include "paddle/legacy/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer); + +bool MKLDNNLRNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + /* the size of inputs for norm-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + const NormConfig& conf = config_.inputs(0).norm_conf(); + localSize_ = conf.size(); + alpha_ = conf.scale(); + beta_ = conf.pow(); + + ic_ = conf.channels(); + oc_ = ic_; + iw_ = conf.img_size(); + ow_ = conf.output_x(); + ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + CHECK_EQ(iw_, ow_); + CHECK_EQ(ih_, oh_); + return true; +} + +void MKLDNNLRNLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + CHECK_EQ(inputLayers_.size(), 1UL); + reshapeInput(bs, ih, iw); + // ic_ and oc can not be changed + CHECK_EQ((size_t)ic, + inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) + << "Input channel can not be changed"; + oh = ih; + ow = iw; + reshapeOutput(oh, ow); + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNLRNLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], out); + + resetFwdPD(fwdPD_, inputs[0], out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); +} + +void MKLDNNLRNLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr pd; + + resetBwdBuffers(inputs[0], out); + + resetBwdPD(pd, inputs[0], out); + + resetBwdPipeline(pipeline, pd, inputs[0], out); +} + +void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + resetInValue(in); + CHECK(in); + resetOutValue(out, in->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out) { + prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring + : prop_kind::forward_training; + auto fwdDesc = lrn_fwd::desc(pk, + algorithm::lrn_across_channels, + in->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_)); + // prepare workspace if necessary + workspace_ = + passType_ != PASS_TEST + ? std::make_shared(memory(pd->workspace_primitive_desc())) + : nullptr; +} + +void MKLDNNLRNLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + fwd_ = workspace_ + ? 
std::make_shared(lrn_fwd(*pd, *in, *workspace_, *out)) + : std::make_shared(lrn_fwd(*pd, *in, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + CHECK(out); + auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels, + in->getMemoryDesc(), + out->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); +} + +void MKLDNNLRNLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + if (pd == nullptr) { + return; + } + CHECK(inVals_[0]); + CHECK(workspace_); + bwdData_ = std::make_shared( + lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNLRNLayer.h rename to paddle/legacy/gserver/layers/MKLDNNLRNLayer.h diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLayer.cpp similarity index 100% rename from paddle/gserver/layers/MKLDNNLayer.cpp rename to paddle/legacy/gserver/layers/MKLDNNLayer.cpp diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..94dc8625f68985a16bd68a6e36a1ad607d77a7cb --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLDNNLayer.h @@ -0,0 +1,477 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "MKLDNNBase.h" +#include "mkldnn.hpp" +#include "paddle/legacy/math/MKLDNNMatrix.h" +#include "paddle/legacy/utils/Stat.h" + +DECLARE_bool(use_mkldnn); + +namespace paddle { + +class MKLDNNLayer; +typedef std::shared_ptr MKLDNNLayerPtr; + +/** + * @brief Base class of MKLDNNlayer. 
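+ * It owns the mkldnn engine and stream, the internal/external
+ * MKLDNNMatrix buffers with their reorder handles, and drives the
+ * reshape / resetFwd / resetBwd hooks implemented by each concrete layer.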
+ *
+ */
+class MKLDNNLayer : public Layer {
+ protected:
+ // batch size
+ int bs_;
+ // their sizes are always from the first input layer
+ // input image channel, height and width
+ int ic_, ih_, iw_;
+ // output image channel, height and width
+ int oc_, oh_, ow_;
+
+ // the condition under which forward needs to be reset
+ size_t condition_;
+ // backward also needs to be reset after the forward handle is reset
+ bool needResetBwd_;
+
+ // whether the output is mkldnn only
+ bool outputOnlyMKLDNN_;
+
+ // mkldnn engine, stream and primitives
+ mkldnn::engine engine_;
+ std::shared_ptr stream_;
+ std::shared_ptr fwd_;
+ std::shared_ptr bwdWgt_;
+ std::shared_ptr bwdData_;
+ std::vector pipelineFwd_;
+ std::vector pipelineBwd_;
+
+ /* Value and grad are separated as internal and external buffers.
+ * Each MKLDNNLayer must init or reset its internal buffer at least,
+ * and the external buffer format is always nchw or nc (when h==w==1),
+ * which is the same format as paddle.
+ * The output_.value and output_.grad always save the external data,
+ * when mixed with cpu device.
+ * When all layers are mkldnn layers, they could save internal data.
+ */
+ // below MKLDNNMatrix buffers are all internal buffers
+ std::vector inVals_;
+ std::vector inGrads_;
+ MKLDNNMatrixPtr outVal_;
+ MKLDNNMatrixPtr outGrad_;
+ // below are external value and grad
+ std::vector extInVals_;
+ std::vector extInGrads_;
+ MKLDNNMatrixPtr extOutVal_;
+ MKLDNNMatrixPtr extOutGrad_;
+ // convert handles between external and internal buffers
+ std::vector> cvtInVals_;
+ std::vector> cvtInGrads_;
+ std::shared_ptr cvtOutVal_;
+ std::shared_ptr cvtOutGrad_;
+
+ // weight and bias are always internal buffers
+ MKLDNNMatrixPtr wgtVal_;
+ MKLDNNMatrixPtr wgtGrad_;
+ MKLDNNMatrixPtr biasVal_;
+ MKLDNNMatrixPtr biasGrad_;
+
+ // merge grad primitive
+ std::shared_ptr mergeGrad_;
+ std::vector pipelineMergeGrad_;
+ // tmp input argument to save input grad, only used to merge grad
+ Argument tmpInArg_;
+
+ public:
+ explicit MKLDNNLayer(const LayerConfig& config)
+ : Layer(config),
+ ih_(0),
+ iw_(0),
+ condition_(0),
+ needResetBwd_(true),
+ outputOnlyMKLDNN_(false),
+ engine_(mkldnn::engine::cpu, 0),
+ stream_(nullptr),
+ fwd_(nullptr),
+ bwdWgt_(nullptr),
+ bwdData_(nullptr) {}
+
+ ~MKLDNNLayer() {}
+
+ virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+ virtual void forward(PassType passType);
+ virtual void backward(const UpdateCallback& callback);
+
+ /**
+ * reshape the input and output channels and image sizes
+ * and reset the output buffer size
+ */
+ virtual void reshape(
+ int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
+
+ /**
+ * reset the mkldnn forward primitive and memories
+ * will only be called when the input size changes
+ * weight and bias buffers should be covered by the child class itself
+ */
+ virtual void resetFwd(std::vector& pipeline,
+ std::vector& inputs,
+ MKLDNNMatrixPtr& out) = 0;
+
+ /**
+ * reset the mkldnn backward primitive and memories
+ * will only be called when needed
+ * weight and bias buffers should be covered by the child class itself
+ */
+ virtual void resetBwd(std::vector& pipeline,
+ std::vector& inputs,
+ MKLDNNMatrixPtr& out) = 0;
+
+ /**
+ * Update weights and biases if necessary.
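+ * The default implementation below is a no-op; layers that own
+ * parameters (e.g. MKLDNNFcLayer above) forward the update to their
+ * Weight objects via incUpdate().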
+ */ + virtual void updateWeights(const UpdateCallback& callback) {} + + /** + * convert weight from paddle format to mkldnn format + * weight_ will be override + */ + virtual void convertWeightsFromPaddle() {} + + /** + * convert mkldnn weight to paddle format + * weight_ will be override + */ + virtual void convertWeightsToPaddle() {} + + /** + * add this interface as public for unit test + */ + void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); } + + protected: + /** + * Some layers may have different condition to reset the forward. + * The function returns the condition that do not need reset forward. + */ + inline virtual size_t keepCondition() { + // reset when the first input element size changed, not only the batchsize + return inputLayers_[0]->getOutputValue()->getElementCnt(); + } + + /** + * reshape the input image sizes and input batchsize + */ + void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0); + + /** + * reshape output image sizes + */ + void reshapeOutput(size_t height, size_t width); + + /** + * reset MKLDNNMatrix from Matrix and internal primitive desc. + * reset nullptr if matrix or primitive desc is empty + */ + void resetWithMatrix(MKLDNNMatrixPtr& dnn, + const MatrixPtr& mat, + mkldnn::memory::primitive_desc pd); + + /** + * reset input value from input MKLDNNMatrix and internal primitive desc. + * reset both internal and external buffer and create reorder if necessary. + * input channel may be different in concat. + */ + void resetInValue( + MKLDNNMatrixPtr& in, + const std::shared_ptr& intPD = nullptr, + size_t idx = 0, + int inputChannel = 0); + + /** + * reset output value from internal primitive desc. + * reset both internal and external buffer and create reorder if necessary. + */ + void resetOutValue(MKLDNNMatrixPtr& out, + mkldnn::memory::primitive_desc intPD); + + /** + * reset input grad from internal primitive desc. + * reset both internal and external buffer and create reorder if necessary. + */ + void resetInGrad(MKLDNNMatrixPtr& in, + mkldnn::memory::primitive_desc intPD, + size_t idx = 0); + + /** + * reset output grad from internal primitive desc. + * merge grad if necessary. + * reset both internal and external buffer and create reorder if necessary. + * note: about merge grad, when this layer has several outputs, + * it could not be mixed with cpu device, + * since it can not get memory desc from cpu device. + */ + void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD); + + /** + * reset the merge grad primitive if necessary. + * note: do not support the grads mixed with cpu device, + * since it can not get memory desc from cpu device. + */ + void resetMergeGrad(MKLDNNMatrixPtr& out); + + protected: + /** + * Set deviceId of this layer. + */ + void setDevice(int id) { deviceId_ = id; } + + /** + * check the format is nchw or nc, + * which is supported by Paddle default memory layout + */ + bool isPaddleFormat(mkldnn::memory::format fmt) { + if (fmt == mkldnn::memory::format::nchw || + fmt == mkldnn::memory::format::nc) { + return true; + } else { + return false; + } + } + + /** + * If input only has MKLDNN device. + * Otherwise, only support the previous layer using CPU device. + */ + bool inputIsOnlyMKLDNN(int index = 0) { + int prevDevice = getPrev(index)->getDeviceId(); + if (prevDevice == MKLDNN_DEVICE) { + return true; + } else { + CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet"; + return false; + } + } + + /** + * If output only has MKLDNN device. 
+ * Otherwise, other devices should only using CPU device. + */ + bool outputIsOnlyMKLDNN() { + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE) + << "Only support other device is CPU yet"; + } + outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0; + return outputOnlyMKLDNN_; + } + + /** + * print info about sizes + */ + virtual void printSizeInfo() { + VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_ + << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_ + << ", oh: " << oh_ << ", ow: " << ow_; + } + + /** + * print the mkldnn memory format of value + */ + virtual void printValueFormat() { + for (size_t i = 0; i < inVals_.size(); ++i) { + if (!inVals_[i]) { + continue; + } + VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << (extInVals_[i] ? extInVals_[i]->getFormat() + : inVals_[i]->getFormat()) + << " >>> " << inVals_[i]->getFormat() << " >>>"; + } + if (outVal_) { + VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> " + << (extOutVal_ ? extOutVal_->getFormat() + : outVal_->getFormat()); + } + if (wgtVal_) { + VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); + } + if (biasVal_) { + VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat(); + } + } + + /** + * print the mkldnn memory format of grad + */ + virtual void printGradFormat() { + if (outGrad_) { + VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< " + << (extOutGrad_ ? extOutGrad_->getFormat() + : outGrad_->getFormat()); + } + for (size_t i = 0; i < inGrads_.size(); ++i) { + if (!inGrads_[i]) { + continue; + } + VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName() + << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat() + : inGrads_[i]->getFormat()) + << " <<< " << inGrads_[i]->getFormat() << " <<<"; + } + if (wgtGrad_) { + VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); + } + if (biasGrad_) { + VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat(); + } + } + + private: + /** + * clear all grad + */ + void clearGrads() { + if (output_.grad) { + output_.grad->zeroMem(); + } + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].grad) { + outputOtherDevice_[i].grad->zeroMem(); + } + } + } + + /** + * Set deviceId of the params used in this layer. + */ + void setParamsDevice(int id, const ParameterMap& parameterMap) { + for (auto& inputConfig : config_.inputs()) { + if (inputConfig.has_input_parameter_name()) { + ParameterPtr parameter; + std::string name = inputConfig.input_parameter_name(); + CHECK(mapGet(name, parameterMap, ¶meter)) + << "Cannot find input parameter " << name << " for layer " + << getName(); + parameter->setDevice(id); + } + } + if (config_.has_bias_parameter_name()) { + ParameterPtr parameter; + std::string name = config_.bias_parameter_name(); + CHECK(mapGet(name, parameterMap, ¶meter)) + << "Cannot find bias parameter " << name << " for layer " + << getName(); + parameter->setDevice(id); + } + } + + /** + * Set output map of prev layers. 
+ */ + void setOutputMap() { + outputMap_.clear(); + for (size_t i = 0; i < inputLayers_.size(); ++i) { + inputLayers_[i]->setOutput(getName(), &tmpInArg_); + } + } + + /** + * if have cpu device, share value and grad data with output_ + */ + void shareCPUDevice() { + if (outputIsOnlyMKLDNN()) { + return; + } + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].value = output_.value; + outputOtherDevice_[i].grad = output_.grad; + } + } + + /** + * Check the cpu device number of outputOtherDevice_. + * should have only one at most. + */ + void checkCPUOutputsNumber(int max = 1) { + int cnt = 0; + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + if (outputOtherDevice_[i].deviceId == CPU_DEVICE) { + ++cnt; + } + } + CHECK_LE(cnt, max) << "too much CPU devies"; + } + + /** + * copy SeqInfo from input layer to this output and other output devices. + * @note: do not use getInput(0) since it used this deviceId_, + * use "inputLayers_[0]->getOutput()" instead. + */ + void copySeqInfoToOutputs() { + if (inputLayers_.empty() || !needSequenceInfo_) { + return; + } + const Argument& input = inputLayers_[0]->getOutput(); + output_.sequenceStartPositions = input.sequenceStartPositions; + output_.subSequenceStartPositions = input.subSequenceStartPositions; + output_.cpuSequenceDims = input.cpuSequenceDims; + for (size_t i = 0; i < outputOtherDevice_.size(); i++) { + outputOtherDevice_[i].sequenceStartPositions = + output_.sequenceStartPositions; + outputOtherDevice_[i].subSequenceStartPositions = + output_.subSequenceStartPositions; + outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; + } + } + + void prepareValueConversions(std::vector& pipeline) { + // MKLDNNLayer output value should be MKLDNNMatrix + // so external output value is necessary. + // Then external input value is not necessary, + // since input may be mkldnn internal buffer. + CHECK(extOutVal_) << "external output value is necessary"; + output_.value = std::dynamic_pointer_cast(extOutVal_); + CHECK(inVals_[0] && outVal_) << "internal memories are necessary"; + for (size_t i = 0; i < cvtInVals_.size(); ++i) { + if (cvtInVals_[i]) { + pipeline.insert(pipeline.begin(), *cvtInVals_[i]); + } + } + if (cvtOutVal_) { + pipeline.push_back(*cvtOutVal_); + } + } + void prepareGradConversions(std::vector& pipeline) { + // external output grad is not necessary + // since output may be mkldnn internal buffer or merge them directly. + CHECK(outGrad_) << "internal output grad is necessary"; + if (extOutGrad_) { + CHECK_EQ(extOutGrad_->getData(), output_.grad->getData()) + << "the external buffer should share the same data with output_.grad"; + } + if (cvtOutGrad_) { + pipeline.insert(pipeline.begin(), *cvtOutGrad_); + } + for (size_t i = 0; i < cvtInGrads_.size(); ++i) { + if (cvtInGrads_[i]) { + pipeline.push_back(*cvtInGrads_[i]); + } + } + } +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..83d980538d2b1b7351bf858ab391c14f6e7170bd --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp @@ -0,0 +1,195 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNPoolLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer); + +bool MKLDNNPoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + /* the size of inputs for pool-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + const PoolConfig& conf = config_.inputs(0).pool_conf(); + ic_ = conf.channels(); + ih_ = conf.img_size_y(); + iw_ = conf.img_size(); + oc_ = ic_; + oh_ = conf.output_y(); + ow_ = conf.output_x(); + fh_ = conf.size_y(); + fw_ = conf.size_x(); + ph_ = conf.padding_y(); + pw_ = conf.padding(); + sh_ = conf.stride_y(); + sw_ = conf.stride(); + + const std::string& type = conf.pool_type(); + if (type == "max-projection") { + poolAlgo_ = algorithm::pooling_max; + } else if (type == "avg-projection") { + // paddle only use exclude_padding + poolAlgo_ = algorithm::pooling_avg_exclude_padding; + } else { + LOG(FATAL) << "unknow pooling type!"; + } + return true; +} + +void MKLDNNPoolLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + reshapeInput(bs, ih, iw); + // ic_ and oc can not be changed + CHECK_EQ((size_t)ic, + inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) + << "Input channel can not be changed"; + + // cal output sizes + // paddle used false caffeMode for pooling + oh = outputSize(ih, fh_, ph_, sh_, false); + ow = outputSize(iw, fw_, pw_, sw_, false); + reshapeOutput(oh, ow); + + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNPoolLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], out); + + resetFwdPD(fwdPD_, inputs[0], out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); +} + +void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr pd; + + resetBwdBuffers(inputs[0], out); + + resetBwdPD(pd, inputs[0], out); + + resetBwdPipeline(pipeline, pd, inputs[0], out); +} + +void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + resetInValue(in); + + memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; + CHECK(in); + auto outPD = + MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); + resetOutValue(out, outPD); +} + +void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out) { + memory::dims kernels = memory::dims{fh_, fw_}; + memory::dims strides = memory::dims{sh_, sw_}; + memory::dims padL = memory::dims{ph_, pw_}; + memory::dims padR = getPaddingR(); + padding_kind padKind = padding_kind::zero; + prop_kind pk = passType_ == PASS_TEST ? 
prop_kind::forward_scoring + : prop_kind::forward_training; + auto fwdDesc = pool_fwd::desc(pk, + poolAlgo_, + in->getMemoryDesc(), + out->getMemoryDesc(), + strides, + kernels, + padL, + padR, + padKind); + pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_)); + + // prepare workspace if necessary + workspace_ = + (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max) + ? std::make_shared(memory(pd->workspace_primitive_desc())) + : nullptr; +} + +void MKLDNNPoolLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + fwd_ = workspace_ + ? std::make_shared(pool_fwd(*pd, *in, *out, *workspace_)) + : std::make_shared(pool_fwd(*pd, *in, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); +} + +void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + memory::dims kernels = memory::dims{fh_, fw_}; + memory::dims strides = memory::dims{sh_, sw_}; + memory::dims padL = memory::dims{ph_, pw_}; + memory::dims padR = getPaddingR(); + CHECK(out); + auto bwdDesc = pool_bwd::desc(poolAlgo_, + in->getMemoryDesc(), + out->getMemoryDesc(), + strides, + kernels, + padL, + padR, + padding_kind::zero); + pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); +} + +void MKLDNNPoolLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + if (pd == nullptr) { + return; + } + + bwdData_ = + workspace_ + ? std::make_shared(pool_bwd(*pd, *out, *workspace_, *in)) + : std::make_shared(pool_bwd(*pd, *out, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h similarity index 100% rename from paddle/gserver/layers/MKLDNNPoolLayer.h rename to paddle/legacy/gserver/layers/MKLDNNPoolLayer.h diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp similarity index 100% rename from paddle/gserver/layers/MKLPackedRecurrentLayer.cpp rename to paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h similarity index 100% rename from paddle/gserver/layers/MKLPackedRecurrentLayer.h rename to paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h diff --git a/paddle/legacy/gserver/layers/MKLPackedWeight.h b/paddle/legacy/gserver/layers/MKLPackedWeight.h new file mode 100644 index 0000000000000000000000000000000000000000..47f225bd03c3ccb594db952483d3b8397b61e1ec --- /dev/null +++ b/paddle/legacy/gserver/layers/MKLPackedWeight.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/math/MathFunctions.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/Weight.h" + +namespace paddle { + +class MKLPackedWeight { + protected: + /// The pointer of weight + real *weight_; + /// The pointer of cblas packed gemm to weight + real *packedWeight_; + size_t height_; + size_t width_; + bool transW_; + + public: + explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) { + packedWeight_ = nullptr; + weight_ = weight->getData(); + height_ = weight->getHeight(); + width_ = weight->getWidth(); + transW_ = transW; + } + + ~MKLPackedWeight() { free_(); } + + void pack() { pack_(weight_); } + + void gemm_compute(const MatrixPtr src, MatrixPtr dst) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + src->getHeight(), + transW_ ? height_ : width_, + transW_ ? width_ : height_, + src->getData(), + src->getWidth(), + packedWeight_, + width_, + 1.0, + dst->getData(), + dst->getWidth()); + } + + protected: + void pack_(real *src) { + if (!packedWeight_) { + packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); + } + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + transW_ ? CblasTrans : CblasNoTrans, + 1, + transW_ ? height_ : width_, + transW_ ? width_ : height_, + 1.0, + src, + width_, + packedWeight_); + } + + void free_() { + if (packedWeight_) { + cblas_sgemm_free(packedWeight_); + } + } +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/legacy/gserver/layers/MaxIdLayer.cpp similarity index 100% rename from paddle/gserver/layers/MaxIdLayer.cpp rename to paddle/legacy/gserver/layers/MaxIdLayer.cpp diff --git a/paddle/legacy/gserver/layers/MaxLayer.cpp b/paddle/legacy/gserver/layers/MaxLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b51251b663cf818fbe662a96b7c0d55a615640d4 --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxLayer.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MaxLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(max, MaxLayer); + +void MaxLayer::forward(PassType passType) { + SequencePoolLayer::forward(passType); + + IVector::resizeOrCreate( + maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_)); + maxIndex_->zeroMem(); + + MatrixPtr inputValue = getInputValue(0); + MatrixPtr outputValue = getOutputValue(); + + { + REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str()); + outputValue->maxSequenceForward( + *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_); + } + + if (config_.output_max_index()) { + // copy maxIndex_ to output + outputValue->copyFrom(*maxIndex_); + } else { + /* add the bias-vector AFTER max operation */ + if (biases_.get() != NULL) { + outputValue->addBias(*(biases_->getW()), 1); + } + /* activation */ { forwardActivation(); } + } +} + +void MaxLayer::backward(const UpdateCallback& callback) { + CHECK(!config_.output_max_index()) + << "backward is not available when output_max_index is set"; + SequencePoolLayer::backward(callback); + + MatrixPtr inputGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + if (inputGrad) { + REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str()); + inputGrad->maxSequenceBackward( + *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MaxLayer.h b/paddle/legacy/gserver/layers/MaxLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..12d0128e39f2113d0e156813f9b3657cae145eed --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxLayer.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "SequencePoolLayer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * A layer for "internal max" for sequence input. + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = max_{for each instance in this sequence}{input[i]} + * If stride_ > 0: + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and the max pooling operation is + * then applied to each interval independently. + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: output size is the number of input sub-sequences + * output[i] = max_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. + */ + +class MaxLayer : public SequencePoolLayer { + protected: + // maxIndex_[i][j] = k : the value at (i, j) is from input[k]. 
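+ // It is filled by maxSequenceForward in forward() and reused by
+ // maxSequenceBackward in backward(), so the gradient flows only to the
+ // instance that produced each maximum.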
+ IVectorPtr maxIndex_; + + public: + explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + return SequencePoolLayer::init(layerMap, parameterMap); + } + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/legacy/gserver/layers/MaxOutLayer.cpp similarity index 100% rename from paddle/gserver/layers/MaxOutLayer.cpp rename to paddle/legacy/gserver/layers/MaxOutLayer.cpp diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.h b/paddle/legacy/gserver/layers/MaxOutLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..e56f34b8e02bf1dd48c6b5b6ea135cc1009c25b5 --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxOutLayer.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * A layer to do max out on conv layer output. + * Input: output of a conv layer. + * Output: feature map size same as input. Channel is (input channel) / groups. + * So the num of channels should be able to devided by groups. + * + * The config file api is maxout_layer. + */ + +class MaxOutLayer : public Layer { + protected: + size_t groups_; + size_t imgSizeH_, imgSizeW_; + /// outputChannels_ = channels_ / groups_ + size_t channels_, outputChannels_; + /// feature length = imgSizeH_ * imgSizeW_ + size_t featLen_; + IVectorPtr maxoutId_; + + public: + /// return imgSizeH_ * imgSizeW_ * outputChannels_; + size_t getSize(); + + explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {} + virtual ~MaxOutLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a1cc59a719e43453a8919a5827369982ac355480 --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "MaxPoolWithMaskLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + PoolLayer::init(layerMap, parameterMap); + setOutput("mask", &mask_); + return true; +} + +size_t MaxPoolWithMaskLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + size_t layerSize = 0; + + outputY_ = outputSize(imgSizeY_, + sizeY_, + confPaddingY_, + strideY_, + /* caffeMode */ false); + outputX_ = outputSize(imgSize_, + sizeX_, + confPadding_, + stride_, + /* caffeMode */ false); + + layerSize = outputX_ * outputY_ * channels_; + getOutput().setFrameHeight(outputY_); + getOutput().setFrameWidth(outputX_); + + return layerSize; +} + +void MaxPoolWithMaskLayer::forward(PassType passType) { + size_t size = getSize(); + MatrixPtr inputV = inputLayers_[0]->getOutputValue(); + int batchSize = inputV->getHeight(); + resetOutput(batchSize, size); + + MatrixPtr outV = getOutputValue(); + CHECK_EQ(size, outV->getWidth()); + + resetSpecifyOutput(mask_, + batchSize, + size, + /* isValueClean */ false, + /* isGradClean */ true); + + MatrixPtr maskV = mask_.value; + outV->maxPoolForward(*inputV, + imgSizeY_, + imgSize_, + channels_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + confPaddingY_, + confPadding_, + maskV); +} + +void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) { + (void)callback; + if (NULL == getInputGrad(0)) { + return; + } + + MatrixPtr outGrad = getOutputGrad(); + MatrixPtr inputV = inputLayers_[0]->getOutputValue(); + MatrixPtr outV = getOutputValue(); + MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad(); + + inputGrad->maxPoolBackward(*inputV, + imgSizeY_, + imgSize_, + *outGrad, + *outV, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + 1, + 1, + confPaddingY_, + confPadding_); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fcd5388abe3f8229dfa418e6917a8a73c93900a7 --- /dev/null +++ b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "PoolLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * @brief Basic parent layer of different kinds of pooling + */ +class MaxPoolWithMaskLayer : public PoolLayer { + protected: + Argument mask_; + + public: + explicit MaxPoolWithMaskLayer(const LayerConfig& config) + : PoolLayer(config) {} + + size_t getSize(); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MixedLayer.cpp b/paddle/legacy/gserver/layers/MixedLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63e658c09c2b3bae30c8b2890e4d67f72266dd4d --- /dev/null +++ b/paddle/legacy/gserver/layers/MixedLayer.cpp @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MixedLayer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(mixed, MixedLayer); + +bool MixedLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + if (!Layer::init(layerMap, parameterMap)) return false; + + CHECK_EQ(inputLayers_.size(), parameters_.size()); + projections_.resize(inputLayers_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + if (config_.inputs(i).has_proj_conf()) { + projections_[i].reset(Projection::create( + config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); + } else { + CHECK(!parameters_[i]) << "should no parameters for operators"; + } + } + for (auto& operator_conf : config_.operator_confs()) { + for (auto& input_index : operator_conf.input_indices()) { + CHECK(!config_.inputs(input_index).has_proj_conf()); + } + operators_.emplace_back(Operator::create(operator_conf, useGpu_)); + } + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); + } + + return true; +} + +void MixedLayer::prefetch() { + for (size_t i = 0; i != inputLayers_.size(); ++i) { + if (projections_[i]) { + projections_[i]->prefetch(&getInput(i)); + } + } +} + +void MixedLayer::resetState() { + for (auto& proj : projections_) { + if (proj) { + proj->resetState(); + } + } +} + +void MixedLayer::setState(LayerStatePtr state) { + CHECK(projectionStateMatrixSize_.size() == projections_.size()) + << "projection size mis-match"; + + int start = 0; + LayerStatePtr statePtr = std::make_shared(); + for (int i = 0; i < (int)projectionStateMatrixSize_.size(); i++) { + if (projectionStateMatrixSize_[i] > 0) { + statePtr->value.clear(); + for (int j = start; j < start + projectionStateMatrixSize_[i]; j++) { + statePtr->value.push_back(state->value[j]); + } + projections_[i]->setState(statePtr); + start += projectionStateMatrixSize_[i]; + } + } + 
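+ // every matrix in `state` must have been consumed by exactly one projection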
CHECK((int)state->value.size() == start) << "state matrix size mis-match"; +} + +// Return state which consists of all projections states +LayerStatePtr MixedLayer::getState() { + bool init = projectionStateMatrixSize_.size() == 0; + LayerStatePtr res = std::make_shared(); + for (int i = 0; i < (int)projections_.size(); i++) { + LayerStatePtr statePtr = + projections_[i] ? projections_[i]->getState() : nullptr; + int stateSize = statePtr == nullptr ? 0 : statePtr->value.size(); + if (init) { + projectionStateMatrixSize_.push_back(stateSize); + } else { + CHECK(projectionStateMatrixSize_[i] == stateSize) + << "state matrix size mis-match"; + } + if (statePtr != nullptr) { + for (auto& matrixPtr : statePtr->value) { + res->value.push_back(matrixPtr); + } + } + } + return res; +} + +void MixedLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, size); + } + + MatrixPtr outV = getOutputValue(); + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + if (projections_[i]) { + projections_[i]->forward(&getInput(i), &output_, passType); + } + } + + std::vector ins; + for (auto& op : operators_) { + ins.clear(); + for (auto& input_index : op->getConfig().input_indices()) { + ins.push_back(&getInput(input_index)); + } + op->forward(ins, &output_, passType); + } + + /* add the bias-vector */ + if (biases_.get() != NULL) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + outV->addBias(*(biases_->getW()), 1, sharedBias_); + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void MixedLayer::backward(const UpdateCallback& callback) { + /* Do activation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); + + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + for (size_t i = 0; i != inputLayers_.size(); ++i) { + if (projections_[i]) { + projections_[i]->backward(callback); + } + } + + for (auto& op : operators_) { + op->backward(); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/legacy/gserver/layers/MixedLayer.h similarity index 100% rename from paddle/gserver/layers/MixedLayer.h rename to paddle/legacy/gserver/layers/MixedLayer.h diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp similarity index 100% rename from paddle/gserver/layers/MultiBoxLossLayer.cpp rename to paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/legacy/gserver/layers/MultiBoxLossLayer.h similarity index 100% rename from paddle/gserver/layers/MultiBoxLossLayer.h rename to paddle/legacy/gserver/layers/MultiBoxLossLayer.h diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/legacy/gserver/layers/MultinomialSampler.cpp similarity index 100% rename from paddle/gserver/layers/MultinomialSampler.cpp rename to paddle/legacy/gserver/layers/MultinomialSampler.cpp diff --git a/paddle/legacy/gserver/layers/MultinomialSampler.h b/paddle/legacy/gserver/layers/MultinomialSampler.h new file mode 100644 index 
0000000000000000000000000000000000000000..ed445352418f8504e52a6139492e3577a95eecb1 --- /dev/null +++ b/paddle/legacy/gserver/layers/MultinomialSampler.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/legacy/utils/Common.h" + +namespace paddle { + +/** + * @brief Given the probability of N objects, the sampler random select + * one of the object. + * @note: prob does not have to be unnormalized. + * + * The space requirement is O(N)=O(N * sizeof(Interval)). + * The computational complexity of generate one sample is O(1). + */ +class MultinomialSampler { + public: + MultinomialSampler(const real* prob, int size); + + //! protobuf always using double. + static MultinomialSampler* create(const double* prob, int size) { +#ifdef PADDLE_TYPE_DOUBLE + return new MultinomialSampler(prob, size); +#else + std::unique_ptr tmp(new real[size]); + std::copy(prob, prob + size, tmp.get()); + return new MultinomialSampler(tmp.get(), size); +#endif + } + + /** + * @brief Generate a random sample. + * @param g is a random number engine. See . + * @return Random integer. + */ + template + int gen(URNG& g) { + return gen1([&g, this]() { return rand_(g); }); + } + + protected: + /** + * @brief Generation + * @param[in] rand rand is a real random number distribution + * for the range [0, size). + * @return random int number or intervals_[random_int_number].otherId. + */ + template + int gen1(Rand rand) { + double r = rand(); // NOLINT + int i = (int)r; + r -= i; + return r < intervals_[i].thresh ? i : intervals_[i].otherId; + } + + struct Interval { + int otherId; + real thresh; + }; + + /// The probability of each interval will be 1./size + std::vector intervals_; + std::uniform_real_distribution rand_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/MultiplexLayer.cpp b/paddle/legacy/gserver/layers/MultiplexLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9ca2b2417596e7978ea6b84ec76bcb8a305a4f5d --- /dev/null +++ b/paddle/legacy/gserver/layers/MultiplexLayer.cpp @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + *@brief This layer multiplex multiple layers according to the index, + * which is provided by the first input layer. + * - Input[0]: the index of the layer to output of size batchSize. + * - Input[1:N]; the candidate output data. + * For each index i from 0 to batchSize -1, the output is the i-th row of the + * (index[i] + 1)-th layer. + * + * For each i-th row of output: + * + * \f[ + * y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1) + * \f] + * where, y is output. \f$x_{k}\f$ is the k-th input layer and + * \f$k = x_{0}[i] + 1\f$. + */ + +class MultiplexLayer : public Layer { + protected: + /** + * @brief A struct is used to save the copy information, includes input + * layer index and copy size. + */ + struct CopyInfo { + CopyInfo(int inStartIdx, int inLength, int inCopyIdx) + : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {} + + /// The start row of input. + int startIdx; + /// Number of rows. If the layer index in Input[0] is not consecutive, + /// the length is one. Otherwise, the length is > 1 and copy multi rows + /// once. + int length; + /// The copied layer index, which needs to add 1. + int copyIdx; + }; + + /// A list of CopyInfo used to save copy information. + std::vector copySchedule_; + + /// Temporary matrix pointer to point to input data. + MatrixPtr tmpSrc_; + /// Temporary matrix pointer to point to output data. + MatrixPtr tmpDest_; + + public: + explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {} + + ~MultiplexLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + private: + /** + * @brief Calculate copy info for input layers. + */ + void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns); +}; + +REGISTER_LAYER(multiplex, MultiplexLayer); + +void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds, + size_t numIns) { + copySchedule_.clear(); + CopyInfo prevCopyInfo(0, 0, -1); + for (size_t i = 0; i < copyIds->getSize(); i++) { + int copyId = copyIds->getElement(i); + CHECK_GE(copyId, 0); + CHECK_LT(copyId, int(numIns)); + // copy same input layer with prevous and will copy consecutive. 
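+ // For example (illustrative only), copyIds = {2, 2, 2, 0, 1, 1} is grouped
+ // into CopyInfo{startIdx, length, copyIdx} entries {0,3,2}, {3,1,0}, {4,2,1}.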
+ if (copyId == prevCopyInfo.copyIdx) { + ++prevCopyInfo.length; + } else { + if (prevCopyInfo.copyIdx != -1) { + copySchedule_.emplace_back(prevCopyInfo); + } + prevCopyInfo.startIdx = i; + prevCopyInfo.length = 1; + prevCopyInfo.copyIdx = copyId; + } + } + if (prevCopyInfo.copyIdx != -1) { + copySchedule_.emplace_back(prevCopyInfo); + } +} + +bool MultiplexLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_GE(inputLayers_.size(), 2U); + + tmpSrc_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + tmpDest_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + return true; +} + +void MultiplexLayer::forward(PassType passType) { + Layer::forward(passType); + + IVectorPtr copyIds = getInput(0).ids; + MatrixPtr inV1 = getInputValue(1); + CHECK_EQ(copyIds->getSize(), inV1->getHeight()); + for (size_t i = 2; i < inputLayers_.size(); i++) { + CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight()); + CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth()); + } + + calculateCopySchedule(copyIds, inputLayers_.size() - 1); + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(inV1->getHeight(), inV1->getWidth()); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str()); + AsyncGpuBlock block; + for (const CopyInfo& info : copySchedule_) { + outV->subMatrix(info.startIdx, info.length, tmpDest_) + ->copyFrom(*getInputValue(info.copyIdx + 1) + ->subMatrix(info.startIdx, info.length, tmpSrc_)); + } + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void MultiplexLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + MatrixPtr outG = getOutputGrad(); + + { + REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str()); + AsyncGpuBlock block; + for (const CopyInfo& info : copySchedule_) { + if (getInputGrad(info.copyIdx + 1)) { + getInputGrad(info.copyIdx + 1) + ->subMatrix(info.startIdx, info.length, tmpDest_) + ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_)); + } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/NCELayer.cpp b/paddle/legacy/gserver/layers/NCELayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae4d6408168d1597760fe0094bc04f9cef657da4 --- /dev/null +++ b/paddle/legacy/gserver/layers/NCELayer.cpp @@ -0,0 +1,323 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "Layer.h" +#include "MultinomialSampler.h" +#include "paddle/legacy/math/MathFunctions.h" + +namespace paddle { + +/** + * Noise-contrastive estimation. + * Implements the method in the following paper: + * A fast and simple algorithm for training neural probabilistic language + * models. 
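+ * Each true (input, label) pair is contrasted against num_neg_samples
+ * noise labels drawn from neg_sampling_dist (or uniformly over the
+ * num_classes labels when no distribution is given), so the full softmax
+ * normalization over all classes is never computed.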
+ * + * The config file api is nce_layer. + */ +class NCELayer : public Layer { + int numClasses_; + /// number of input layer besides labelLayer and weightLayer + int numInputs_; + LayerPtr labelLayer_; + /// weight layer, can be None + LayerPtr weightLayer_; + WeightList weights_; + std::unique_ptr biases_; + std::unique_ptr sampler_; + + std::uniform_int_distribution rand_; + + struct Sample { + int sampleId; + int labelId; + bool target; + real weight; + }; + std::vector samples_; + /// whether samples_ is prepared + bool prepared_; + Argument sampleOut_; + + IVectorPtr labelIds_; + + public: + explicit NCELayer(const LayerConfig& config) + : Layer(config), + numClasses_(config.num_classes()), + rand_(0, config.num_classes() - 1), + prepared_(false) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize the weightList */ + size_t i; + for (i = 0; i < inputLayers_.size(); i++) { + if (!parameters_[i]) break; + size_t width = inputLayers_[i]->getSize(); + // create a new weight + CHECK_EQ(parameters_[i]->getSize(), width * numClasses_); + Weight* w = new Weight(numClasses_, width, parameters_[i]); + + // append the new weight to the list + weights_.emplace_back(w); + } + + CHECK_EQ(1U, getSize()); + + numInputs_ = i; + CHECK_GE(numInputs_, 1) + << "Must have at least one input besides label and weight"; + CHECK_LT(i, inputLayers_.size()) << "Missing label layer"; + labelLayer_ = inputLayers_[i]; + if (++i < inputLayers_.size()) { + weightLayer_ = inputLayers_[i]; + ++i; + } + CHECK_EQ(i, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_); + biases_.reset(new Weight(1, numClasses_, biasParameter_)); + } + + if (config_.neg_sampling_dist_size()) { + CHECK_EQ(numClasses_, config_.neg_sampling_dist_size()); + sampler_.reset(MultinomialSampler::create( + config_.neg_sampling_dist().data(), numClasses_)); + } + + return true; + } + + void prepareSamples() { + CHECK(!useGpu_) << "GPU is not supported"; + + int batchSize = getInput(*labelLayer_).getBatchSize(); + IVectorPtr label = getInput(*labelLayer_).ids; + + CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast( + getInput(*labelLayer_).value); + + CHECK(label || multiLabel) + << "The label layer must have ids or NonValueSparseMatrix value"; + + auto& randEngine = ThreadLocalRandomEngine::get(); + + samples_.clear(); + samples_.reserve(batchSize * (1 + config_.num_neg_samples())); + + real* weight = + weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr; + + for (int i = 0; i < batchSize; ++i) { + real w = weight ? weight[i] : 1; + if (label) { + int* ids = label->getData(); + samples_.push_back({i, ids[i], true, w}); + } else { + const int* cols = multiLabel->getRowCols(i); + int n = multiLabel->getColNum(i); + for (int j = 0; j < n; ++j) { + samples_.push_back({i, cols[j], true, w}); + } + } + for (int j = 0; j < config_.num_neg_samples(); ++j) { + int id = sampler_ ? 
sampler_->gen(randEngine) : rand_(randEngine); + samples_.push_back({i, id, false, w}); + } + } + prepared_ = true; + } + + void prefetch() override { + prepareSamples(); + IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_); + int* ids = labelIds_->getData(); + for (size_t i = 0; i < samples_.size(); ++i) { + ids[i] = samples_[i].labelId; + } + + for (int i = 0; i < numInputs_; ++i) { + auto sparseParam = + dynamic_cast(weights_[i]->getW().get()); + if (sparseParam) { + sparseParam->addRows(labelIds_); + } + } + } + + void forward(PassType passType) override { + Layer::forward(passType); + + CHECK(!useGpu_) << "GPU is not supported"; + + if (!prepared_) { + if (passType == PASS_GC) { + ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed()); + } + prepareSamples(); + } + prepared_ = false; + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(0)->getHeight(); + int size = getSize(); + resetOutput(batchSize, size); + + Matrix::resizeOrCreate(sampleOut_.value, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); + + forwardBias(); + + for (int l = 0; l < numInputs_; ++l) { + forwardOneInput(l); + } + + auto status = activation_->forward(sampleOut_); + status.check(); + + forwardCost(); + } + + void backward(const UpdateCallback& callback) override { + Matrix::resizeOrCreate(sampleOut_.grad, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); + + backwardCost(); + + auto status = activation_->backward(sampleOut_); + status.check(); + + if (biases_->getWGrad()) { + backwardBias(callback); + } + + for (int l = 0; l < numInputs_; ++l) { + backwardOneInput(l, callback); + } + } + + void forwardBias() { + if (!biases_) { + sampleOut_.value->zeroMem(); + } else { + real* bias = biases_->getW()->getData(); + real* sampleOut = sampleOut_.value->getData(); + for (size_t i = 0; i < samples_.size(); ++i) { + sampleOut[i] = bias[samples_[i].labelId]; + } + } + } + + void backwardBias(const UpdateCallback& callback) { + if (!biases_) return; + real* bias = biases_->getWGrad()->getData(); + real* sampleOut = sampleOut_.grad->getData(); + for (size_t i = 0; i < samples_.size(); ++i) { + bias[samples_[i].labelId] += sampleOut[i]; + } + biases_->incUpdate(callback); + } + + void forwardOneInput(int layerId) { + const MatrixPtr& inputMat = getInputValue(layerId); + const MatrixPtr& weightMat = weights_[layerId]->getW(); + + int dim = inputMat->getWidth(); + real* sampleOut = sampleOut_.value->getData(); + + for (size_t i = 0; i < samples_.size(); ++i) { + sampleOut[i] += dotProduct(dim, + inputMat->getRowBuf(samples_[i].sampleId), + weightMat->getRowBuf(samples_[i].labelId)); + } + } + + void backwardOneInput(int layerId, const UpdateCallback& callback) { + const MatrixPtr& inputMat = getInputValue(layerId); + const MatrixPtr& inputGradMat = getInputGrad(layerId); + const MatrixPtr& weightMat = weights_[layerId]->getW(); + const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad(); + + int dim = inputMat->getWidth(); + real* sampleGrad = sampleOut_.grad->getData(); + + if (weightGradMat) { + for (size_t i = 0; i < samples_.size(); ++i) { + axpy(dim, + sampleGrad[i], + inputMat->getRowBuf(samples_[i].sampleId), + weightGradMat->getRowBuf(samples_[i].labelId)); + } + weights_[layerId]->incUpdate(callback); + } + + if (inputGradMat) { + for (size_t i = 0; i < samples_.size(); ++i) { + axpy(dim, + sampleGrad[i], + weightMat->getRowBuf(samples_[i].labelId), + inputGradMat->getRowBuf(samples_[i].sampleId)); + } + } + } + + void forwardCost() { + 
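+    // Each sampled (input, label) pair is scored as a binary classification
+    // against noise: cost = -log(o / (o + b)) for true pairs and
+    // -log(b / (o + b)) for noise pairs, where o is the model score and
+    // b = num_neg_samples * q(label) is the scaled noise probability
+    // (q is uniform 1/num_classes when no sampling distribution is given).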
real* out = output_.value->getData(); + real* sampleOut = sampleOut_.value->getData(); + real b = 1. / numClasses_ * config_.num_neg_samples(); + for (size_t i = 0; i < samples_.size(); ++i) { + real o = sampleOut[i]; + if (sampler_) { + b = config_.num_neg_samples() * + config_.neg_sampling_dist(samples_[i].labelId); + } + real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b)); + out[samples_[i].sampleId] += samples_[i].weight * cost; + } + } + + void backwardCost() { + real* sampleOut = sampleOut_.value->getData(); + real* sampleGrad = sampleOut_.grad->getData(); + + real b = 1. / numClasses_ * config_.num_neg_samples(); + for (size_t i = 0; i < samples_.size(); ++i) { + real o = sampleOut[i]; + if (sampler_) { + b = config_.num_neg_samples() * + config_.neg_sampling_dist(samples_[i].labelId); + } + real w = samples_[i].weight; + sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b); + } + } +}; + +REGISTER_LAYER(nce, NCELayer); + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/NormLayer.cpp b/paddle/legacy/gserver/layers/NormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..443e26dbc859b1c51c5fb93077178ac45bdeaff3 --- /dev/null +++ b/paddle/legacy/gserver/layers/NormLayer.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NormLayer.h" +#include "NormProjectionLayer.h" +#include "paddle/legacy/utils/Logging.h" +namespace paddle { + +REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create); + +Layer* NormLayer::create(const LayerConfig& config) { + CHECK_EQ(config.inputs_size(), 1); + const std::string& norm = config.inputs(0).norm_conf().norm_type(); + if (norm == "rnorm") { + return new ResponseNormLayer(config); + } else if (norm == "cmrnorm-projection") { + return new CMRProjectionNormLayer(config); + } else if (norm == "cross-channel-norm") { + return new CrossChannelNormLayer(config); + } else { + LOG(FATAL) << "Unknown norm type: " << norm; + return nullptr; + } +} + +bool ResponseNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + NormLayer::init(layerMap, parameterMap); + + /* the size of inputs for norm-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const NormConfig& conf = config_.inputs(0).norm_conf(); + channels_ = conf.channels(); + size_ = conf.size(); + scale_ = conf.scale(); + pow_ = conf.pow(); + outputX_ = conf.output_x(); + imgSize_ = conf.img_size(); + denoms_ = NULL; + + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + imgSizeY_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size();
+  return true;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.h b/paddle/legacy/gserver/layers/NormLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ac00034d086a5952b30576268c72af326e3ebf9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/NormLayer.h
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "NormLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Basic parent layer of normalization
+ *
+ * @note Normalize the input in local region
+ */
+class NormLayer : public Layer {
+ public:
+  explicit NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    Layer::init(layerMap, parameterMap);
+    return true;
+  }
+
+  /**
+   * @brief create norm layer by norm_type
+   */
+  static Layer* create(const LayerConfig& config);
+};
+
+/**
+ * @brief response normalization within feature maps
+ * namely normalize in independent channel
+ * When refactoring the code, we deleted the original implementation;
+ * it still needs to be reimplemented in the future.
+ */
+class ResponseNormLayer : public NormLayer {
+ protected:
+  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
+  real scale_, pow_;
+  MatrixPtr denoms_;
+
+ public:
+  explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
+  void backward(const UpdateCallback& callback = nullptr) override {
+    LOG(FATAL) << "Not implemented";
+  }
+};
+
+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimension equals the number of channels.
+ * - Input: One and only one input layer is accepted.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg.
SSD: Single Shot MultiBox Detector + */ +class CrossChannelNormLayer : public NormLayer { + public: + explicit CrossChannelNormLayer(const LayerConfig& config) + : NormLayer(config) {} + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType); + void backward(const UpdateCallback& callback); + MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + + protected: + size_t channels_; + std::unique_ptr scale_; + MatrixPtr scaleDiff_; + MatrixPtr normBuffer_; + MatrixPtr dataBuffer_; + MatrixPtr channelBuffer_; + MatrixPtr spatialBuffer_; + MatrixPtr sampleBuffer_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.cpp b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72affaa1ce618a841f8040c84467a46b77531958 --- /dev/null +++ b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NormProjectionLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { +size_t CMRProjectionNormLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + size_t layerSize = 0; + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = imgSizeY_; + } + if (imgSizeW_ == 0) { + imgSizeW_ = imgSize_; + } + outputH_ = imgSizeH_; + outputW_ = imgSizeW_; + layerSize = outputH_ * outputW_ * channels_; + + getOutput().setFrameHeight(outputH_); + getOutput().setFrameWidth(outputW_); + return layerSize; +} + +bool CMRProjectionNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + ResponseNormLayer::init(layerMap, parameterMap); + + /* the size of inputs for norm-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + createFunction( + forward_, + "CrossMapNormal", + FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_)); + createFunction( + backward_, + "CrossMapNormalGrad", + FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_)); + + return true; +} + +void CMRProjectionNormLayer::forward(PassType passType) { + Layer::forward(passType); + /* malloc memory for the output_ if necessary */ + /* note: one sample correspond to one row */ + MatrixPtr input = inputLayers_[0]->getOutputValue(); + size_t batchSize = input->getHeight(); + int size = getSize(); + resetOutput(batchSize, size); + + Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_); + + shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + + // prepare forward arguments + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), shape_); + outputs.addArg(*getOutputValue(), 
shape_, ASSIGN_TO); + outputs.addArg(*denoms_, shape_, ASSIGN_TO); + + forward_[0]->calc(inputs, outputs); +} + +void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { + (void)callback; + + if (NULL == getInputGrad(0)) { + return; + } + + // prepare backward arguments + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), shape_); + inputs.addArg(*getOutputValue(), shape_); + inputs.addArg(*getOutputGrad(), shape_); + inputs.addArg(*denoms_, shape_); + outputs.addArg(*getInputGrad(0), shape_, ADD_TO); + + backward_[0]->calc(inputs, outputs); +} +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.h b/paddle/legacy/gserver/layers/NormProjectionLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..492d1fcb72343a54577a459aaa5de53596f43f42 --- /dev/null +++ b/paddle/legacy/gserver/layers/NormProjectionLayer.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "NormLayer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief response normalization across feature maps + * namely normalize in number of size_ channels + */ +class CMRProjectionNormLayer : public ResponseNormLayer { + size_t imgSizeH_, imgSizeW_; + size_t outputH_, outputW_; + + public: + explicit CMRProjectionNormLayer(const LayerConfig& config) + : ResponseNormLayer(config) {} + + ~CMRProjectionNormLayer() {} + + size_t getSize(); + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + protected: + TensorShape shape_; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/Operator.cpp b/paddle/legacy/gserver/layers/Operator.cpp similarity index 100% rename from paddle/gserver/layers/Operator.cpp rename to paddle/legacy/gserver/layers/Operator.cpp diff --git a/paddle/legacy/gserver/layers/Operator.h b/paddle/legacy/gserver/layers/Operator.h new file mode 100644 index 0000000000000000000000000000000000000000..20a248985eb6b3aba016b28bca4c0eea44baa868 --- /dev/null +++ b/paddle/legacy/gserver/layers/Operator.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/parameter/Parameter.h"
+
+#include "Layer.h"
+#include "paddle/legacy/parameter/Argument.h"
+
+namespace paddle {
+
+// Macro for registering an operator type
+// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator);
+#define REGISTER_OPERATOR(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {               \
+    Operator::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+
+/**
+ * An Operator is like a Projection, but takes more than one Argument as input.
+ * @note: Operator can't have parameters.
+ */
+class Operator {
+ public:
+  static Operator* create(const OperatorConfig& config, bool useGpu);
+
+  Operator(const OperatorConfig& config, bool useGpu)
+      : config_(config), useGpu_(useGpu) {}
+
+  virtual ~Operator() {}
+
+  const OperatorConfig& getConfig() const { return config_; }
+
+  static ClassRegistrar<Operator, OperatorConfig, bool> registrar_;
+
+  /**
+   * Forward propagation. If backward() will be called, in and out must be kept
+   * valid until then.
+   * @param ins inputs of operator
+   * @param out output of operator
+   * @param passType PASS_TRAIN or PASS_TEST
+   */
+  void forward(std::vector<const Argument*> ins,
+               Argument* out,
+               PassType passType) {
+    ins_ = ins;
+    out_ = out;
+    passType_ = passType;
+    forward();
+  }
+
+  virtual void prefetch(const Argument* in) {}
+  virtual void forward() = 0;
+  virtual void backward() = 0;
+
+  /**
+   * See comment in Layer.h for the function with the same name.
+   */
+  virtual void resetState() {}
+
+  /**
+   * Set layer state.
+   */
+  virtual void setState(LayerStatePtr state) {}
+
+  /**
+   * Get layer state.
+   */
+  virtual LayerStatePtr getState() { return nullptr; }
+
+ protected:
+  /// Config of operator
+  OperatorConfig config_;
+  bool useGpu_;
+
+  /// Store `ins` passed to forward()
+  std::vector<const Argument*> ins_;
+  /// Store `out` passed to forward()
+  Argument* out_;
+  /// Store `passType` passed to forward()
+  PassType passType_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/OuterProdLayer.cpp b/paddle/legacy/gserver/layers/OuterProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0928be9d4d52532503987af8e29fdf5c7fb16a5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/OuterProdLayer.cpp
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for computing the outer product of two vectors + * @note used in NEURAL TURING MACHINE + * Input1: vector (batchSize * dim1) + * Input2: vector (batchSize * dim2) + * Output: a matrix: (batchSize * (dim1*dim2)) + */ + +class OuterProdLayer : public Layer { + protected: + MatrixPtr tmpMtx0; + MatrixPtr tmpRow0; + MatrixPtr tmpRow1; + + public: + explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {} + + ~OuterProdLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(out_prod, OuterProdLayer); + +bool OuterProdLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + size_t dim0 = inputLayers_[0]->getSize(); + size_t dim1 = inputLayers_[1]->getSize(); + + CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch"; + + tmpRow0 = Matrix::create( + nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_); + tmpRow1 = Matrix::create( + nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ dim0, + dim1, + /* trans= */ false, + useGpu_); + return true; +} + +void OuterProdLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV0->getHeight(); + size_t dim0 = inV0->getWidth(); + size_t dim1 = inV1->getWidth(); + + CHECK_EQ(dim0 * dim1, getSize()); + CHECK_EQ(inV1->getHeight(), batchSize); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, dim0 * dim1); + } + + MatrixPtr outV = getOutputValue(); + + { + REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str()); + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(outV->getData() + i * dim0 * dim1); + tmpRow0->setData(inV0->getData() + i * dim0); + tmpRow1->setData(inV1->getData() + i * dim1); + + tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1); + } + } +} + +void OuterProdLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr outG = getOutputGrad(); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + + size_t batchSize = inV0->getHeight(); + size_t dim0 = inV0->getWidth(); + size_t dim1 = inV1->getWidth(); + + { + REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str()); + + if (inG0) { + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(outG->getData() + i * dim0 * dim1); + tmpRow0->setData(inG0->getData() + i * dim0); + tmpRow1->setData(inV1->getData() + i * dim1); + + tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1); + } + } + + if (inG1) { + for (size_t i = 0; i < batchSize; i++) { + tmpMtx0->setData(outG->getData() + i * dim0 * dim1); + tmpRow0->setData(inV0->getData() + i * dim0); + tmpRow1->setData(inG1->getData() + i * dim1); + + tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1); + } + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PadLayer.cpp b/paddle/legacy/gserver/layers/PadLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7b92b3de2d839f240ec8cbe07ed7685295568809 --- /dev/null 
+++ b/paddle/legacy/gserver/layers/PadLayer.cpp @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PadLayer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(pad, PadLayer); + +bool PadLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + auto& pad_conf = config_.inputs(0).pad_conf(); + auto& img_conf = pad_conf.image_conf(); + CHECK_EQ(config_.inputs_size(), 1); + inDims_ = TensorShape( + {0, + img_conf.channels(), + img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(), + img_conf.img_size()}); + + CHECK_EQ(2, pad_conf.pad_c_size()); + CHECK_EQ(2, pad_conf.pad_h_size()); + CHECK_EQ(2, pad_conf.pad_w_size()); + padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)}; + padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)}; + padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)}; + + outDims_ = TensorShape(4); + setOutDims(0); + + createFunction(forward_, + "Pad", + FuncConfig() + .set("channel", padc_) + .set("height", padh_) + .set("width", padw_)); + createFunction(backward_, + "PadGrad", + FuncConfig() + .set("channel", padc_) + .set("height", padh_) + .set("width", padw_)); + + return true; +} + +void PadLayer::setOutDims(const size_t batchSize) { + outDims_.reshape({batchSize, + inDims_[1] + padc_[0] + padc_[1], + inDims_[2] + padh_[0] + padh_[1], + inDims_[3] + padw_[0] + padw_[1]}); +} + +void PadLayer::setTensorDim(const size_t batchSize) { + CHECK_EQ(static_cast(inputLayers_.size()), 1); + inDims_.setDim(0, batchSize); + int h = inputLayers_[0]->getOutput().getFrameHeight(); + if (h != 0) inDims_.setDim(2, h); + int w = inputLayers_[0]->getOutput().getFrameWidth(); + if (w != 0) inDims_.setDim(3, w); + setOutDims(batchSize); +} + +void PadLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr input = inputLayers_[0]->getOutputValue(); + size_t batchSize = input->getHeight(); + setTensorDim(batchSize); + int size = outDims_[1] * outDims_[2] * outDims_[3]; + resetOutput(batchSize, size); + MatrixPtr outV = getOutputValue(); + REGISTER_TIMER_INFO("PadForward", getName().c_str()); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inDims_); + outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO); + forward_[0]->calc(inputs, outputs); +} + +void PadLayer::backward(const UpdateCallback& callback) { + (void)callback; + REGISTER_TIMER_INFO("PadBackward", getName().c_str()); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outDims_); + outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); + backward_[0]->calc(inputs, outputs); +} +} // namespace paddle diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/legacy/gserver/layers/PadLayer.h similarity index 100% rename from paddle/gserver/layers/PadLayer.h rename to paddle/legacy/gserver/layers/PadLayer.h diff --git 
a/paddle/legacy/gserver/layers/ParameterReluLayer.cpp b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..23715d1975d7a3606a9418d54bc69ae6f036a93a --- /dev/null +++ b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ParameterReluLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(prelu, ParameterReluLayer); + +bool ParameterReluLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_EQ(inputLayers_.size(), 1UL); + CHECK_EQ(inputLayers_.size(), parameters_.size()); + partialSum_ = config_.partial_sum(); + CHECK_GT(partialSum_, 0UL) << "partial_sum must be larger than zero."; + CHECK(!(inputLayers_[0]->getSize() % partialSum_)) + << "Incorrect value for partialSum: " << partialSum_ + << " must divide input size: " << inputLayers_[0]->getSize(); + CHECK_EQ(getSize() / partialSum_, parameters_[0]->getSize()); + weight_ = std::unique_ptr(new Weight( + 1UL, inputLayers_[0]->getSize() / partialSum_, parameters_[0])); + return true; +} + +void ParameterReluLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInput(0).getBatchSize(); + int size = getSize(); + reserveOutput(batchSize, size); + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + outV->paramReluForward(*(getInput(0).value), *(weight_->getW())); + } +} + +void ParameterReluLayer::backward(const UpdateCallback& callback) { + if (weight_->getWGrad()) { + weight_->getWGrad()->paramReluBackwardW(*getOutputGrad(), + *(getInputValue(0))); + } + + MatrixPtr preGrad = getInputGrad(0); + preGrad->paramReluBackwardDiff( + *getOutputGrad(), *(getInputValue(0)), *(weight_->getW())); + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weight_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.h b/paddle/legacy/gserver/layers/ParameterReluLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..3aac4b42f60531b5856ddef208b8356898e42859 --- /dev/null +++ b/paddle/legacy/gserver/layers/ParameterReluLayer.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * @brief ParameterReluLayer active inputs with learnable parameter weight_. + * forward: + * \f[ + * y = x > 0 ? x : w .* x + * \f] + * backward: + * \f[ + * dx = x > 0 ? dy : w .* dy \\ + * dw = x > 0 ? 0 : dy.*x + * \f] + * Here, x is the input, w is the weight, y is the output. + * dx, dw, dy is the gradient. + */ + +class ParameterReluLayer : public Layer { + protected: + std::unique_ptr weight_; + + /** + * @brief partialSum_ makes a group of inputs share same weights, + * - partialSum_ = 1: + * element wise activation: each element has a weight_, + * - partialSum_ = number of elements in one channel, + * channels wise parameter activation, elements in a channel + * share same weight_, + * - partialSum_ = number of outputs + * all elements share same weight_, + */ + size_t partialSum_; + + public: + explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {} + + ~ParameterReluLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.cpp b/paddle/legacy/gserver/layers/Pool3DLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ae3f55c27f2d7bd3ab47d834d5b6f274ff558310 --- /dev/null +++ b/paddle/legacy/gserver/layers/Pool3DLayer.cpp @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Pool3DLayer.h" +#include "PoolProjectionLayer.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +REGISTER_LAYER(pool3d, Pool3DLayer); + +bool Pool3DLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + /* the size of inputs for pool-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const PoolConfig& conf = config_.inputs(0).pool_conf(); + poolType_ = conf.pool_type(); + channels_ = conf.channels(); + + sizeX_ = conf.size_x(); + sizeY_ = conf.size_y(); + sizeZ_ = conf.size_z(); + + strideW_ = conf.stride(); + strideH_ = conf.stride_y(); + strideD_ = conf.stride_z(); + + imgSizeW_ = conf.img_size(); + imgSizeH_ = conf.img_size_y(); + imgSizeD_ = conf.img_size_z(); + + paddingW_ = conf.padding(); + paddingH_ = conf.padding_y(); + paddingD_ = conf.padding_z(); + + outputW_ = conf.output_x(); + outputH_ = conf.output_y(); + outputD_ = conf.output_z(); + + return true; +} + +size_t Pool3DLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + + size_t layerSize = 0; + outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false); + outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false); + outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false); + + layerSize = outputD_ * outputH_ * outputW_ * channels_; + getOutput().setFrameHeight(outputH_); + getOutput().setFrameWidth(outputW_); + getOutput().setFrameDepth(outputD_); + return layerSize; +} + +void Pool3DLayer::forward(PassType passType) { + Layer::forward(passType); + const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); + size_t batchSize = inMat->getHeight(); + size_t outWidth = getSize(); + resetOutput(batchSize, outWidth); + Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_); + const MatrixPtr outMat = getOutputValue(); + + if (poolType_ == "avg") { + outMat->avgPool3DForward(*inMat, + channels_, + imgSizeD_, + imgSizeH_, + imgSizeW_, + outputD_, + outputH_, + outputW_, + sizeZ_, + sizeY_, + sizeX_, + strideD_, + strideH_, + strideW_, + paddingD_, + paddingH_, + paddingW_); + } else if (poolType_ == "max") { + outMat->maxPool3DForward(*inMat, + *maxPoolIdx_, + channels_, + imgSizeD_, + imgSizeH_, + imgSizeW_, + outputD_, + outputH_, + outputW_, + sizeZ_, + sizeY_, + sizeX_, + strideD_, + strideH_, + strideW_, + paddingD_, + paddingH_, + paddingW_); + } else { + LOG(FATAL) << "Unknown pool type: " << poolType_; + } + forwardActivation(); +} + +void Pool3DLayer::backward(const UpdateCallback& callback) { + backwardActivation(); + + (void)callback; + if (NULL == getInputGrad(0)) return; + MatrixPtr inMat = inputLayers_[0]->getOutputValue(); + MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad(); + MatrixPtr outMat = getOutputValue(); + MatrixPtr outGradMat = getOutputGrad(); + + if (poolType_ == "avg") { + inGradMat->avgPool3DBackward(*outGradMat, + imgSizeD_, + imgSizeH_, + imgSizeW_, + outputD_, + outputH_, + outputW_, + sizeZ_, + sizeY_, + sizeZ_, + strideD_, + strideH_, + strideW_, + paddingD_, + paddingH_, + paddingW_, + 1.0, + 1.0); + } else if (poolType_ == "max") { + inGradMat->maxPool3DBackward(*outGradMat, + *maxPoolIdx_, + imgSizeD_, + imgSizeH_, + imgSizeW_, + outputD_, + outputH_, + outputW_, + sizeZ_, + sizeY_, + sizeZ_, + strideD_, + strideH_, + strideW_, + paddingD_, + paddingH_, + paddingW_, + 1.0, + 1.0); + } else { + LOG(FATAL) << "Unknown pool type: " << poolType_; + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.h 
b/paddle/legacy/gserver/layers/Pool3DLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..6851c44ab22a39bebe3592b8e5f6384a393947f2 --- /dev/null +++ b/paddle/legacy/gserver/layers/Pool3DLayer.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Basic parent layer of pooling + * Pools the input within regions + */ +class Pool3DLayer : public Layer { + public: + explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {} + ~Pool3DLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + size_t getSize(); + + protected: + int channels_; + int sizeX_, sizeY_, sizeZ_; + int strideW_, strideH_, strideD_; + int paddingW_, paddingH_, paddingD_; + int imgSizeW_, imgSizeH_, imgSizeD_; + int outputW_, outputH_, outputD_; + std::string poolType_; + MatrixPtr maxPoolIdx_; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PoolLayer.cpp b/paddle/legacy/gserver/layers/PoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df172d95757e0842328caa508042f3613bc72232 --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolLayer.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PoolLayer.h" +#include "MaxPoolWithMaskLayer.h" +#include "PoolProjectionLayer.h" +#include "paddle/legacy/utils/Logging.h" +#ifdef PADDLE_WITH_CUDA +#include "CudnnPoolLayer.h" +#endif +namespace paddle { + +REGISTER_LAYER_CREATE_FUNC(pool, &PoolLayer::create); + +bool PoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* the size of inputs for pool-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const PoolConfig& conf = config_.inputs(0).pool_conf(); + poolType_ = conf.pool_type(); + channels_ = conf.channels(); + sizeX_ = conf.size_x(); + stride_ = conf.stride(); + outputX_ = conf.output_x(); + imgSize_ = conf.img_size(); + confPadding_ = conf.padding(); + + sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x(); + imgSizeY_ = conf.has_img_size_y() ? 
conf.img_size_y() : conf.img_size(); + strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); + confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + + excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true; + return true; +} + +Layer* PoolLayer::create(const LayerConfig& config) { + CHECK_EQ(config.inputs_size(), 1); + const std::string& pool = config.inputs(0).pool_conf().pool_type(); + if (pool == "max-projection" || pool == "avg-projection") { + return new PoolProjectionLayer(config); +#ifdef PADDLE_WITH_CUDA + } else if (CudnnPoolLayer::typeCheck(pool)) { + return new CudnnPoolLayer(config); +#endif + } else if (pool == "max-pool-with-mask") { + return new MaxPoolWithMaskLayer(config); + } else { + LOG(FATAL) << "Unknown pool type: " << pool; + return nullptr; + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PoolLayer.h b/paddle/legacy/gserver/layers/PoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..0808dfae8497008f974730b65977c85e914a7a27 --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolLayer.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { + +/** + * @brief Basic parent layer of pooling + * Pools the input within regions + */ +class PoolLayer : public Layer { + protected: + size_t channels_, sizeX_, stride_, outputX_, imgSize_; + int confPadding_; + + size_t sizeY_; + size_t imgSizeY_; + size_t strideY_; + size_t outputY_; + int confPaddingY_; + + std::string poolType_; + + bool excludeMode_; + + public: + explicit PoolLayer(const LayerConfig& config) : Layer(config) {} + + /** + * @brief create pooling layer by pool_type + */ + static Layer* create(const LayerConfig& config); + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/legacy/gserver/layers/PoolProjection.cpp similarity index 100% rename from paddle/gserver/layers/PoolProjection.cpp rename to paddle/legacy/gserver/layers/PoolProjection.cpp diff --git a/paddle/legacy/gserver/layers/PoolProjection.h b/paddle/legacy/gserver/layers/PoolProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..d01b6a13f0a5fd2283f1f216ef419b9ccc7308f9 --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolProjection.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/legacy/math/MathUtils.h" + +namespace paddle { + +class PoolProjection : public Projection { + protected: + size_t imgSizeY_, imgSize_; + size_t outputY_, outputX_; + size_t strideY_, stride_; + size_t sizeY_, sizeX_; + int confPaddingY_, confPadding_; + size_t channels_; + std::string poolType_; + bool excludeMode_; + + public: + PoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + static PoolProjection* create(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + const std::string& getPoolType() const { return poolType_; } + + size_t getSize(); +}; + +class MaxPoolProjection : public PoolProjection { + public: + MaxPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; + +class AvgPoolProjection : public PoolProjection { + public: + AvgPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e44b1d7ba1494e43db81f998c2818bbbf7779d6f --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "PoolProjectionLayer.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +size_t PoolProjectionLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + size_t layerSize = 0; + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = imgSizeY_; + } + if (imgSizeW_ == 0) { + imgSizeW_ = imgSize_; + } + + outputH_ = outputSize(imgSizeH_, + sizeY_, + confPaddingY_, + strideY_, + /* caffeMode */ false); + outputW_ = outputSize(imgSizeW_, + sizeX_, + confPadding_, + stride_, + /* caffeMode */ false); + + layerSize = outputH_ * outputW_ * channels_; + + return layerSize; +} + +void PoolProjectionLayer::forward(PassType passType) { + Layer::forward(passType); + const Argument& in = getInput(0); + int batchSize = in.value->getHeight(); + int size = getSize(); + resetOutput(batchSize, size); + poolProjection_->forward(&in, &output_, passType); +} + +void PoolProjectionLayer::backward(const UpdateCallback& callback) { + (void)callback; + if (NULL == getInputGrad(0)) { + return; + } + poolProjection_->backward(callback); +} +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.h b/paddle/legacy/gserver/layers/PoolProjectionLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fcd35bbba4dff612fba827cdf545de71127c560e --- /dev/null +++ b/paddle/legacy/gserver/layers/PoolProjectionLayer.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "PoolLayer.h" +#include "PoolProjection.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * @brief Basic parent layer of different kinds of pooling + */ +class PoolProjectionLayer : public PoolLayer { + protected: + size_t imgSizeH_, imgSizeW_; + size_t outputH_, outputW_; + std::unique_ptr poolProjection_; + ProjectionConfig projectionConfig_; + + public: + explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) { + PoolConfig* conf = projectionConfig_.mutable_pool_conf(); + *conf = config_.inputs(0).pool_conf(); + poolProjection_.reset( + PoolProjection::create(projectionConfig_, nullptr, useGpu_)); + } + + size_t getSize(); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/PowerLayer.cpp b/paddle/legacy/gserver/layers/PowerLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e94c64db6098dbc1ed13bdcbd573f95024713bc --- /dev/null +++ b/paddle/legacy/gserver/layers/PowerLayer.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * This layer applys a power function to a vector element-wise, + * which is used in NEURAL TURING MACHINE. + * \f[ + * y = x^w + * \f] + * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight, + * and output \f$y\f$ is a vector. + * + * The config file api is power_layer. + */ + +class PowerLayer : public Layer { + protected: + MatrixPtr tmpMtx; + + public: + explicit PowerLayer(const LayerConfig& config) : Layer(config) {} + + ~PowerLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(power, PowerLayer); + +bool PowerLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + return true; +} + +void PowerLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + CHECK_EQ(getSize(), dataDim); + CHECK_EQ(1U, inV0->getWidth()); + CHECK_EQ(batchSize, inV0->getHeight()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + + { + REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str()); + outV->rowPow(0, *inV1, *inV0); + } +} + +void PowerLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV0 = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + { + REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str()); + Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_); + + if (inG0) { + tmpMtx->log2(*inV1); + tmpMtx->dotMul(*tmpMtx, *outV); + + // inG0 += outG .* (log(inV1) * outV) + inG0->rowDotMul(0, *outG, *tmpMtx); + } + + if (inG1) { + // tmp = (outV / inV1) * inV0 + tmpMtx->dotDiv(*outV, *inV1); + tmpMtx->rowScale(0, *tmpMtx, *inV0); + + inG1->addDotMul(*outG, *tmpMtx, 1, 1); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/legacy/gserver/layers/PrintLayer.cpp similarity index 100% rename from paddle/gserver/layers/PrintLayer.cpp rename to paddle/legacy/gserver/layers/PrintLayer.cpp diff --git a/paddle/legacy/gserver/layers/PriorBox.cpp b/paddle/legacy/gserver/layers/PriorBox.cpp new file mode 100644 index 0000000000000000000000000000000000000000..83aab6e36662855a5867463757bc5a92e6e83e07 --- /dev/null +++ b/paddle/legacy/gserver/layers/PriorBox.cpp @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief A layer for generating priorbox locations and variances.
+ * - Input: Two and only two input layers are accepted. The input layers must
+ *   be a data output layer and a convolution output layer.
+ * - Output: The priorbox locations and variances of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class PriorBoxLayer : public Layer {
+ public:  // NOLINT
+  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
+
+ protected:  // NOLINT
+  int numPriors_;
+  std::vector<real> minSize_;
+  std::vector<real> maxSize_;
+  std::vector<real> aspectRatio_;
+  std::vector<real> variance_;
+  MatrixPtr buffer_;
+};
+
+REGISTER_LAYER(priorbox, PriorBoxLayer);
+
+bool PriorBoxLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto pbConf = config_.inputs(0).priorbox_conf();
+  std::vector<real> tmp;
+  aspectRatio_.push_back(1.);
+  std::copy(pbConf.min_size().begin(),
+            pbConf.min_size().end(),
+            std::back_inserter(minSize_));
+  std::copy(pbConf.max_size().begin(),
+            pbConf.max_size().end(),
+            std::back_inserter(maxSize_));
+  std::copy(pbConf.variance().begin(),
+            pbConf.variance().end(),
+            std::back_inserter(variance_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(tmp));
+
+  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
+
+  // flip aspect ratios
+  for (unsigned index = 0; index < tmp.size(); index++) {
+    real ar = tmp[index];
+    if (fabs(ar - 1.) < 1e-6) continue;
+    aspectRatio_.push_back(ar);
+    aspectRatio_.push_back(1.
/ ar); + } + + numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size(); + + return true; +} + +void PriorBoxLayer::forward(PassType passType) { + Layer::forward(passType); + auto input = getInput(0); + int layerWidth = input.getFrameWidth(); + int layerHeight = input.getFrameHeight(); + + auto image = getInput(1); + int imageWidth = image.getFrameWidth(); + int imageHeight = image.getFrameHeight(); + + real stepW = static_cast(imageWidth) / layerWidth; + real stepH = static_cast(imageHeight) / layerHeight; + int dim = layerHeight * layerWidth * numPriors_ * 4; + reserveOutput(1, dim * 2); + // use a cpu buffer to compute + Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false); + auto* tmpPtr = buffer_->getData(); + + int idx = 0; + for (int h = 0; h < layerHeight; ++h) { + for (int w = 0; w < layerWidth; ++w) { + real centerX = (w + 0.5) * stepW; + real centerY = (h + 0.5) * stepH; + for (size_t s = 0; s < minSize_.size(); s++) { + real minSize = minSize_[s]; + real boxWidth = minSize; + real boxHeight = minSize; + + // first prior: aspect_ratio == 1.0, compatible to old logic + tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; + tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; + // set the variance. + for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; + + if (maxSize_.size() > 0) { + // square prior with size sqrt(minSize * maxSize) + real maxSize = maxSize_[s]; + boxWidth = boxHeight = sqrt(minSize * maxSize); + tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; + tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; + // set the variance. + for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; + } + + // priors with different aspect ratios + for (size_t r = 0; r < aspectRatio_.size(); r++) { + real ar = aspectRatio_[r]; + if (fabs(ar - 1.0) < 1e-6) { + continue; + } + boxWidth = minSize * sqrt(ar); + boxHeight = minSize / sqrt(ar); + tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; + tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth; + tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight; + // set the variance. + for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t]; + } + } + } + } + + // clip the prior's coordidate such that it is within [0, 1] + for (int d = 0; d < dim * 2; ++d) + if ((d % 8) < 4) + tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.); + MatrixPtr outV = getOutputValue(); + outV->copyFrom(buffer_->data_, dim * 2); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/Projection.cpp b/paddle/legacy/gserver/layers/Projection.cpp similarity index 100% rename from paddle/gserver/layers/Projection.cpp rename to paddle/legacy/gserver/layers/Projection.cpp diff --git a/paddle/legacy/gserver/layers/Projection.h b/paddle/legacy/gserver/layers/Projection.h new file mode 100644 index 0000000000000000000000000000000000000000..974f5a2cacd10a965adcb4accf6ca00c26044b64 --- /dev/null +++ b/paddle/legacy/gserver/layers/Projection.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "ModelConfig.pb.h" +#include "paddle/legacy/parameter/Parameter.h" + +namespace paddle { + +// Macro for registering a projection type +// Example: REGISTER_LAYER(fc, FullMatrixProjection); +#define REGISTER_PROJECTION(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + Projection::registrar_.registerClass<__class_name>(#__type_name); \ + }) + +#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction) \ + static InitFunction __reg_type_##__type_name([]() { \ + Projection::registrar_.registerClass(#__type_name, createFunction); \ + }) + +/** + * A projection takes one Argument as input, calculate the result and add it + * to output Argument. + */ +class Projection { + public: + static Projection* create(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + Projection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : config_(config), parameter_(parameter), useGpu_(useGpu) {} + + virtual ~Projection() {} + + const std::string& getName() const { return config_.name(); } + + /// Register a projection + static ClassRegistrar + registrar_; + + /** + * Forward propagation. If backward() will be called, in and out must be kept + * valid until then. + * @param in input of projection + * @param out output of projection + * @param passType PASS_TRAIN of PASS_TEST + */ + void forward(const Argument* in, const Argument* out, PassType passType) { + in_ = in; + out_ = out; + passType_ = passType; + forward(); + } + + virtual void prefetch(const Argument* in) {} + virtual void forward() = 0; + virtual void backward(const UpdateCallback& callback) = 0; + + /** + * See comment in Layer.h for the function with the same name. + */ + virtual void resetState() {} + + /** + * Set layer state. + */ + virtual void setState(LayerStatePtr state) {} + + /** + * Get layer state. A copy of internal state is returned. + */ + virtual LayerStatePtr getState() { return nullptr; } + + /** + * init forward_ and backward_ functions + */ + virtual bool init() { return true; } + + /** + * Get output size of projection. + */ + size_t getOutputSize() const { return config_.output_size(); } + + protected: + /** + * Create layer function. Function is called in forward or backward. 
+ * \param function, Layer::forward_ or Layer::backward_ + * \param name, function name + * \param config, initialization configuration for the function + */ + void createFunction(std::vector>& function, + const std::string& name, + const FuncConfig& config) { + if (useGpu_) { + function.emplace_back( + FunctionBase::funcRegistrar_.createByType(name + "-GPU")); + } else { + function.emplace_back( + FunctionBase::funcRegistrar_.createByType(name + "-CPU")); + } + auto& func = function.back(); + func->init(config); + } + + protected: + /// Config of projection + ProjectionConfig config_; + /// Parameter of projection + ParameterPtr parameter_; + bool useGpu_; + + /// Store `in` passed to forward() + const Argument* in_; + /// Store `out` passed to forward() + const Argument* out_; + /// Store `passType` passed to forward() + PassType passType_; + /// Layer forward function + std::vector> forward_; + /// Layer backward function + std::vector> backward_; +}; +} // namespace paddle diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/legacy/gserver/layers/ROIPoolLayer.cpp similarity index 100% rename from paddle/gserver/layers/ROIPoolLayer.cpp rename to paddle/legacy/gserver/layers/ROIPoolLayer.cpp diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/legacy/gserver/layers/ROIPoolLayer.h similarity index 100% rename from paddle/gserver/layers/ROIPoolLayer.h rename to paddle/legacy/gserver/layers/ROIPoolLayer.h diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/legacy/gserver/layers/RecurrentLayer.cpp similarity index 100% rename from paddle/gserver/layers/RecurrentLayer.cpp rename to paddle/legacy/gserver/layers/RecurrentLayer.cpp diff --git a/paddle/legacy/gserver/layers/RecurrentLayer.h b/paddle/legacy/gserver/layers/RecurrentLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..287ea27a0984729fde5b35aa0807e9f2b29f993f --- /dev/null +++ b/paddle/legacy/gserver/layers/RecurrentLayer.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "Layer.h" +#include "SequenceToBatch.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief RecurrentLayer takes 1 input layer. The output size is the same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. 
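Aside: as a rough standalone illustration of the per-sequence path described above (what forwardOneSequence computes), the sketch below applies out[i] = act(in[i] + out[i-1] * W) left to right over one sequence. It is a toy with scalar frames, a scalar weight, and tanh as the activation, so it only shows the data dependency, not the real matrix-based or batch-reorganized implementation; the reversed case would simply walk the frames from the end.

#include <cmath>
#include <cstdio>
#include <vector>

// Toy recurrence over one sequence: out[i] = tanh(in[i] + out[i-1] * w).
std::vector<double> forwardOneSequenceToy(const std::vector<double>& in, double w) {
  std::vector<double> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    double prev = (i == 0) ? 0.0 : out[i - 1] * w;  // first frame: out = act(in)
    out[i] = std::tanh(in[i] + prev);
  }
  return out;
}

int main() {
  std::vector<double> seq = {0.5, -0.2, 0.8};
  for (double v : forwardOneSequenceToy(seq, 0.3)) std::printf("%.4f ", v);
  std::printf("\n");
  return 0;
}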
+ */ + +class RecurrentLayer : public Layer { + public: + explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + + protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + virtual void forwardBatch(int batchSize, + size_t numSequences, + const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + virtual void backwardBatch(int batchSize, + size_t numSequences, + const int* starts); + + protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. + std::unique_ptr batchGrad_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp new file mode 100644 index 0000000000000000000000000000000000000000..39321245995fce2f2bd671593c028fd6038865de --- /dev/null +++ b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/legacy/gserver/layers/Layer.h" + +#include "paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * Recurrent layer group is a group of layers, which forward/backward one frame + * after previous frame forward/backward through all layers in layer group. + * It's automatically added by config_parser if some layers are defined + * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd. + */ +class RecurrentLayerGroup : public Layer { + public: + explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {} + + void initSubNetwork(NeuralNetwork* rootNetwork, + const ModelConfig& config, + const std::vector& parameterTypes, + bool useGpu) override; + + void forward(PassType passType) override { + REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str()); + const std::vector inArgs; + std::vector outArgs; + network_->forward(inArgs, &outArgs, passType); + } + void backward(const UpdateCallback& callback) override { + REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str()); + network_->backward(nullptr); + + for (auto& para : parameters_) { + para->incUpdate(callback); + } + } + + /** + * @see Layer.accessSubNetwork + */ + void accessSubNetwork( + const std::function& callback) override { + callback(*network_); + } + + private: + std::unique_ptr network_; +}; + +REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup); + +void RecurrentLayerGroup::initSubNetwork( + NeuralNetwork* rootNetwork, + const ModelConfig& config, + const std::vector& parameterTypes, + bool useGpu) { + setNeedGradient(true); + + network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork)); + ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) { + para->enableSharedType( + PARAMETER_VALUE, + rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE), + rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE)); + para->enableSharedType( + PARAMETER_GRADIENT, + rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT), + rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT)); + }; + network_->init(config, cb, parameterTypes, useGpu); + + for (auto paramId : network_->getParameterIds()) { + ParameterPtr parameter = rootNetwork->getParameters()[paramId]; + parameter->incShared(); + CHECK_EQ(parameter->getDeviceId(), getDeviceId()); + parameters_.push_back(parameter); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ResizeLayer.cpp b/paddle/legacy/gserver/layers/ResizeLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f8aad820f7d6d2be0af74d607d763912c3c0f2a --- /dev/null +++ b/paddle/legacy/gserver/layers/ResizeLayer.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/BaseMatrix.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * @brief A layer for resizing a minibatch matrix h*w to h'*w' + * @note + * origin matrix height * width) + * resize matrix: (height * width / size) * size + */ +class ResizeLayer : public Layer { + public: + explicit ResizeLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; +}; + +REGISTER_LAYER(resize, ResizeLayer); + +bool ResizeLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + + setNeedSequenceInfo(false); + return true; +} + +void ResizeLayer::forward(PassType passType) { + Layer::forward(passType); + const Argument& input = getInput(0); + size_t height = input.value->getHeight(); + size_t width = input.value->getWidth(); + CHECK_EQ((height * width) % getSize(), 0UL); + + reserveOutput(height * width / getSize(), getSize()); + MatrixPtr tmp = + Matrix::create(output_.value->getData(), height, width, false, useGpu_); + tmp->assign(*input.value); +} + +void ResizeLayer::backward(const UpdateCallback& callback) { + const Argument& input = getInput(0); + size_t height = input.value->getHeight(); + size_t width = input.value->getWidth(); + + if (!input.grad) { + return; + } + + MatrixPtr tmp = Matrix::create(input.grad->getData(), + height * width / getSize(), + getSize(), + false, + useGpu_); + tmp->add(*output_.grad); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/legacy/gserver/layers/RotateLayer.cpp similarity index 100% rename from paddle/gserver/layers/RotateLayer.cpp rename to paddle/legacy/gserver/layers/RotateLayer.cpp diff --git a/paddle/legacy/gserver/layers/RotateLayer.h b/paddle/legacy/gserver/layers/RotateLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..498e24372b8ca17c21ebecbe6a8c8b40217ab259 --- /dev/null +++ b/paddle/legacy/gserver/layers/RotateLayer.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
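Aside: ResizeLayer above is essentially a reshape: the same row-major buffer of height * width values is reinterpreted as (height * width / getSize()) rows of getSize() columns, with no data movement. A minimal standalone sketch of that index mapping (the Reshaped helper is made up for illustration and is not Paddle code):

#include <cassert>
#include <cstdio>
#include <vector>

// Reinterpret a row-major (height x width) matrix as
// (height * width / newWidth) x newWidth. Only the index mapping changes.
struct Reshaped {
  const std::vector<double>& data;
  size_t newWidth;
  double at(size_t r, size_t c) const { return data[r * newWidth + c]; }
};

int main() {
  size_t height = 2, width = 6, newWidth = 3;
  assert((height * width) % newWidth == 0);  // same check as ResizeLayer::forward
  std::vector<double> buf(height * width);
  for (size_t i = 0; i < buf.size(); ++i) buf[i] = static_cast<double>(i);

  Reshaped view{buf, newWidth};
  size_t newHeight = height * width / newWidth;  // 4 rows of 3 columns
  for (size_t r = 0; r < newHeight; ++r) {
    for (size_t c = 0; c < newWidth; ++c) std::printf("%4.0f", view.at(r, c));
    std::printf("\n");
  }
  return 0;
}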
*/ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * A layer for rotating a multi-channel feature map (M x N x C) in the spatial + * domain + * The rotation is 90 degrees in clock-wise for each channel + * \f[ + * y(j,i,:) = x(M-i-1,j,:) + * \f] + * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output. + * + * The config file api is rotate_layer + * + */ + +class RotateLayer : public Layer { + public: + explicit RotateLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); + + private: + int batchSize_; + int size_; + int height_; + int width_; + int channels_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/RowConvLayer.cpp b/paddle/legacy/gserver/layers/RowConvLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1961557dc2d2601091bb0e56fcd884d76d49bc0e --- /dev/null +++ b/paddle/legacy/gserver/layers/RowConvLayer.cpp @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "RowConvLayer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(row_conv, RowConvLayer); + +bool RowConvLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + contexLength_ = config_.inputs(0).row_conv_conf().context_length(); + + CHECK_EQ(inputLayers_.size(), 1UL); + weight_.reset(new Weight(contexLength_, getSize(), parameters_[0])); + createFunction(forward_, "RowConv", FuncConfig()); + createFunction(backward_, "RowConvGrad", FuncConfig()); + + return true; +} + +void RowConvLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr input = getInputValue(0); + size_t height = input->getHeight(); + size_t width = input->getWidth(); + CHECK_EQ(width, getSize()); + resetOutput(height, width); + + const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_); + MatrixPtr w = weight_->getW(); + wDims_ = TensorShape({w->getHeight(), w->getWidth()}); + + MatrixPtr outV = getOutputValue(); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), *startPos); + inputs.addArg(*w, wDims_); + outputs.addArg(*getOutputValue(), *startPos, ADD_TO); + + { + REGISTER_TIMER_INFO("RowConvForward", getName().c_str()); + forward_[0]->calc(inputs, outputs); + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void RowConvLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), 
*startPos); + inputs.addArg(*getInputValue(0), *startPos); + inputs.addArg(*weight_->getW(), wDims_); + + MatrixPtr inGrad = getInputGrad(0); + MatrixPtr wGrad = weight_->getWGrad(); + size_t h = getInputValue(0)->getHeight(); + size_t w = getInputValue(0)->getWidth(); + outputs.addArg( + inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)), + *startPos, + ADD_TO); + outputs.addArg( + wGrad ? (*wGrad) + : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)), + wDims_, + ADD_TO); + + { + REGISTER_TIMER_INFO("RowConvBackward", getName().c_str()); + backward_[0]->calc(inputs, outputs); + } + + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weight_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/legacy/gserver/layers/RowConvLayer.h similarity index 100% rename from paddle/gserver/layers/RowConvLayer.h rename to paddle/legacy/gserver/layers/RowConvLayer.h diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/legacy/gserver/layers/RowL2NormLayer.cpp similarity index 100% rename from paddle/gserver/layers/RowL2NormLayer.cpp rename to paddle/legacy/gserver/layers/RowL2NormLayer.cpp diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/legacy/gserver/layers/SamplingIdLayer.cpp similarity index 100% rename from paddle/gserver/layers/SamplingIdLayer.cpp rename to paddle/legacy/gserver/layers/SamplingIdLayer.cpp diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp similarity index 100% rename from paddle/gserver/layers/ScaleShiftLayer.cpp rename to paddle/legacy/gserver/layers/ScaleShiftLayer.cpp diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..70d44d2a7ef25df64beb2c861692436d842dac02 --- /dev/null +++ b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
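Aside: RowConvLayer above wires up the "RowConv"/"RowConvGrad" functions with a (context_length x dim) weight. As I understand the lookahead-convolution idea it implements, each output frame mixes the current frame and the following context_length - 1 frames of the same sequence, elementwise per feature dimension. The standalone sketch below is a hedged reading of that computation, not the actual CPU/GPU kernel, and the helper name rowConv is made up here.

#include <cstdio>
#include <vector>

// seq:    T frames, each with `dim` features (T x dim, row-major).
// weight: context x dim, one scalar per (offset, feature) pair.
// out[t][d] = sum over k of seq[t + k][d] * weight[k][d]; frames past the end
// of the sequence are simply skipped (truncated context).
std::vector<std::vector<double>> rowConv(const std::vector<std::vector<double>>& seq,
                                         const std::vector<std::vector<double>>& weight) {
  size_t T = seq.size(), dim = seq.empty() ? 0 : seq[0].size();
  size_t context = weight.size();
  std::vector<std::vector<double>> out(T, std::vector<double>(dim, 0.0));
  for (size_t t = 0; t < T; ++t)
    for (size_t k = 0; k < context && t + k < T; ++k)
      for (size_t d = 0; d < dim; ++d) out[t][d] += seq[t + k][d] * weight[k][d];
  return out;
}

int main() {
  std::vector<std::vector<double>> seq = {{1, 2}, {3, 4}, {5, 6}};
  std::vector<std::vector<double>> w = {{0.5, 0.5}, {0.25, 0.25}};  // context_length = 2
  for (const auto& row : rowConv(seq, w)) std::printf("%.2f %.2f\n", row[0], row[1]);
  return 0;
}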
*/ + +#include "ScaleSubRegionLayer.h" +#include "paddle/legacy/utils/Stat.h" +namespace paddle { + +REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer); + +bool ScaleSubRegionLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK_EQ(static_cast(inputLayers_.size()), 2); + auto& conf = config_.inputs(0).scale_sub_region_conf(); + value_ = conf.value(); + + createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_)); + createFunction( + backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_)); + + return true; +} + +void ScaleSubRegionLayer::forward(PassType passType) { + Layer::forward(passType); + auto in0 = getInput(0); + imgH_ = in0.getFrameHeight(); + imgW_ = in0.getFrameWidth(); + if (imgH_ == 0 || imgW_ == 0) { + auto& conf = config_.inputs(0).scale_sub_region_conf(); + imgH_ = conf.image_conf().img_size_y(); + imgW_ = conf.image_conf().img_size(); + } + MatrixPtr imgV = in0.value; + size_t batchSize = imgV->getHeight(); + size_t spatialSize = imgH_ * imgW_; + channelsNum_ = imgV->getWidth() / spatialSize; + shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_}); + + resetOutput(batchSize, imgV->getWidth()); + auto& out = getOutput(); + out.setFrameHeight(imgH_); + out.setFrameWidth(imgW_); + + MatrixPtr indicesV = getInputValue(1); + indicesShape_ = TensorShape({batchSize, 6}); + + REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str()); + BufferArgs inArgs; + BufferArgs outArgs; + inArgs.addArg(*imgV, shape_); + inArgs.addArg(*indicesV, indicesShape_); + outArgs.addArg(*out.value, shape_, ASSIGN_TO); + forward_[0]->calc(inArgs, outArgs); +} + +void ScaleSubRegionLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str()); + BufferArgs inArgs; + BufferArgs outArgs; + inArgs.addArg(*getOutputGrad(), shape_); + inArgs.addArg(*getInputValue(1), indicesShape_); + outArgs.addArg(*getInputGrad(0), shape_, ADD_TO); + backward_[0]->calc(inArgs, outArgs); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h similarity index 100% rename from paddle/gserver/layers/ScaleSubRegionLayer.h rename to paddle/legacy/gserver/layers/ScaleSubRegionLayer.h diff --git a/paddle/legacy/gserver/layers/ScalingLayer.cpp b/paddle/legacy/gserver/layers/ScalingLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a8286b6614c3cdfbd720d0719f939018f6ae9579 --- /dev/null +++ b/paddle/legacy/gserver/layers/ScalingLayer.cpp @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for each row of a matrix, multiplying with a element of a vector, + * which is used in NEURAL TURING MACHINE. 
+ * \f[ + * y.row[i] = w[i] * x.row[i] + * \f] + * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is + * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output. + * + * The config file api is scaling_layer. + */ + +class ScalingLayer : public Layer { + public: + explicit ScalingLayer(const LayerConfig& config) : Layer(config) {} + + ~ScalingLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(scaling, ScalingLayer); + +bool ScalingLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2U); + + return true; +} + +void ScalingLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr weightV = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + + size_t batchSize = inV1->getHeight(); + size_t dataDim = inV1->getWidth(); + + CHECK_EQ(dataDim, getSize()); + CHECK_EQ(weightV->getWidth(), 1U); + CHECK_EQ(weightV->getHeight(), batchSize); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str()); + // outV += inV1 * weight + outV->addRowScale(0, *inV1, *weightV); + } +} + +void ScalingLayer::backward(const UpdateCallback& callback) { + MatrixPtr weightV = getInputValue(0); + MatrixPtr inV1 = getInputValue(1); + MatrixPtr inG0 = getInputGrad(0); + MatrixPtr inG1 = getInputGrad(1); + MatrixPtr outG = getOutputGrad(); + + { + REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str()); + + if (inG0) { + // inG0 += outG .* inV1 + inG0->rowDotMul(0, *outG, *inV1); + } + + if (inG1) { + // inG1 += outG * weight; + inG1->addRowScale(0, *outG, *weightV); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/legacy/gserver/layers/ScalingProjection.cpp similarity index 100% rename from paddle/gserver/layers/ScalingProjection.cpp rename to paddle/legacy/gserver/layers/ScalingProjection.cpp diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72fb06814884cc2bcca2c600105077d8cf1459c5 --- /dev/null +++ b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp @@ -0,0 +1,336 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
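Aside: ScalingLayer above scales each row of the data input by the matching entry of the per-sample weight vector, and its backward pass produces exactly the two gradients sketched below. This is a standalone illustration with plain nested vectors (the names scaleRows/scaleRowsGrad are made up), not the Matrix-based implementation.

#include <cstdio>
#include <vector>

using Mat = std::vector<std::vector<double>>;  // batchSize x dataDim, row-major

// Forward: y.row[i] = w[i] * x.row[i]
Mat scaleRows(const Mat& x, const std::vector<double>& w) {
  Mat y(x.size(), std::vector<double>(x[0].size()));
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < x[i].size(); ++j) y[i][j] = w[i] * x[i][j];
  return y;
}

// Backward: dL/dw[i] = dot(dy.row[i], x.row[i]);  dL/dx.row[i] += w[i] * dy.row[i]
void scaleRowsGrad(const Mat& x, const std::vector<double>& w, const Mat& dy,
                   std::vector<double>& dw, Mat& dx) {
  for (size_t i = 0; i < x.size(); ++i)
    for (size_t j = 0; j < x[i].size(); ++j) {
      dw[i] += dy[i][j] * x[i][j];
      dx[i][j] += w[i] * dy[i][j];
    }
}

int main() {
  Mat x = {{1, 2, 3}, {4, 5, 6}};
  std::vector<double> w = {0.5, 2.0};
  Mat y = scaleRows(x, w);
  std::printf("%.1f %.1f %.1f | %.1f %.1f %.1f\n",
              y[0][0], y[0][1], y[0][2], y[1][0], y[1][1], y[1][2]);

  std::vector<double> dw(2, 0.0);
  Mat dx(2, std::vector<double>(3, 0.0));
  scaleRowsGrad(x, w, /*dy=*/y, dw, dx);
  std::printf("dw: %.1f %.1f\n", dw[0], dw[1]);
  return 0;
}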
*/ + +#include "SelectiveFullyConnectedLayer.h" +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer); + +bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + inputNum_ = inputLayers_.size(); + if (config_.has_selected_colums()) { + inputNum_ -= 1; + } + for (size_t i = 0; i < inputNum_; i++) { + size_t height = inputLayers_[i]->getSize(); + size_t width = getSize(); + // NOTE weight is transpoed + weights_.emplace_back(new Weight(width, height, parameters_[i])); + } + + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + fullOutput_ = false; + + return true; +} + +void SelectiveFullyConnectedLayer::prefetch() {} + +void SelectiveFullyConnectedLayer::reserveOutput(size_t height, + size_t width, + size_t nnz) { + bool flag = (passType_ == PASS_TEST && + config_.selective_fc_pass_generation() && !fullOutput_); + SetDevice device(output_.deviceId); + if (flag) { + // output_.value is sparse matrix + if (dynamic_cast(output_.value.get()) || + dynamic_cast(output_.value.get())) { + output_.value = nullptr; + } + Matrix::resizeOrCreateSparseMatrix(output_.value, + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); + output_.value->copyFrom(*selCols_); + interOutput_ = output_.value; + } else { + if (fullOutput_) { + // output_.value is dense matrix + if (dynamic_cast(output_.value.get()) || + dynamic_cast(output_.value.get())) { + output_.value = nullptr; + } + Matrix::resizeOrCreate(output_.value, + height, + width, + /*trans=*/false, + /*useGpu=*/useGpu_); + interOutput_ = output_.value; + } else { + // output_.value is dense matrix, but width = nnz /height + CHECK_EQ(nnz % height, 0U); + CHECK(nnz / height); + Matrix::resizeOrCreate(output_.value, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); + interOutput_ = Matrix::createSparseMatrix(output_.value->getData(), + selCols_->getRows(), + selCols_->getCols(), + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); + } + } + interOutput_->zeroMem(); + + if (passType_ != PASS_TEST && needGradient()) { + CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a " + "same number of selected columns."; + CHECK(nnz / height) + << "during training, " + "each sample must have at least one column selected."; + Matrix::resizeOrCreate(output_.grad, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); + output_.grad->zeroMem(); + } +} + +void SelectiveFullyConnectedLayer::forward(PassType passType) { + REGISTER_TIMER("selective_fc.forward"); + Layer::forward(passType); + + getSelectiveCols(); + size_t height = getInput(0).getBatchSize(); + size_t width = getSize(); + size_t nnz = height * width; + if (!fullOutput_) { + CHECK(selCols_); + CHECK(height == selCols_->getHeight()); + CHECK(width == selCols_->getWidth()); + nnz = selCols_->getElementCnt(); + } + + // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually + // this outV should be used as input of MaxIdLayer and softmax activation + reserveOutput(height, width, nnz); + + bool flag = true; + for (size_t i = 0; i < inputNum_; i++) { + MatrixPtr input = getInputValue(i); + MatrixPtr weight = weights_[i]->getW(); + size_t hsize = 
input->getHeight(); + size_t wsize = weight->getHeight(); + real scaleT = i == 0 ? real(0) : real(1); + + flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() && + !fullOutput_; + if (flag) { + // if the indecies are highly sparse, + // manully compute the multiplication of + // the input vector and the selected rows. + REGISTER_TIMER("selective.plain"); + interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT); + } else { + // if the indecies is not sparse enough, + // use full mul instead + REGISTER_TIMER("selective.mul"); + if (fullOutput_) { + interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT); + } else { + Matrix::resizeOrCreate(mmat_, + hsize, + wsize, + /*trans=*/false, + /*useGpu=*/useGpu_); + mmat_->mul(*input, *weight->getTranspose()); + interOutput_->add3(mmat_); + } + } + } + + if (biases_) { + interOutput_->addBias(*(biases_->getW()), 1); + } + + flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() && + !fullOutput_); + if (flag) { + // during generation, output of this layer is a sparse csr matrix, + // which is probably the input of maxid layer + // if the model is trained with multi-class-cross-entroy-with-selfnorm, + // activiation of this layer should be exponential, not softmax. + + Argument arg; + arg.value = Matrix::create(interOutput_->getData(), + 1, + nnz, + /*trans=*/false, + /*useGpu=*/useGpu_); + //! TODO(yuyang18): Why we cannot invoke forwardActivation here? + activation_->forward(arg).check(); + } else /* train and test in train, not generating */ { + // during training, this layer output value is *Matrix*, which is input of + // eg. multi-class-cross-entropy + + // while training, every sample has a equal number of selected + // columns to be activated. + // note indices of multi-class-cross-entropy need to be remapped + // to this index. + // e.g. 
sample = [1,3,5] and 3 is gold, then label is 1 + + forwardActivation(); + } +} + +void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) { + backwardActivation(); + MatrixPtr oGrad = getOutputGrad(); + if (!fullOutput_) { + interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(), + interOutput_->getRows(), + interOutput_->getCols(), + interOutput_->getHeight(), + interOutput_->getWidth(), + interOutput_->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); + } else { + interOutGrad_ = Matrix::create(oGrad->getData(), + oGrad->getHeight(), + oGrad->getWidth(), + /*trans=*/false, + /*useGpu=*/useGpu_); + } + + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*interOutGrad_, 1); + biases_->getParameterPtr()->incUpdate(callback); + } + + // backward is different from FullyConnectedLayer + // because the weight is transposed + for (size_t i = 0; i < inputNum_; i++) { + AsyncGpuBlock block; + MatrixPtr preGrad = getInputGrad(i); + if (preGrad) { + REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); + preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1); + } + + MatrixPtr wGrad = weights_[i]->getWGrad(); + if (wGrad) { + REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); + MatrixPtr input = getInputValue(i); + wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1); + } + + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } +} + +void paddle::SelectiveFullyConnectedLayer::fillSelectiveData( + const std::shared_ptr>>& candidates) { + if (candidates == nullptr) { + fillFullySelectiveData(); + return; + } + + size_t sampleNum = candidates->size(); + size_t outputWidth = getSize(); + size_t nnz = + std::accumulate(candidates->begin(), + candidates->end(), + 0UL, + [](size_t a, const std::pair& arr) { + return a + arr.second; + }); + + Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_, + sampleNum, + outputWidth, + nnz, + NO_VALUE, + SPARSE_CSR, + false, + false); + CHECK(this->cpuSelCols_ != nullptr); + CpuSparseMatrixPtr selCols = + std::dynamic_pointer_cast(cpuSelCols_); + int* rowOffsets = selCols->getRows(); + int* colIndices = selCols->getCols(); + + rowOffsets[0] = 0; + int idx = 0; + for (size_t i = 0; i < sampleNum; ++i) { + if ((*candidates)[i].second > 0) { + rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second; + for (size_t j = 0; j < (*candidates)[i].second; ++j) { + colIndices[idx] = (*candidates)[i].first[j]; + idx++; + } + } else { + rowOffsets[i + 1] = rowOffsets[i]; + } + } + + CHECK_EQ(static_cast(rowOffsets[sampleNum]), nnz); + if (!useGpu_) { + this->selCols_ = this->cpuSelCols_; + } else { + Matrix::resizeOrCreateSparseMatrix(this->selCols_, + sampleNum, + outputWidth, + nnz, + NO_VALUE, + SPARSE_CSR, + false, + true); + this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1); + hl_stream_synchronize(HPPL_STREAM_1); + } + + fullOutput_ = false; +} + +void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() { + if (config_.has_selected_colums()) { + this->selCols_ = inputLayers_[inputNum_]->getOutputValue(); + fullOutput_ = false; + } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) { + this->fillFullySelectiveData(); + } // else selCols_ is initialized by fillSelectiveData +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h 
b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..3ba04d9b2ae208eda021a451e94856d9993dc126 --- /dev/null +++ b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * @brief The SelectiveFullyConnectedLayer class + * + * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it + * requires an additional input to indicate several selected columns, and only + * compute the multiplications between the input matrices and the selected + * columns of the parameter matrices of this layer. If the selected columns is + * not specified, SelectiveFullyConnected layer acts exactly like + * FullyConnectedLayer. + * + * The config file api is selective_fc_layer. + */ +class SelectiveFullyConnectedLayer : public Layer { + protected: + WeightList weights_; + std::unique_ptr biases_; + + private: + /** + * Get selected columns each forward. + */ + void getSelectiveCols(); + + MatrixPtr mmat_; + /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns. + MatrixPtr cpuSelCols_; + /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points + /// to cpuSelCols_. + MatrixPtr selCols_; + size_t inputNum_; + + /// interOutput_ shared same memory with output_.value. + MatrixPtr interOutput_; + + /// if fullOutput_ is false, interOutGrad_ sparse matrix + MatrixPtr interOutGrad_; + + /// if true, means output_.value is the same as Fc Layer + bool fullOutput_; + + public: + explicit SelectiveFullyConnectedLayer(const LayerConfig& config) + : Layer(config), selCols_(nullptr) {} + + ~SelectiveFullyConnectedLayer() {} + void prefetch() override; + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + Weight& getWeight(int idx) { return *weights_[idx]; } + + /** + * @brief Resize the output matrix size. + * And reset value to zero + */ + void reserveOutput(size_t height, size_t width, size_t nnz); + + /** + * @brief Fill candidates to select several activations as output. + * @param candidates specifies several selected columns of the parameter + * matrices of this layer. + * Multiplications only between the input matrices and the selected columns + * are computed. + * If the candidates is a nullptr, selective fc layer acts exactly like the + * fully connected layer. 
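Aside: the point of the selective path described above is that, per sample, only the dot products for the selected output columns are evaluated (the weight matrix is stored transposed, one row per output neuron). A standalone sketch of that per-sample computation with explicit selected-column lists follows; selectiveFc is a made-up name for illustration, not the layer's sparse-matrix code path.

#include <cstdio>
#include <vector>

// in:      batch of input rows (batch x inDim)
// wT:      transposed weight, one row per output neuron (outDim x inDim)
// selCols: for every sample, the output columns that were selected
// Returns, per sample, one value per selected column:
//   out[i][k] = dot(in[i], wT[selCols[i][k]])
std::vector<std::vector<double>> selectiveFc(
    const std::vector<std::vector<double>>& in,
    const std::vector<std::vector<double>>& wT,
    const std::vector<std::vector<int>>& selCols) {
  std::vector<std::vector<double>> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    for (int col : selCols[i]) {
      double v = 0.0;
      for (size_t d = 0; d < in[i].size(); ++d) v += in[i][d] * wT[col][d];
      out[i].push_back(v);
    }
  }
  return out;
}

int main() {
  std::vector<std::vector<double>> in = {{1, 2}, {3, 4}};
  std::vector<std::vector<double>> wT = {{1, 0}, {0, 1}, {1, 1}};  // 3 output neurons
  std::vector<std::vector<int>> sel = {{0, 2}, {1}};  // sample 0 -> cols 0,2; sample 1 -> col 1
  auto out = selectiveFc(in, wT, sel);
  std::printf("%.1f %.1f | %.1f\n", out[0][0], out[0][1], out[1][0]);
  return 0;
}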
+ * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH + */ + void fillSelectiveData( + const std::shared_ptr>>& candidates); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + private: + /** + * @brief Make SelectiveFC act as FullyConnectedLayer + */ + void fillFullySelectiveData() { fullOutput_ = true; } +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7b598e11acde533564f6eda49d78ea8df99a5056 --- /dev/null +++ b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp @@ -0,0 +1,189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for concatenating the first sequence with the second sequence + * Input: two sequences each containing the same number of instances + * seq1 = [a1, a2, ..., an] + * seq2 = [b1, b2, ..., bn] + * Output: a concatenated sequence of the two input sequences + * out = [a1, b1, a2, b2, ..., an, bn] + */ + +class SequenceConcatLayer : public Layer { + protected: + std::unique_ptr biases_; + + public: + explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {} + + ~SequenceConcatLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(seqconcat, SequenceConcatLayer); + +bool SequenceConcatLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + // sequene concatenation layer should have exactly 2 inputs + CHECK_EQ(2U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + setNeedSequenceInfo(false); + return true; +} + +void SequenceConcatLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t dim = getSize(); + + const Argument& input1 = getInput(0); + size_t numSequences1 = input1.getNumSequences(); + auto startPositions1 = input1.sequenceStartPositions->getVector(false); + + const Argument& input2 = getInput(1); + size_t numSequences2 = input2.getNumSequences(); + auto startPositions2 = input2.sequenceStartPositions->getVector(false); + + CHECK_EQ(dim, input1.value->getWidth()); + CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize()); + CHECK_EQ(numSequences1, startPositions1->getSize() - 1); + + CHECK_EQ(dim, input2.value->getWidth()); + CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize()); + CHECK_EQ(numSequences2, startPositions2->getSize() 
- 1); + + CHECK_EQ(numSequences1, numSequences2); + + MatrixPtr inputValue1 = getInputValue(0); + MatrixPtr inputValue2 = getInputValue(1); + + // reset output + reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim); + + MatrixPtr outputValue = getOutputValue(); + + const int* starts1 = startPositions1->getData(); + const int* starts2 = startPositions2->getData(); + + { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str()); + + size_t offset = 0; + size_t leftNumIns = 0; + size_t rightNumIns = 0; + for (size_t seqId = 0; seqId < numSequences1; ++seqId) { + leftNumIns = starts1[seqId + 1] - starts1[seqId]; + outputValue->subMatrix(offset, leftNumIns) + ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns))); + offset += leftNumIns; + + rightNumIns = starts2[seqId + 1] - starts2[seqId]; + outputValue->subMatrix(offset, rightNumIns) + ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns))); + offset += rightNumIns; + } + + // modify the sequenceStartPositions + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences1 + 1, false); + + int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); + + for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) { + tgtBuf[seqId] = starts1[seqId] + starts2[seqId]; + } + } + + if (biases_.get() != NULL) { + MatrixPtr outV = getOutputValue(); + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ + forwardActivation(); +} + +void SequenceConcatLayer::backward(const UpdateCallback& callback) { + /* activation */ + backwardActivation(); + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } + + MatrixPtr inputGrad1 = getInputGrad(0); + MatrixPtr inputGrad2 = getInputGrad(1); + MatrixPtr outputGrad = getOutputGrad(); + auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); + auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false); + + size_t numSequences1 = startPositions1->getSize() - 1; + size_t numSequences2 = startPositions2->getSize() - 1; + + CHECK_EQ(numSequences1, numSequences2); + + const int* starts1 = startPositions1->getData(); + const int* starts2 = startPositions2->getData(); + + { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str()); + + size_t offset = 0; + size_t leftNumIns = 0; + size_t rightNumIns = 0; + for (size_t seqId = 0; seqId < numSequences1; ++seqId) { + leftNumIns = starts1[seqId + 1] - starts1[seqId]; + if (inputGrad1) { + inputGrad1->subMatrix(starts1[seqId], leftNumIns) + ->add(*(outputGrad->subMatrix(offset, leftNumIns))); + } + offset += leftNumIns; + + rightNumIns = starts2[seqId + 1] - starts2[seqId]; + if (inputGrad2) { + inputGrad2->subMatrix(starts2[seqId], rightNumIns) + ->add(*(outputGrad->subMatrix(offset, rightNumIns))); + } + offset += rightNumIns; + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8735d71ba372de894c9852229ed8c77537792ea0 --- /dev/null +++ b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
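Aside: SequenceConcatLayer above interleaves the two inputs sequence by sequence (the whole i-th sequence of input 1, then the whole i-th sequence of input 2) and derives the new start offsets as the element-wise sum of the two input offset arrays. The standalone sketch below mirrors that logic with plain ints standing in for instances; seqConcat is a made-up helper name.

#include <cstdio>
#include <vector>

// starts1/starts2 are CSR-style offsets with starts[numSeq] == total instances.
void seqConcat(const std::vector<int>& in1, const std::vector<int>& starts1,
               const std::vector<int>& in2, const std::vector<int>& starts2,
               std::vector<int>& out, std::vector<int>& outStarts) {
  size_t numSeq = starts1.size() - 1;
  out.clear();
  outStarts.resize(numSeq + 1);
  for (size_t s = 0; s < numSeq; ++s) {
    outStarts[s] = starts1[s] + starts2[s];  // new offsets are just the sums
    for (int i = starts1[s]; i < starts1[s + 1]; ++i) out.push_back(in1[i]);
    for (int i = starts2[s]; i < starts2[s + 1]; ++i) out.push_back(in2[i]);
  }
  outStarts[numSeq] = starts1[numSeq] + starts2[numSeq];
}

int main() {
  std::vector<int> a = {1, 2, 3}, aStarts = {0, 2, 3};     // sequences {1,2}, {3}
  std::vector<int> b = {10, 20, 30}, bStarts = {0, 1, 3};  // sequences {10}, {20,30}
  std::vector<int> out, outStarts;
  seqConcat(a, aStarts, b, bStarts, out, outStarts);
  for (int v : out) std::printf("%d ", v);  // 1 2 10 3 20 30
  std::printf("| %d %d %d\n", outStarts[0], outStarts[1], outStarts[2]);
  return 0;
}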
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/Logging.h" + +#include "SequencePoolLayer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for extracting the last instance of the input sequence. + * Input: a sequence + * If SequenceLevel = kNonseq: + * Output: a sequence containing only the last instance of the input sequence + * If stride_ > 0: + * Output: a shorten sequence. Stride is the step size by which we slide a + * window upon the input sequence, and getting last instance + * operation is then applied to each interval independently. + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: a sequence containing only the last instance of each sub-sequence + * of the input sequence + * + * The config file api is last_seq and first_seq. + */ + +class SequenceLastInstanceLayer : public SequencePoolLayer { + protected: + MatrixPtr tmpSrc_; + MatrixPtr tmpDest_; + std::vector instanceIds_; + + public: + explicit SequenceLastInstanceLayer(const LayerConfig& config) + : SequencePoolLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); + +bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + SequencePoolLayer::init(layerMap, parameterMap); + reversed_ = config_.select_first(); + + tmpSrc_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + tmpDest_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + + return true; +} + +void SequenceLastInstanceLayer::forward(PassType passType) { + SequencePoolLayer::forward(passType); + + auto starts = startPositions_->getData(false); + MatrixPtr inputValue = getInputValue(0); + MatrixPtr outputValue = getOutputValue(); + + { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); + + instanceIds_.clear(); + for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { + int insId = reversed_ ? 
starts[seqId] : starts[seqId + 1] - 1; + instanceIds_.push_back(insId); + + outputValue->subMatrix(seqId, 1, tmpDest_) + ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); + } + } + + if (biases_.get() != NULL) { + outputValue->addBias(*(biases_->getW()), 1); + } + + /* activation, should set to 'linear' in most cases */ + forwardActivation(); +} + +void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { + SequencePoolLayer::backward(callback); + + MatrixPtr inputGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + + if (inputGrad) { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str()); + + for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { + inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_) + ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_))); + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.cpp b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..243b795db428ede1fbb39a5054485a198a14e00c --- /dev/null +++ b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SequencePoolLayer.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +bool SequencePoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + // seqlastins/max/average layer should have exactly 1 input + CHECK_EQ(1U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + // transform to which sequence type + if (config_.trans_type() == "non-seq") { + type_ = kNonSeq; + } else if (config_.trans_type() == "seq") { + type_ = kSeq; + } else { + LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); + } + stride_ = config_.seq_pool_stride(); + setNeedSequenceInfo(false); + return true; +} + +void SequencePoolLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + CHECK(input.hasSeq() || input.hasSubseq()) + << "Input should be a sequence or subsequence for layer " << getName(); + + newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences(); + size_t dim = getSize(); + // check + CHECK_EQ(dim, input.value->getWidth()); + startPositions_ = + type_ ? input.subSequenceStartPositions : input.sequenceStartPositions; + auto starts = startPositions_->getVector(false); + CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); + CHECK_EQ(newBatchSize_, starts->getSize() - 1); + + /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, + * thus, in this case, output_ has no sequenceStartPositions. 
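Aside: the instance selection done by SequenceLastInstanceLayer above reduces to one index per sequence: starts[seqId + 1] - 1 for last_seq, or starts[seqId] when select_first is set. A tiny standalone sketch of that rule (pickInstances is a made-up name for illustration):

#include <cstdio>
#include <vector>

// Pick one instance index per sequence: the last one by default,
// the first one when selectFirst is set.
std::vector<int> pickInstances(const std::vector<int>& starts, bool selectFirst) {
  std::vector<int> ids;
  for (size_t s = 0; s + 1 < starts.size(); ++s)
    ids.push_back(selectFirst ? starts[s] : starts[s + 1] - 1);
  return ids;
}

int main() {
  std::vector<int> starts = {0, 3, 4, 7};  // three sequences of length 3, 1, 3
  for (int id : pickInstances(starts, /*selectFirst=*/false)) std::printf("%d ", id);  // 2 3 6
  std::printf("\n");
  return 0;
}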
+ * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this + * case, we should compute the new sequenceStartPositions. + */ + if (type_) { + CHECK(input.subSequenceStartPositions) + << "when trans_type = seq, input must hasSubseq"; + output_.degradeSequence(input); + } + if (stride_ > 0) { + CHECK_EQ(input.hasSubseq(), 0UL) + << "sequence stride pooling is invalid for hasSubseq now"; + output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_); + newBatchSize_ = startPositions_->getSize() - 1; + } + + resetOutput(newBatchSize_, dim); +} + +void SequencePoolLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.h b/paddle/legacy/gserver/layers/SequencePoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..1c019b313093f4ac717e0fc57a9aa798e2951580 --- /dev/null +++ b/paddle/legacy/gserver/layers/SequencePoolLayer.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer. + * + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = seqlastin/average/max_{for each instance in this + * sequence}{input[i]} + * If stride_ > 0: + * Check input sequence must not have sub-sequence + * Output: a shorten sequence. Stride is the step size by which we slide + * a window upon the input sequence, and the pooling operation + * is then applied to each interval independently. + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: output size is the number of input sub-sequences + * output[i] = seqlastin/average/max_{for each instance in this + * sub-sequence}{input[i]} + * + * The config file api is pooling_layer. + */ + +class SequencePoolLayer : public Layer { + protected: + int type_; + std::unique_ptr biases_; + enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; + size_t newBatchSize_; + ICpuGpuVectorPtr startPositions_; + int stride_; + // Whether the input sequence is reversed or not. 
+ bool reversed_ = false; + + public: + explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e3d40cab50af1d6eafe28331cdd481ee2b187a56 --- /dev/null +++ b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for reshaping the sequence. Assume the input sequence has + * T instances, the dimension of each instance is M, and the input + * reshape_dim is N, then the output sequence has T*M/N instances, + * the dimension of each instance is N. + * + * Note that T*M/N must be an integer. + */ + +class SequenceReshapeLayer : public Layer { + protected: + std::unique_ptr biases_; + + MatrixPtr reshapedOutputGrad; + + public: + explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(seqreshape, SequenceReshapeLayer); + +bool SequenceReshapeLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(1U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + setNeedSequenceInfo(false); + return true; +} + +void SequenceReshapeLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + + size_t inDim = input.value->getWidth(); + size_t outDim = getSize(); + + size_t numSequences = input.getNumSequences(); + + // by default, we assume each instance as a sequence + IVectorPtr seqStarts; + IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false); + int* startsData = seqStarts->getData(); + for (int i = 0; i < input.getBatchSize() + 1; i++) { + startsData[i] = i; + } + const int* starts = startsData; + + // if there is sequence, then use start positions + if (input.sequenceStartPositions) { + auto startPositions = input.sequenceStartPositions->getVector(false); + starts = startPositions->getData(); + CHECK_EQ(starts[numSequences], input.getBatchSize()); + CHECK_EQ(numSequences, startPositions->getSize() - 1); + } + + for (size_t seqID = 0; seqID < numSequences; seqID++) { 
+ size_t inNumIns = starts[seqID + 1] - starts[seqID]; + size_t outNumIns = inNumIns * inDim / outDim; + CHECK_EQ(outNumIns * outDim, inNumIns * inDim); + } + + MatrixPtr inputValue = getInputValue(0); + + // reset output + reserveOutput(inputValue->getHeight() * inDim / outDim, outDim); + MatrixPtr outputValue = getOutputValue(); + + { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str()); + + outputValue->copyFrom(*inputValue); + + // modify the sequenceStartPositions + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences + 1, false); + + int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); + + for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) { + tgtBuf[seqId] = starts[seqId] * inDim / outDim; + } + } + + if (biases_.get() != NULL) { + MatrixPtr outV = getOutputValue(); + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ + forwardActivation(); +} + +void SequenceReshapeLayer::backward(const UpdateCallback& callback) { + /* activation */ + backwardActivation(); + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } + + MatrixPtr inputGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str()); + + if (inputGrad) { + Matrix::resizeOrCreate(reshapedOutputGrad, + inputGrad->getHeight(), + inputGrad->getWidth(), + false, + useGpu_); + reshapedOutputGrad->copyFrom(*outputGrad); + inputGrad->add(*reshapedOutputGrad); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3ed51c4ef2f6e91da94f302c14d1c0cc555886aa --- /dev/null +++ b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
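/* A hedged sketch of the SequenceReshapeLayer start-position arithmetic above:
 * when the instance width changes from inDim to outDim, every sequence start
 * position is rescaled by inDim/outDim, which only works when each prefix row
 * count times inDim is divisible by outDim. reshapeStarts is a hypothetical
 * helper in plain C++, not the Paddle API. */
#include <cassert>
#include <iostream>
#include <vector>

// Rescale sequence start positions for a width change inDim -> outDim.
std::vector<int> reshapeStarts(const std::vector<int>& starts,
                               size_t inDim, size_t outDim) {
  std::vector<int> out(starts.size());
  for (size_t i = 0; i < starts.size(); ++i) {
    assert(static_cast<size_t>(starts[i]) * inDim % outDim == 0);
    out[i] = static_cast<int>(starts[i] * inDim / outDim);
  }
  return out;
}

int main() {
  // Two sequences of 2 and 4 instances, width 6, reshaped to width 3:
  // instance counts double, so start positions double as well.
  std::vector<int> starts = {0, 2, 6};
  std::vector<int> newStarts = reshapeStarts(starts, /*inDim=*/6, /*outDim=*/3);
  for (int s : newStarts) std::cout << s << " ";  // prints: 0 4 12
  std::cout << "\n";
  return 0;
}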
*/ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +class SequenceSliceLayer : public Layer { + public: + explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + private: + /* + * TODO(caoying) + * In PaddlePaddle, currently all matrices are real number types, + * but the second and the (optional) third input, which are the + * selected indices of the given sequence used to trim the sequence, are actually + * filled with int types, so storing int-type information in real-number + * matrices is very dangerous: real numbers will be converted to int + * types, and if a user fills this matrix himself, invalid data may occur. + */ + + MatrixPtr startIdsOnCpu_; + MatrixPtr endIdsOnCpu_; + + std::vector selectedRows_; + IVectorPtr rowIndice_; + std::vector> inputSeqInfoVec_; + std::vector outSubSeqStartPos_; + std::vector outSeqStartPos_; + + void checkInputs(); + void copySliceIdsToCpu(); + void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends); +}; + +REGISTER_LAYER(seq_slice, SequenceSliceLayer); + +bool SequenceSliceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_GE(inputLayers_.size(), 2U); + CHECK_LE(inputLayers_.size(), 3U); + + setNeedSequenceInfo(false); + return true; +} + +void SequenceSliceLayer::checkInputs() { + const Argument& inputSeq = getInput(0); + CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer " + << "must be a sequence."; + const MatrixPtr indices1 = getInputValue(1); + CHECK_EQ( + indices1->getHeight(), + static_cast(inputSeq.hasSubseq() ?
inputSeq.getNumSubSequences() + : inputSeq.getNumSequences())) + << "Height of the second input should be equal to number of sequence " + << "in the first input."; + if (inputLayers_.size() == 3) { + const MatrixPtr indices2 = getInputValue(2); + CHECK_EQ(indices2->getHeight(), indices1->getHeight()) + << "start indices and end indices should have the same height."; + CHECK_EQ(indices2->getWidth(), indices1->getWidth()) + << "start indices and end indices should have the same Width."; + } +} + +void SequenceSliceLayer::copySliceIdsToCpu() { + const MatrixPtr indices1 = getInputValue(1); + if (inputLayers_.size() == 2U) { + if (config_.select_first()) { + Matrix::resizeOrCreate(startIdsOnCpu_, + indices1->getHeight(), + indices1->getWidth(), + false /* trans */, + false /* useGpu */); + startIdsOnCpu_->copyFrom(*indices1); + endIdsOnCpu_ = nullptr; + } else { + Matrix::resizeOrCreate(endIdsOnCpu_, + indices1->getHeight(), + indices1->getWidth(), + false /* trans */, + false /* useGpu */); + endIdsOnCpu_->copyFrom(*indices1); + startIdsOnCpu_ = nullptr; + } + } else if (inputLayers_.size() == 3U) { + Matrix::resizeOrCreate(startIdsOnCpu_, + indices1->getHeight(), + indices1->getWidth(), + false /* trans */, + false /* useGpu */); + startIdsOnCpu_->copyFrom(*indices1); + + const MatrixPtr indices2 = getInputValue(2); + Matrix::resizeOrCreate(endIdsOnCpu_, + indices2->getHeight(), + indices2->getWidth(), + false /* trans */, + false /* useGpu */); + endIdsOnCpu_->copyFrom(*indices2); + } +} + +void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts, + const MatrixPtr ends) { + CHECK(starts || ends) << "At least one of the start or end indices " + << "should be given."; + + bool hasSubseq = getInput(0).hasSubseq(); + + outSeqStartPos_.resize(1, 0); + outSubSeqStartPos_.resize(1, 0); + selectedRows_.clear(); + + size_t beamSize = starts ? starts->getWidth() : ends->getWidth(); + size_t rowIdx = 0; + for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) { + for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) { + for (size_t k = 0; k < beamSize; ++k) { + if (starts && starts->getElement(rowIdx, k) == -1.) break; + if (ends && ends->getElement(rowIdx, k) == -1.) break; + + int begPos = inputSeqInfoVec_[i][j]; + if (starts) begPos += starts->getElement(rowIdx, k); + + int endPos = inputSeqInfoVec_[i][j + 1] - 1; + if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k); + + int seqLen = endPos - begPos + 1; + CHECK_GT(seqLen, 0); + for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m); + hasSubseq + ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen) + : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen); + } + rowIdx++; + } + if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back()); + } + + if (useGpu_) { + rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); + rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); + } else { + rowIndice_ = + IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); + } + + // create the sequence information for the output. 
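/* A simplified sketch of the row-selection loop in calSelectedRows above,
 * assuming a single (begin, end) offset pair per sequence and ignoring the
 * beam dimension and the -1 sentinels handled by the real layer. Plain
 * C++/STL, hypothetical helper name. */
#include <iostream>
#include <vector>

// For each sequence [seqStarts[i], seqStarts[i+1]), keep the rows
// [seqStarts[i] + begOffset[i], seqStarts[i] + endOffset[i]] inclusive,
// mirroring the begPos/endPos computation above.
std::vector<int> sliceRows(const std::vector<int>& seqStarts,
                           const std::vector<int>& begOffset,
                           const std::vector<int>& endOffset) {
  std::vector<int> rows;
  for (size_t i = 0; i + 1 < seqStarts.size(); ++i) {
    int beg = seqStarts[i] + begOffset[i];
    int end = seqStarts[i] + endOffset[i];
    for (int r = beg; r <= end; ++r) rows.push_back(r);
  }
  return rows;
}

int main() {
  std::vector<int> starts = {0, 4, 9};  // two sequences: rows 0..3 and 4..8
  std::vector<int> rows = sliceRows(starts, {1, 0}, {2, 3});
  for (int r : rows) std::cout << r << " ";  // prints: 1 2 4 5 6 7
  std::cout << "\n";
  return 0;
}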
+ ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, outSeqStartPos_.size(), false); + output_.sequenceStartPositions->copyFrom( + outSeqStartPos_.data(), outSeqStartPos_.size(), false); + + if (hasSubseq) { + ICpuGpuVector::resizeOrCreate( + output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false); + output_.subSequenceStartPositions->copyFrom( + outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false); + } +} + +void SequenceSliceLayer::forward(PassType passType) { + Layer::forward(passType); + checkInputs(); + + const Argument& inputSeq = getInput(0); + inputSeqInfoVec_.clear(); + Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, + inputSeq.subSequenceStartPositions, + inputSeqInfoVec_); + if (!useGpu_) { + if (inputLayers_.size() == 2U) { + startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr; + endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1); + } else if (inputLayers_.size() == 3U) { + startIdsOnCpu_ = getInputValue(1); + endIdsOnCpu_ = getInputValue(2); + } + } else { + copySliceIdsToCpu(); + } + + /* + * calculate the selected row indices in a batch, and build the output + * sequence information. + */ + calSelectedRows(startIdsOnCpu_, endIdsOnCpu_); + + resetOutput(selectedRows_.size(), getSize()); + + getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); +} + +void SequenceSliceLayer::backward(const UpdateCallback& callback) { + getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/legacy/gserver/layers/SequenceToBatch.cpp similarity index 100% rename from paddle/gserver/layers/SequenceToBatch.cpp rename to paddle/legacy/gserver/layers/SequenceToBatch.cpp diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.h b/paddle/legacy/gserver/layers/SequenceToBatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7ed517937d4a015b6b11de16412cac7599f5f8b9 --- /dev/null +++ b/paddle/legacy/gserver/layers/SequenceToBatch.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" + +namespace paddle { + +/* + * This class can used to modify the matrix structure of sequence matrix into + * batch structure. + * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t] + * batch matrix: [C1_s ... C1_t | ...... | Cn_s ... Cn_t] + * Cn_s is the state for sequence s at time n. + * + * Exampel: sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}} + * s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + * batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}} + * b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + * + * Use: + * Input: seqMatrix, seqStarts(Sequence Start Positions) + * Output: batchMatrix + * 1. SequenceToBatch seq2batch; + * 2. seq2batch.resizeOrCreateBatch(seqStarts); // calculate seq2BatchIdx + * 3. 
seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix + * + */ +class SequenceToBatch { + public: + explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {} + + /* resize and calculate the batchIndex_ */ + void resizeOrCreateBatch(int batchSize, + size_t numSequences, + const int *seqStarts, + bool reversed, + bool prevBatchState = false); + + /* sequence matrix and batch matrix copy: + * seq2batch: copy(seqValue, batchValue, true); + * batch2seq: copy(seqValue, batchValue, false); + */ + void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch); + /* sequence/batch matrix add to batch/sequence matrix */ + void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch); + MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0); + + size_t getNumBatch() const { return numBatch_; } + + /* resize or create a batch matrix(batchValue_) */ + void resizeOrCreate(Matrix &seqValue); + /* copy seqValue to batchValue_ */ + void copyFromSeq(Matrix &seqValue); + /* copy batchValue_ to seqValue */ + void copyBackSeq(Matrix &seqValue); + MatrixPtr getBatchValue(int batchId, int numRows = 0); + MatrixPtr getBatchValue() { return batchValue_; } + /*tranfer preBatchOutput to batch struct*/ + void prevOutput2Batch(Matrix &src, Matrix &dst); + /*get sequence output from batch struct*/ + void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch); + + /* Copy the index from another seq2batch. */ + void shareIndexWith(const SequenceToBatch &seq2batch) { + CHECK(useGpu_ == seq2batch.useGpu_); + batchStartPositions_ = seq2batch.batchStartPositions_; + seq2BatchIdx_ = seq2batch.seq2BatchIdx_; + cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_; + numBatch_ = seq2batch.numBatch_; + } + + protected: + void sequence2BatchCopy(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, + bool seq2batch); + void sequence2BatchAdd(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, + bool seq2batch); + + IVectorPtr batchStartPositions_; + IVectorPtr seq2BatchIdx_; + IVectorPtr cpuSeq2BatchIdx_; + IVectorPtr cpuSeqIdx_; + IVectorPtr cpuSeqEndIdxInBatch_; + IVectorPtr seqIdx_; + IVectorPtr seqEndIdxInBatch_; + size_t numBatch_; + bool useGpu_; + MatrixPtr batchValue_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/legacy/gserver/layers/SliceProjection.cpp similarity index 100% rename from paddle/gserver/layers/SliceProjection.cpp rename to paddle/legacy/gserver/layers/SliceProjection.cpp diff --git a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9168fd7dda6dcdcd9e272acbf6337f1c8468e6f0 --- /dev/null +++ b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
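/* A standalone sketch of the sequence-to-batch reordering described in the
 * SequenceToBatch comment above, using the element values themselves as
 * stand-ins for state rows and visiting sequences longest-first, as in the
 * example. Plain C++/STL, not the Paddle API. */
#include <algorithm>
#include <iostream>
#include <vector>

// Batch t collects the t-th element of every sequence that is still active,
// visiting sequences in order of decreasing length.
std::vector<std::vector<int>> seqToBatch(const std::vector<std::vector<int>>& seqs) {
  std::vector<size_t> order(seqs.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::sort(order.begin(), order.end(),
            [&](size_t a, size_t b) { return seqs[a].size() > seqs[b].size(); });

  size_t maxLen = seqs[order.front()].size();
  std::vector<std::vector<int>> batches(maxLen);
  for (size_t t = 0; t < maxLen; ++t)
    for (size_t idx : order)
      if (t < seqs[idx].size()) batches[t].push_back(seqs[idx][t]);
  return batches;
}

int main() {
  // Same data as the header comment: s0 = 0 0 0 0, s1 = 1 1 1 1 1, s2 = 2 2 2.
  auto batches = seqToBatch({{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}});
  for (const auto& b : batches) {  // b0: 1 0 2 ... b3: 1 0, b4: 1
    for (int v : b) std::cout << v << " ";
    std::cout << "\n";
  }
  return 0;
}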
*/ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief A layer for applying a slope and an intercept to the input + * element-wise. + * This layer is used in NEURAL TURING MACHINE. + * @note There is no activation and weight in this layer. + * + * \f[ + * y = ax + b + * \f] + * + * Here, a is scale and b is offset, which are provided as attributes of the + * layer. + * + * The config file api is slope_intercept_layer. + */ + +class SlopeInterceptLayer : public Layer { + public: + explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(slope_intercept, SlopeInterceptLayer); + +bool SlopeInterceptLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + + return true; +} + +void SlopeInterceptLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + size_t batchSize = inV->getHeight(); + size_t size = getSize(); + + CHECK_EQ(size, inV->getWidth()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + reserveOutput(batchSize, size); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str()); + outV->mulScalar(*inV, config_.slope()); + outV->add(config_.intercept()); + } +} + +void SlopeInterceptLayer::backward(const UpdateCallback& callback) { + MatrixPtr inG = getInputGrad(0); + MatrixPtr outG = getOutputGrad(); + + if (inG) { + REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str()); + inG->add(*outG, config_.slope()); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp similarity index 100% rename from paddle/gserver/layers/SpatialPyramidPoolLayer.cpp rename to paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..6d8ed9c87889a93664f09dbaf2a84bd00b1757ad --- /dev/null +++ b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "PoolProjection.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { +/** + * @brief A layer for spatial pyramid pooling on the input image by taking + * the max, average, etc. 
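/* A small sketch of the SlopeInterceptLayer math above: forward computes
 * y = slope * x + intercept element-wise, and backward propagates
 * dL/dx += slope * dL/dy, matching the mulScalar/add calls in the layer.
 * Plain C++, hypothetical helper names. */
#include <iostream>
#include <vector>

void slopeInterceptForward(const std::vector<float>& x, float slope,
                           float intercept, std::vector<float>& y) {
  y.resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = slope * x[i] + intercept;
}

void slopeInterceptBackward(const std::vector<float>& dy, float slope,
                            std::vector<float>& dx) {
  dx.resize(dy.size(), 0.f);
  for (size_t i = 0; i < dy.size(); ++i) dx[i] += slope * dy[i];
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f}, y, dx;
  slopeInterceptForward(x, /*slope=*/2.f, /*intercept=*/0.5f, y);
  slopeInterceptBackward(/*dy=*/{1.f, 1.f, 1.f}, /*slope=*/2.f, dx);
  for (size_t i = 0; i < x.size(); ++i)
    std::cout << y[i] << " (grad " << dx[i] << ")\n";  // 2.5, 4.5, 6.5; grads all 2
  return 0;
}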
within regions, so that the result vector of + * different sized images are of the same size. + * + * The config file api is spp_layer. + */ + +class SpatialPyramidPoolLayer : public Layer { + protected: + size_t channels_; + size_t imgSizeW_; + size_t imgSizeH_; + size_t pyramidHeight_; + std::string poolType_; + + std::vector> poolProjections_; + std::vector projOutput_; + std::vector> projCol_; + + public: + explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + ProjectionConfig getConfig(size_t sizeX_, + size_t sizeY_, + size_t channels, + size_t pyamidLevel_, + std::string& poolType_); + size_t getSize(); + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f363c2ac8dd22fc8b8e1d7fca27e5beb935d42de --- /dev/null +++ b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +class SubNestedSequenceLayer : public Layer { + public: + explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + + private: + /* + * This functions generates the indices of rows in a batch according to the + * indices of selected sub-sequence in each sequence. + * + * Examples: + * selectedIndices: + * [ + * [0, 1, -1], + * [0, 1, 2], + * [0, -1, -1], + * [0, 2, 3], + * ] + * inputSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + * + * ths output is saved to private member rowIndice_; + * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27] + */ + + void calSelectedRows(const MatrixPtr selectedIndices, + const std::vector>& inputSeqInfo); + + /* + * TODO(caoying) + * In PaddePaddle, currently all matrices are real number types, + * but the second is some selected indices of the give sequence to trim + * the nested sequence, are actually filled with int types so that storing + * int types information in real number matrices is very dangerous, since + * real numbers will be convered to int types. If a user fills this matrix + * himself, invalid data may occor. + * + * if the second input of this layer is on GPU memory, copy it to CPU memory. 
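/* A standalone sketch that reproduces the calSelectedRows example documented
 * above: for each sequence, the rows of every selected sub-sequence (indices
 * >= 0, with -1 ending the selection) are appended in order. Plain C++/STL,
 * hypothetical helper name, integer indices in place of real-valued matrices. */
#include <iostream>
#include <vector>

std::vector<int> selectNestedRows(
    const std::vector<std::vector<int>>& selected,
    const std::vector<std::vector<int>>& seqInfo) {
  std::vector<int> rows;
  for (size_t i = 0; i < selected.size(); ++i)
    for (int subIdx : selected[i]) {
      if (subIdx < 0) break;  // -1 marks the end of the selection for this sequence
      for (int r = seqInfo[i][subIdx]; r < seqInfo[i][subIdx + 1]; ++r)
        rows.push_back(r);
    }
  return rows;
}

int main() {
  // The selectedIndices / inputSeqInfo example from the comment above.
  auto rows = selectNestedRows(
      {{0, 1, -1}, {0, 1, 2}, {0, -1, -1}, {0, 2, 3}},
      {{0, 3, 4}, {4, 5, 7, 10, 15}, {15, 20}, {20, 22, 23, 25, 28}});
  for (int r : rows) std::cout << r << " ";
  std::cout << "\n";  // 0 1 2 3 4 ... 27, matching the documented rowIndice_
  return 0;
}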
+ */ + MatrixPtr selIdsCpu_; + + /* + * reorganize sequenceStartPositions and subSequenceStartPositions + * into a 2d vector to facilitate the sequence selection process. + */ + std::vector> inputSeqInfoVec_; + + /* store the final selected row indices in a batch */ + IVectorPtr rowIndice_; + /* rowIndice_ and selectedRows_ actually share a same memory. */ + std::vector selectedRows_; +}; + +REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer); + +bool SubNestedSequenceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + CHECK_EQ(2U, inputLayers_.size()); + setNeedSequenceInfo(false); + return true; +} + +void SubNestedSequenceLayer::calSelectedRows( + const MatrixPtr selectedIndices, + const std::vector>& inputSeqInfo) { + selectedRows_.clear(); + + std::vector outSeqStartInfo(1, 0); + std::vector outSubSeqStartInfo(1, 0); + + size_t seqNum = selectedIndices->getHeight(); + size_t beamSize = selectedIndices->getWidth(); + for (size_t i = 0; i < seqNum; ++i) { + for (size_t j = 0; j < beamSize; ++j) { + if (selectedIndices->getElement(i, j) == -1.) break; + size_t selSubSeqIdx = selectedIndices->getElement(i, j); + CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx); + + size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] - + inputSeqInfoVec_[i][selSubSeqIdx]; + for (size_t k = 0; k < subSeqLen; ++k) + selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k); + outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen); + } + outSeqStartInfo.push_back(outSubSeqStartInfo.back()); + } + + if (useGpu_) { + rowIndice_ = IVector::create(selectedRows_.size(), useGpu_); + rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size()); + } else { + rowIndice_ = + IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_); + } + + // create the sequence information for the output. + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, outSeqStartInfo.size(), false); + output_.sequenceStartPositions->copyFrom( + outSeqStartInfo.data(), outSeqStartInfo.size(), false); + + ICpuGpuVector::resizeOrCreate( + output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false); + output_.subSequenceStartPositions->copyFrom( + outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false); +} + +void SubNestedSequenceLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& inputSeq = getInput(0); + CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer " + << "must be a nested sequence."; + const MatrixPtr selectedIndices = getInputValue(1); + CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight()); + + if (dynamic_cast(selectedIndices.get())) { + /* + * Currently, the second input for this layer is generated by + * kmax_sequence_score_layer whose output is always stored on CPU, + * or a data_layer which canbe on GPU. + * + * If the second input is on GPU, copy it to CPU memory, because this + * input always uses very few memory, and operations related to it are + * all logic control, not computations. 
+ */ + Matrix::resizeOrCreate(selIdsCpu_, + selectedIndices->getHeight(), + selectedIndices->getWidth(), + false /* trans */, + false /* useGpu */); + selIdsCpu_->copyFrom(*selectedIndices); + } else { + selIdsCpu_ = selectedIndices; + } + + Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, + inputSeq.subSequenceStartPositions, + inputSeqInfoVec_); + calSelectedRows(selIdsCpu_, inputSeqInfoVec_); + + resetOutput(selectedRows_.size(), getSize()); + getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); +} + +void SubNestedSequenceLayer::backward(const UpdateCallback& callback) { + MatrixPtr inputSeqGrad = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + + if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36796f04739054bb19d4a3ce656e248898ba4b17 --- /dev/null +++ b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp @@ -0,0 +1,226 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for taking the subsequence according to given offset and size + * Input: original sequence, offset, size + * Output: subsequence + */ + +class SubSequenceLayer : public Layer { + protected: + std::unique_ptr biases_; + MatrixPtr tmpSrc_; + MatrixPtr tmpDest_; + + public: + explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(subseq, SubSequenceLayer); + +bool SubSequenceLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + // sequene concatenation layer should have exactly 2 inputs + CHECK_EQ(3U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + tmpSrc_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + tmpDest_ = + Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); + + setNeedSequenceInfo(false); + return true; +} + +void SubSequenceLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t dim = getSize(); + + const Argument& input = getInput(0); + size_t numSequences1 = input.getNumSequences(); + auto startPositions1 = input.sequenceStartPositions->getVector(false); + + const Argument& offsetSeq = getInput(1); + size_t numSequences2 = offsetSeq.getNumSequences(); + auto 
startPositions2 = offsetSeq.sequenceStartPositions->getVector(false); + + const Argument& sizeSeq = getInput(2); + size_t numSequences3 = sizeSeq.getNumSequences(); + auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false); + + CHECK_EQ(dim, input.value->getWidth()); + + CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize()); + CHECK_EQ(numSequences1, startPositions1->getSize() - 1); + + CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize()); + CHECK_EQ(numSequences2, startPositions2->getSize() - 1); + + CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize()); + CHECK_EQ(numSequences3, startPositions3->getSize() - 1); + + CHECK_EQ(numSequences1, numSequences2); + CHECK_EQ(numSequences2, numSequences3); + + MatrixPtr inputValue = input.value; + IVectorPtr offsetValue; + IVectorPtr sizeValue; + + if (useGpu_) { + // copy to cpu + IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false); + IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false); + offsetValue->copyFrom(*offsetSeq.ids); + sizeValue->copyFrom(*sizeSeq.ids); + } else { + offsetValue = offsetSeq.ids; + sizeValue = sizeSeq.ids; + } + + CHECK_EQ(offsetValue->getSize(), numSequences1); + CHECK_EQ(sizeValue->getSize(), numSequences1); + + int* offsets = offsetValue->getData(); + int* sizes = sizeValue->getData(); + + // get total height of output + size_t height = 0; + for (size_t seqId = 0; seqId < numSequences1; seqId++) { + height += sizes[seqId]; + } + + // reset output + resetOutput(height, dim); + + MatrixPtr outputValue = getOutputValue(); + + const int* starts1 = startPositions1->getData(); + + { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str()); + + size_t offsetIn = 0; + size_t offsetOut = 0; + size_t size = 0; + for (size_t seqId = 0; seqId < numSequences1; ++seqId) { + offsetIn = starts1[seqId] + offsets[seqId]; + size = sizes[seqId]; + + outputValue->subMatrix(offsetOut, size, tmpDest_) + ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_))); + + offsetOut += size; + } + + // modify the sequenceStartPositions + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences1 + 1, false); + + int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); + int offset = 0; + for (size_t seqId = 0; seqId < numSequences1; ++seqId) { + tgtBuf[seqId] = offset; + offset += sizes[seqId]; + } + tgtBuf[numSequences1] = offset; + } + + if (biases_.get() != NULL) { + MatrixPtr outV = getOutputValue(); + outV->addBias(*(biases_->getW()), 1); + } + + /* activation */ + forwardActivation(); +} + +void SubSequenceLayer::backward(const UpdateCallback& callback) { + /* activation */ + backwardActivation(); + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } + + MatrixPtr inputGrad1 = getInputGrad(0); + MatrixPtr outputGrad = getOutputGrad(); + auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); + size_t numSequences1 = startPositions1->getSize() - 1; + const int* starts1 = startPositions1->getData(); + + const Argument& offsetSeq = getInput(1); + const Argument& sizeSeq = getInput(2); + IVectorPtr offsetValue; + IVectorPtr sizeValue; + + if (useGpu_) { + // copy to cpu + IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false); + IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), 
false); + offsetValue->copyFrom(*offsetSeq.ids); + sizeValue->copyFrom(*sizeSeq.ids); + } else { + offsetValue = offsetSeq.ids; + sizeValue = sizeSeq.ids; + } + + int* offsets = offsetValue->getData(); + int* sizes = sizeValue->getData(); + { + AsyncGpuBlock asyncGpuBlock; + REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str()); + + int offsetIn = 0; + int offsetOut = 0; + int size = 0; + for (size_t seqId = 0; seqId < numSequences1; ++seqId) { + offsetIn = starts1[seqId] + offsets[seqId]; + size = sizes[seqId]; + + inputGrad1->subMatrix(offsetIn, size, tmpDest_) + ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_))); + offsetOut += size; + } + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..410f4dd7c90e67488bc3dda6dfad551032890d65 --- /dev/null +++ b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * A layer for sum-to-one normalization, + * which is used in NEURAL TURING MACHINE. + * \f[ + * out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]} + * \f] + * where \f$in\f$ is a (batchSize x dataDim) input vector, + * and \f$out\f$ is a (batchSize x dataDim) output vector. + * + * The config file api is sum_to_one_norm_layer. 
+ */ + +class SumToOneNormLayer : public Layer { + protected: + /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$ + MatrixPtr reciprocalRowSum_; + /// dotSum = output_.grad \f$.*\f$ output_.value + MatrixPtr dotSum_; + + public: + explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer); + +bool SumToOneNormLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 1U); + + return true; +} + +void SumToOneNormLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr inV = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + size_t batchSize = inV->getHeight(); + size_t dataDim = getSize(); + + CHECK_EQ(dataDim, inV->getWidth()); + + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, dataDim); + } + + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str()); + + Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_); + inV->rowSum(*reciprocalRowSum_); + + // todo: matrix checks + CHECK_GT(reciprocalRowSum_->getMin(), 0.0); + + reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0); + + // outV = inV * reciprocalRowSum + outV->rowScale(0, *inV, *reciprocalRowSum_); + } +} + +void SumToOneNormLayer::backward(const UpdateCallback& callback) { + MatrixPtr inV = getInputValue(0); + MatrixPtr inG = getInputGrad(0); + MatrixPtr outV = getOutputValue(); + MatrixPtr outG = getOutputGrad(); + + size_t batchSize = inV->getHeight(); + + if (inG) { + REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str()); + + Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_); + + // dotSum = outG .* outV + dotSum_->zeroMem(); + dotSum_->rowDotMul(0, *outG, *outV); + + // inG += -1 * (dotSum / rowSum) + dotSum_->dotMul(*dotSum_, *reciprocalRowSum_); + inG->rowAdd(0, *inG, *dotSum_, -1.0); + // inG += outG * (1/rowSum) + inG->addRowScale(0, *outG, *reciprocalRowSum_); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..513f3df7bcaf854835ec0e500d47c23469d5aa46 --- /dev/null +++ b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
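/* A sketch of the SumToOneNormLayer math above: forward divides each element
 * by its row sum, and backward applies
 * dL/dx_j = (dL/dy_j - sum_k dL/dy_k * y_k) / sum_k x_k, which is what the
 * dotSum_/reciprocalRowSum_ code computes. Plain C++ on a single row,
 * hypothetical helper names. */
#include <iostream>
#include <vector>

void sumToOneNorm(const std::vector<double>& x, std::vector<double>& y) {
  double s = 0;
  for (double v : x) s += v;
  y.resize(x.size());
  for (size_t j = 0; j < x.size(); ++j) y[j] = x[j] / s;
}

void sumToOneNormGrad(const std::vector<double>& x, const std::vector<double>& y,
                      const std::vector<double>& dy, std::vector<double>& dx) {
  double s = 0, dot = 0;
  for (double v : x) s += v;
  for (size_t k = 0; k < y.size(); ++k) dot += dy[k] * y[k];
  dx.resize(x.size());
  for (size_t j = 0; j < x.size(); ++j) dx[j] = (dy[j] - dot) / s;
}

int main() {
  std::vector<double> x = {1, 2, 5}, y, dx;
  sumToOneNorm(x, y);                     // y = 0.125 0.25 0.625
  sumToOneNormGrad(x, y, {1, 0, 0}, dx);  // dx = (1 - 0.125)/8, -0.125/8, -0.125/8
  for (double v : y) std::cout << v << " ";
  std::cout << "| ";
  for (double v : dx) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}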
*/ + +#include "SwitchOrderLayer.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(switch_order, SwitchOrderLayer); + +bool SwitchOrderLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + auto& img_conf = config_.inputs(0).image_conf(); + size_t inD = img_conf.img_size_z(); + size_t inH = + img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(); + size_t inW = img_conf.img_size(); + size_t inC = img_conf.channels(); + inH = inH * inD; + inDims_ = TensorShape({0, inC, inH, inW}); + outDims_ = TensorShape(4); + + auto& reshape_conf = config_.reshape_conf(); + for (int i = 0; i < reshape_conf.height_axis_size(); i++) { + heightAxis_.push_back(reshape_conf.height_axis(i)); + } + for (int i = 0; i < reshape_conf.width_axis_size(); i++) { + widthAxis_.push_back(reshape_conf.width_axis(i)); + } + createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig()); + createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig()); + return true; +} + +void SwitchOrderLayer::setOutDims() { + outDims_.setDim(0, inDims_[0]); + outDims_.setDim(1, inDims_[2]); + outDims_.setDim(2, inDims_[3]); + outDims_.setDim(3, inDims_[1]); + reshapeHeight_ = 1; + for (size_t i = 0; i < heightAxis_.size(); i++) { + reshapeHeight_ *= outDims_[heightAxis_[i]]; + } + output_.setFrameHeight(reshapeHeight_); + reshapeWidth_ = 1; + for (size_t i = 0; i < widthAxis_.size(); i++) { + reshapeWidth_ *= outDims_[widthAxis_[i]]; + } + output_.setFrameWidth(reshapeWidth_); +} + +void SwitchOrderLayer::setInDims() { + MatrixPtr input = inputLayers_[0]->getOutputValue(); + size_t batchSize = input->getHeight(); + inDims_.setDim(0, batchSize); + int d = inputLayers_[0]->getOutput().getFrameDepth(); + d = (d == 0 ? 
1 : d); + int h = inputLayers_[0]->getOutput().getFrameHeight(); + if (h != 0) inDims_.setDim(2, h * d); + int w = inputLayers_[0]->getOutput().getFrameWidth(); + if (w != 0) inDims_.setDim(3, w); + int totalCount = input->getElementCnt(); + int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]); + if (channels != 0) inDims_.setDim(1, channels); +} + +void SwitchOrderLayer::forward(PassType passType) { + Layer::forward(passType); + setInDims(); + setOutDims(); + resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]); + if (heightAxis_.size() > 0) { + resetOutput(reshapeHeight_, reshapeWidth_); + } + + // switch NCHW to NHWC + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), inDims_); + outputs.addArg(*getOutputValue(), outDims_); + nchw2nhwc_[0]->calc(inputs, outputs); + forwardActivation(); +} + +void SwitchOrderLayer::backward(const UpdateCallback& callback) { + (void)callback; + backwardActivation(); + + // switch NHWC to NCHW + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outDims_); + outputs.addArg(*getInputGrad(0), inDims_, ADD_TO); + nhwc2nchw_[0]->calc(inputs, outputs); +} +} // namespace paddle diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/legacy/gserver/layers/SwitchOrderLayer.h similarity index 100% rename from paddle/gserver/layers/SwitchOrderLayer.h rename to paddle/legacy/gserver/layers/SwitchOrderLayer.h diff --git a/paddle/gserver/layers/TableProjection.cpp b/paddle/legacy/gserver/layers/TableProjection.cpp similarity index 100% rename from paddle/gserver/layers/TableProjection.cpp rename to paddle/legacy/gserver/layers/TableProjection.cpp diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/legacy/gserver/layers/TableProjection.h similarity index 100% rename from paddle/gserver/layers/TableProjection.h rename to paddle/legacy/gserver/layers/TableProjection.h diff --git a/paddle/legacy/gserver/layers/TensorLayer.cpp b/paddle/legacy/gserver/layers/TensorLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f874bce0f2bdf7ab4771e470e2e4535693ecf68 --- /dev/null +++ b/paddle/legacy/gserver/layers/TensorLayer.cpp @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
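/* A sketch of the NCHW -> NHWC reordering that SwitchOrderLayer::forward
 * performs above via its NCHW2NHWC function: element (n, c, h, w) moves from
 * offset ((n*C + c)*H + h)*W + w to ((n*H + h)*W + w)*C + c. Plain C++ on a
 * flat buffer, not the Paddle Function API. */
#include <iostream>
#include <vector>

std::vector<float> nchwToNhwc(const std::vector<float>& in,
                              int N, int C, int H, int W) {
  std::vector<float> out(in.size());
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          out[((n * H + h) * W + w) * C + c] = in[((n * C + c) * H + h) * W + w];
  return out;
}

int main() {
  // One image, 2 channels, 2x2 spatial: channel 0 holds 0..3, channel 1 holds 4..7.
  std::vector<float> nchw = {0, 1, 2, 3, 4, 5, 6, 7};
  for (float v : nchwToNhwc(nchw, 1, 2, 2, 2)) std::cout << v << " ";
  std::cout << "\n";  // 0 4 1 5 2 6 3 7
  return 0;
}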
*/ + +#include "TensorLayer.h" + +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(tensor, TensorLayer); + +bool TensorLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* initialize the weightList */ + CHECK_EQ(inputLayers_.size(), 2LU); + CHECK(parameters_[0]); + CHECK(!parameters_[1]); + + // Option the parameters + size_t height = inputLayers_[0]->getSize(); + size_t width = inputLayers_[1]->getSize(); + CHECK_EQ(width * height * getSize(), parameters_[0]->getSize()); + + for (size_t i = 0; i < getSize(); ++i) { + // create a new weight + Weight* w = new Weight(height, width, parameters_[0], i * width * height); + + // append the new weight to the list + weights_.emplace_back(w); + } + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + + return true; +} + +void TensorLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = getInputValue(0)->getHeight(); + int size = getSize(); + + { resetOutput(batchSize, size); } + + MatrixPtr outV = getOutputValue(); + /* add the bias-vector */ + if (biases_.get() != NULL) { + outV->addBias(*(biases_->getW()), 1); + } + + /* e1 * W * trans(e2) */ { + MatrixPtr input1 = getInputValue(0); + MatrixPtr input2 = getInputValue(1); + MatrixPtr tmpMat = Matrix::create(input2->getHeight(), + input2->getWidth(), + /* trans= */ false, + input2->useGpu()); + REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str()); + for (size_t i = 0; i < getSize(); ++i) { + MatrixPtr weights = weights_[i]->getW(); + tmpMat->mul(*input1, *weights, 1, 0); + outV->rowDotMul(i, *tmpMat, *input2); + } + } + + /* activation */ { forwardActivation(); } +} + +void TensorLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + bool syncFlag = hl_get_sync_flag(); + + /* Calculate the W-gradient for the current layer */ + MatrixPtr input1 = getInputValue(0); + MatrixPtr input2 = getInputValue(1); + MatrixPtr oGrad = getOutputGrad(); + MatrixPtr tmpMat = Matrix::create(input1->getHeight(), + input1->getWidth(), + /* trans= */ false, + input1->useGpu()); + + /* trans(grad * e1) * e2 */ { + REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str()); + for (size_t i = 0; i < getSize(); ++i) { + if (weights_[i]->getWGrad()) { + tmpMat->rowScale(i, *input1, *oGrad); + MatrixPtr input1_T = tmpMat->getTranspose(); + weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1); + } + } + } + + hl_set_sync_flag(false); + + /* Calculate the input layers error */ { + MatrixPtr preGrad1 = getInputGrad(0); + MatrixPtr preGrad2 = getInputGrad(1); + + REGISTER_TIMER_INFO("TensorBpMulTimer", getName().c_str()); + for (size_t i = 0; i < getSize(); ++i) { + MatrixPtr weights = weights_[i]->getW(); + + if (NULL != preGrad1) { /* (grad * e2) * trans(W) */ + tmpMat->rowScale(i, *input2, *oGrad); + MatrixPtr weights_T = weights->getTranspose(); + preGrad1->mul(*tmpMat, *weights_T, 1, 1); + } + if (NULL != preGrad2) { /* (grad * e1) * W */ + tmpMat->rowScale(i, *input1, *oGrad); + preGrad2->mul(*tmpMat, *weights, 1, 1); + } + } + 
} + hl_set_sync_flag(syncFlag); + parameters_[0]->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/TensorLayer.h b/paddle/legacy/gserver/layers/TensorLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fc491a7c9f223cf0dff6d878c6ec27a858c7c7b7 --- /dev/null +++ b/paddle/legacy/gserver/layers/TensorLayer.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +/** + * @brief TensorLayer takes two input vectors. + * \f[ + * y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1 + * \f] + * + * - \f$x_{1}\f$: the first input, size is M. + * - \f$x_{2}\f$: the second input, size is N. + * - y: output, size is K. + * - \f$y_{i}\f$: i-th element of y. + * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N]. + * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$. + * + * The config file api is tensor_layer. + */ + +class TensorLayer : public Layer { + protected: + WeightList weights_; + std::unique_ptr biases_; + + public: + explicit TensorLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + Weight& getWeight(int idx) { return *weights_[idx]; } + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/TransLayer.cpp b/paddle/legacy/gserver/layers/TransLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fd1d435ea5f53785c9c416146c642637adc786a8 --- /dev/null +++ b/paddle/legacy/gserver/layers/TransLayer.cpp @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
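/* A sketch of the TensorLayer formula above, y_i = x1 * W_i * x2^T, computed
 * directly as a double sum for a toy size (M = 2, N = 3, K = 2). Plain C++
 * with nested std::vector in place of Paddle matrices. */
#include <iostream>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;  // row-major, dims M x N

// y_i = sum_m sum_n x1[m] * W_i[m][n] * x2[n] for each weight slice W_i.
Vec tensorForward(const Vec& x1, const Vec& x2, const std::vector<Mat>& W) {
  Vec y(W.size(), 0.0);
  for (size_t i = 0; i < W.size(); ++i)
    for (size_t m = 0; m < x1.size(); ++m)
      for (size_t n = 0; n < x2.size(); ++n)
        y[i] += x1[m] * W[i][m][n] * x2[n];
  return y;
}

int main() {
  Vec x1 = {1, 2};        // M = 2
  Vec x2 = {3, 4, 5};     // N = 3
  std::vector<Mat> W = {  // K = 2 slices, each 2 x 3
      {{1, 0, 0}, {0, 1, 0}},
      {{0, 1, 0}, {0, 0, 1}}};
  for (double v : tensorForward(x1, x2, W)) std::cout << v << " ";
  std::cout << "\n";  // 11 14
  return 0;
}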
*/ + +#include "TransLayer.h" +#include "paddle/legacy/utils/Logging.h" +namespace paddle { + +REGISTER_LAYER(trans, TransLayer); + +bool TransLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* the size of inputs for trans-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + return true; +} + +void TransLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + MatrixPtr input = getInputValue(0); + int height = input->getHeight(); + int width = input->getWidth(); + + resizeOutput(width, height); + + MatrixPtr outV = getOutputValue(); + + /* outV's memory has been allocated, so memAlloc = false */ + input->transpose(outV, false); + if (getInputGrad(0)) { + zeroGrad(); + } +} + +void TransLayer::backward(const UpdateCallback& callback) { + (void)callback; + + MatrixPtr outputGrad = getOutputGrad(); + if (outputGrad == NULL) { + return; + } + MatrixPtr preGrad = getInputGrad(0); + if (preGrad) { + MatrixPtr transGrad = Matrix::create(preGrad->getHeight(), + preGrad->getWidth(), + /* trans= */ false, + preGrad->useGpu()); + outputGrad->transpose(transGrad, false); + preGrad->add(*transGrad); + } +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/TransLayer.h b/paddle/legacy/gserver/layers/TransLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..0a6b13933f83f30a07ed63d722dbb612c64edae7 --- /dev/null +++ b/paddle/legacy/gserver/layers/TransLayer.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" + +namespace paddle { +/** + * A layer for transposing a minibatch matrix. + * \f[ + y = x^\mathrm{T} + * \f] + * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. + * + * The config file api is trans_layer. + */ +class TransLayer : public Layer { + public: + explicit TransLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c8533dc7d78ec4fd3629e29e6c1c3e73c6acdc17 --- /dev/null +++ b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Projection.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * @brief TransposedFullMatrixProjection performs full matrix multiplication: + * out.row[i] += in.row[i] * weight.transpose + * + * The config file api is trans_full_matrix_projection. + */ +class TransposedFullMatrixProjection : public Projection { + public: + TransposedFullMatrixProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGPu); + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + + protected: + std::unique_ptr weight_; +}; + +REGISTER_PROJECTION(trans_fc, TransposedFullMatrixProjection); + +TransposedFullMatrixProjection::TransposedFullMatrixProjection( + const ProjectionConfig& config, ParameterPtr parameter, bool useGpu) + : Projection(config, parameter, useGpu) { + weight_.reset( + new Weight(config.output_size(), config.input_size(), parameter)); +} + +void TransposedFullMatrixProjection::forward() { + REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); + out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1); +} + +void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) { + bool syncFlag = hl_get_sync_flag(); + + /* Calculate the W-gradient for the current layer */ + if (weight_->getWGrad()) { + REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); + weight_->getWGrad()->mul( + *(out_->grad->getTranspose()), *(in_->value), 1, 1); + } + + // If callback does not change value, backprop error asynchronously so that + // we can do the callback concurrently. + // This is still a little bit dangerous since theoretically for + // SyncMultiGpuMachine it is possible that the value copyback can still + // happen at the same time as the error backprop where the value is being + // used. + hl_set_sync_flag(false); + + /* Calculate the input layers error */ + if (in_->grad) { + REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); + in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1); + } + + hl_set_sync_flag(syncFlag); + parameter_->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/UpsampleLayer.cpp b/paddle/legacy/gserver/layers/UpsampleLayer.cpp similarity index 100% rename from paddle/gserver/layers/UpsampleLayer.cpp rename to paddle/legacy/gserver/layers/UpsampleLayer.cpp diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.h b/paddle/legacy/gserver/layers/UpsampleLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe5938244c81ab25c66083cc1ad63ba15618aa1 --- /dev/null +++ b/paddle/legacy/gserver/layers/UpsampleLayer.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
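/* A sketch of the TransposedFullMatrixProjection math above for a single input
 * row: forward y = x * W^T, backward dW = dy^T * x and dx = dy * W, mirroring
 * the three mul calls in the projection. Plain C++ with nested std::vector,
 * hypothetical helper names. */
#include <iostream>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;  // W is outSize x inSize, row-major

// Forward: y_j = sum_i x_i * W[j][i]   (i.e. y = x * W^T).
Vec transFcForward(const Vec& x, const Mat& W) {
  Vec y(W.size(), 0.0);
  for (size_t j = 0; j < W.size(); ++j)
    for (size_t i = 0; i < x.size(); ++i) y[j] += x[i] * W[j][i];
  return y;
}

// Backward: dW[j][i] += dy_j * x_i and dx_i += sum_j dy_j * W[j][i].
void transFcBackward(const Vec& x, const Vec& dy, const Mat& W, Mat& dW, Vec& dx) {
  dW.assign(W.size(), Vec(x.size(), 0.0));
  dx.assign(x.size(), 0.0);
  for (size_t j = 0; j < W.size(); ++j)
    for (size_t i = 0; i < x.size(); ++i) {
      dW[j][i] += dy[j] * x[i];
      dx[i] += dy[j] * W[j][i];
    }
}

int main() {
  Vec x = {1, 2, 3};
  Mat W = {{1, 0, 1}, {0, 1, 0}};  // outSize = 2, inSize = 3
  Vec y = transFcForward(x, W);    // y = {4, 2}
  Mat dW;
  Vec dx;
  transFcBackward(x, /*dy=*/{1, 1}, W, dW, dx);
  std::cout << y[0] << " " << y[1] << " | "
            << dx[0] << " " << dx[1] << " " << dx[2] << "\n";  // 4 2 | 1 1 1
  return 0;
}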
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Stat.h" + +namespace paddle { + +/** + * This layer transpose the pooling process. + * It takes two input, the first input is the input data, and + * the second is the mask data from the max-pool-with-mask layer. + * + */ + +class UpsampleLayer : public Layer { + public: + explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {} + ~UpsampleLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + + size_t getOutputSize(); + + protected: + size_t scale_, scaleY_; + size_t upsampleSize_, upsampleSizeY_; + size_t padOutX_, padOutY_; + size_t imgSize_, imgSizeY_; + size_t channels_; +}; + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ValidationLayer.cpp b/paddle/legacy/gserver/layers/ValidationLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9956fd2ed41464eae096911620e160f5ecd89da3 --- /dev/null +++ b/paddle/legacy/gserver/layers/ValidationLayer.cpp @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "ValidationLayer.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +bool ValidationLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return Layer::init(layerMap, parameterMap); +} + +void ValidationLayer::forward(PassType passType) { + Layer::forward(passType); + + MatrixPtr output = getInputValue(*getOutputLayer()); + CHECK(output); + IVectorPtr label = getInputLabel(*getLabelLayer()); + CHECK(label); + validationImp(output, label); +} + +void ValidationLayer::backward(const UpdateCallback& callback) { + (void)callback; +} + +bool AucValidation::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + bool ret = ValidationLayer::init(layerMap, parameterMap); + EvaluatorConfig config; + config.set_name(getName()); + config.set_type("last-column-auc"); + config.add_input_layers(inputLayers_[0]->getName()); + config.add_input_layers(inputLayers_[1]->getName()); + if (3 == inputLayers_.size()) { + config.add_input_layers(inputLayers_[2]->getName()); + } + evaluator_.reset(Evaluator::create(config)); + passBegin_ = false; + return ret; +} + +void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) { + if (!passBegin_) { + passBegin_ = true; + evaluator_->start(); + } + + bool supportWeight = (3 == inputLayers_.size()) ? true : false; + MatrixPtr weight = supportWeight ? 
getInputValue(*inputLayers_[2]) : nullptr; + if (dynamic_cast(output.get())) { + size_t height = output->getHeight(); + size_t width = output->getWidth(); + Matrix::resizeOrCreate(cpuOutput_, + height, + width, + /* trans=*/false, + /* useGpu=*/false); + cpuOutput_->copyFrom(*output); + IVector::resizeOrCreate(cpuLabel_, height, false); + cpuLabel_->copyFrom(*label); + + if (supportWeight) { + Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false); + cpuWeight_->copyFrom(*weight); + } + + output = cpuOutput_; + label = cpuLabel_; + weight = cpuWeight_; + } + + for (size_t i = 0; i < output->getHeight(); i++) { + float y1 = output->getData()[i * output->getWidth() + 1]; + int* labels = label->getData(); + predictArray_.push_back(PredictionResult(y1, labels[i])); + } + std::vector arguments; + if (3 == inputLayers_.size()) { + arguments.resize(3); + arguments[2].value = weight; + } else { + arguments.resize(2); + } + arguments[0].value = output; + arguments[1].ids = label; + evaluator_->evalImp(arguments); +} + +void AucValidation::onPassEnd() { + if (!FLAGS_predict_file.empty()) { + std::ofstream fs(FLAGS_predict_file); + CHECK(fs) << "Fail to open " << FLAGS_predict_file; + for (auto& res : predictArray_) { + fs << res.out << " " << res.label << std::endl; + } + } + + evaluator_->finish(); + LOG(INFO) << *evaluator_; + passBegin_ = false; + predictArray_.clear(); +} + +bool PnpairValidation::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + bool ret = ValidationLayer::init(layerMap, parameterMap); + if (!ret) return ret; + CHECK_GE(inputLayers_.size(), 3UL); + CHECK_LE(inputLayers_.size(), 4UL); + EvaluatorConfig config; + config.set_name(getName()); + config.set_type("pnpair"); + config.add_input_layers(inputLayers_[0]->getName()); + config.add_input_layers(inputLayers_[1]->getName()); + config.add_input_layers(inputLayers_[2]->getName()); + if (4 == inputLayers_.size()) { + config.add_input_layers(inputLayers_[3]->getName()); + } + evaluator_.reset(Evaluator::create(config)); + passBegin_ = false; + return true; +} + +void PnpairValidation::validationImp(MatrixPtr output, IVectorPtr label) { + if (!passBegin_) { + passBegin_ = true; + evaluator_->start(); + } + MatrixPtr weight = + (4 == inputLayers_.size()) ? getInputValue(*inputLayers_[3]) : nullptr; + IVectorPtr info = getInputLabel(*getInfoLayer()); + std::vector arguments; + if (4 == inputLayers_.size()) { + arguments.resize(4); + arguments[3].value = weight; + } else { + arguments.resize(3); + } + arguments[0].value = output; + arguments[1].ids = label; + arguments[2].ids = info; + evaluator_->evalImp(arguments); +} + +void PnpairValidation::onPassEnd() { + if (!FLAGS_predict_file.empty()) { + (dynamic_cast(evaluator_.get()))->printPredictResults(); + } + evaluator_->finish(); + LOG(INFO) << *evaluator_; + passBegin_ = false; +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/layers/ValidationLayer.h b/paddle/legacy/gserver/layers/ValidationLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..fbc94e8ef570e2eec1d3737aca97bbf91c1392b2 --- /dev/null +++ b/paddle/legacy/gserver/layers/ValidationLayer.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "Layer.h" +#include "paddle/legacy/gserver/evaluators/Evaluator.h" + +DECLARE_int32(trainer_id); + +namespace paddle { + +class ValidationLayer : public Layer { + public: + explicit ValidationLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + LayerPtr getOutputLayer() { return inputLayers_[0]; } + + LayerPtr getLabelLayer() { return inputLayers_[1]; } + + LayerPtr getInfoLayer() { + assert(inputLayers_.size() > 2); + return inputLayers_[2]; + } + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback = nullptr) override; + + virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0; + + void onPassEnd() override = 0; +}; + +/* + * AucValidation + */ +class AucValidation : public ValidationLayer { + public: + explicit AucValidation(const LayerConfig& config) + : ValidationLayer(config), + cpuOutput_(nullptr), + cpuLabel_(nullptr), + cpuWeight_(nullptr) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void validationImp(MatrixPtr outputValue, IVectorPtr label) override; + + void onPassEnd() override; + + struct PredictionResult { + PredictionResult(real __out, int __label) : out(__out), label(__label) {} + real out; + int label; + }; + std::vector predictArray_; + + private: + bool passBegin_; + std::unique_ptr evaluator_; + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; + MatrixPtr cpuWeight_; +}; + +/* + * positive-negative pair rate Validation + */ +class PnpairValidation : public ValidationLayer { + public: + explicit PnpairValidation(const LayerConfig& config) + : ValidationLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void validationImp(MatrixPtr outputValue, IVectorPtr label) override; + + void onPassEnd() override; + + private: + bool passBegin_; + std::unique_ptr evaluator_; +}; + +typedef std::shared_ptr ValidationLayerPtr; +} // namespace paddle diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/legacy/gserver/layers/WarpCTCLayer.cpp similarity index 100% rename from paddle/gserver/layers/WarpCTCLayer.cpp rename to paddle/legacy/gserver/layers/WarpCTCLayer.cpp diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/legacy/gserver/layers/WarpCTCLayer.h similarity index 100% rename from paddle/gserver/layers/WarpCTCLayer.h rename to paddle/legacy/gserver/layers/WarpCTCLayer.h diff --git a/paddle/gserver/tests/.gitignore b/paddle/legacy/gserver/tests/.gitignore similarity index 100% rename from paddle/gserver/tests/.gitignore rename to paddle/legacy/gserver/tests/.gitignore diff --git a/paddle/legacy/gserver/tests/CMakeLists.txt b/paddle/legacy/gserver/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..93ddf5aa233017d4f5139a8add6c69ef3a4682b4 --- /dev/null +++ b/paddle/legacy/gserver/tests/CMakeLists.txt @@ -0,0 +1,103 @@ +# gserver pacakge unittests +add_simple_unittest(test_LinearChainCRF) +add_simple_unittest(test_RecurrentLayer) + +if(NOT 
MOBILE_INFERENCE) + add_simple_unittest(test_MultinomialSampler) +endif() + +function(gserver_test TARGET) + add_unittest_without_exec(${TARGET} + ${TARGET}.cpp + LayerGradUtil.cpp) + add_test(NAME ${TARGET} + COMMAND ${TARGET}) +endfunction() + +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf + COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR} +) +add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf) + +gserver_test(test_LayerGrad) +gserver_test(test_CRFLayerGrad) +gserver_test(test_CrossEntropyOverBeamGrad) +gserver_test(test_SeqSliceLayerGrad) +gserver_test(test_ActivationGrad) +gserver_test(test_ConvTrans) +gserver_test(test_PriorBox) +gserver_test(test_DetectionOutput) +gserver_test(test_ConvUnify) +gserver_test(test_BatchNorm) +gserver_test(test_KmaxSeqScore) +gserver_test(test_Expand) +gserver_test(test_MaxPoolingWithMaskOutput) +gserver_test(test_Upsample) + +set(PYTHON_PATH + ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/gserver/tests) +function(gserver_test_with_python TARGET) + add_unittest_without_exec(${TARGET} ${TARGET}.cpp) + add_test(NAME ${TARGET} + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) +endfunction() + +gserver_test_with_python(test_PyDataProvider2) +if(WITH_PYTHON) + gserver_test_with_python(test_PyDataProvider) +endif() +if(NOT MOBILE_INFERENCE) + gserver_test_with_python(test_CompareTwoNets) + # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it. + gserver_test_with_python(test_RecurrentGradientMachine) +endif() + +########## test_MKLDNN layers and activations ########## +if(WITH_MKLDNN) + add_unittest_without_exec(test_MKLDNN + test_MKLDNN.cpp + MKLDNNTester.cpp + LayerGradUtil.cpp) + add_test(NAME test_MKLDNN + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) +endif() + +############### test_WarpCTCLayer ####################### +if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) + add_unittest_without_exec(test_WarpCTCLayer + test_WarpCTCLayer.cpp) + add_test(NAME test_WarpCTCLayer + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) +endif() + +if(NOT MOBILE_INFERENCE) + ################## test_Evaluator ############# + add_unittest(test_Evaluator + test_Evaluator.cpp) + + ########### test_NetworkCompare ############### + add_unittest_without_exec(test_NetworkCompare + test_NetworkCompare.cpp) + if(WITH_GPU) + set(use_gpu true) + else() + set(use_gpu false) + endif() + add_test(NAME test_NetworkCompare + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle) + + ############ test_CompareSparse ################ + add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) + if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) + endif() +endif() diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/legacy/gserver/tests/LayerGradUtil.cpp similarity index 100% rename from paddle/gserver/tests/LayerGradUtil.cpp rename to paddle/legacy/gserver/tests/LayerGradUtil.cpp diff --git 
a/paddle/legacy/gserver/tests/LayerGradUtil.h b/paddle/legacy/gserver/tests/LayerGradUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..941989a1da49d215b9ed4af72e732d6a62fd225d --- /dev/null +++ b/paddle/legacy/gserver/tests/LayerGradUtil.h @@ -0,0 +1,329 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" + +#include "paddle/testing/TestUtil.h" +using namespace std; // NOLINT + +namespace paddle { +enum InputType { + INPUT_DATA, // dense vector + INPUT_LABEL, // id + INPUT_DATA_TARGET, // dense vector, but no gradient + INPUT_SEQUENCE_DATA, + INPUT_HASSUB_SEQUENCE_DATA, // sequence has sub-sequence + INPUT_SEQUENCE_MDIM_DATA, + INPUT_SEQUENCE_LABEL, + INPUT_SPARSE_NON_VALUE_DATA, + INPUT_SPARSE_FLOAT_VALUE_DATA, + INPUT_DENSE_DIM_DATA, // using sequence length to init dense data + INPUT_SELF_DEFINE_DATA, // support customizing for input value +}; + +struct ParaSparse { + bool sparse; + string format; + // if equalNnzPerSample is set true, + // every row of the sparse matrix in a format of CSR has a same + // number of nnz values. Currently, this flag is only used for + // selective_fc layer + bool equalNnzPerSample; + ParaSparse(const string& formatIn = "") { // NOLINT + if (formatIn == "") { + sparse = false; + } else { + sparse = true; + } + equalNnzPerSample = false; + } + ParaSparse(const string& formatIn, bool equalNnz) { + format = formatIn; + sparse = true; + equalNnzPerSample = equalNnz; + } +}; + +struct InputDef { + InputType inputType; + string name; + size_t dim; + size_t paraSize; + ParaSparse sparse; + bool isStatic; + std::vector labelInitValue; + std::vector labelSeqStartPositions; + std::vector labelSubSeqStartPositions; + std::vector ids; + MatrixPtr selfDefinedData; + + InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) { + inputType = type; + name = nameIn; + dim = dimIn; + paraSize = sizeIn; + sparse = {""}; + isStatic = false; + } + + InputDef(InputType type, + string nameIn, + MatrixPtr selfDefinedData, + std::vector selfDefinedSeqStartPos = {}, + std::vector selfDefinedSubSeqStartPos = {}) + : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), + selfDefinedData(selfDefinedData) { + inputType = type; + name = nameIn; + dim = 0; + sparse = {""}; + paraSize = 0; + isStatic = false; + } + + InputDef(InputType type, + string nameIn, + const std::vector& ids, + const std::vector& selfDefinedSeqStartPos = {}, + const std::vector& selfDefinedSubSeqStartPos = {}) + : labelSeqStartPositions(selfDefinedSeqStartPos), + labelSubSeqStartPositions(selfDefinedSubSeqStartPos), + ids(ids) { + selfDefinedData = nullptr; + inputType = type; + name = nameIn; + dim = 0; + sparse = {""}; + paraSize = 0; + isStatic = false; + } + + InputDef(InputType type, + string nameIn, + size_t dimIn, + size_t sizeIn, + const std::vector& labelInitValue, + const 
std::vector& labelSeqStartPositions) + : labelInitValue(labelInitValue), + labelSeqStartPositions(labelSeqStartPositions) { + inputType = type; + name = nameIn; + dim = dimIn; + paraSize = sizeIn; + sparse = {""}; + isStatic = false; + } + + InputDef(InputType type, + string nameIn, + size_t dimIn, + size_t sizeIn, + ParaSparse sparseIn) { + inputType = type; + name = nameIn; + dim = dimIn; + paraSize = sizeIn; + sparse = sparseIn; + } +}; + +struct TestConfig { + LayerConfig layerConfig; + std::vector inputDefs; + size_t biasSize; + real paramInitialMean; + real paramInitialStd; + bool testAccumulate; + bool testState; + bool staticBias; + bool testBatchState; + TestConfig() + : biasSize(0), + paramInitialMean(0.0), + paramInitialStd(1.0), + testAccumulate(true), + testState(false), + staticBias(false), + testBatchState(false) {} +}; + +real getCostSum(ParameterPtr& parameter, + CpuVector& cpuPara, + LayerPtr& testLayer, + MatrixPtr weights = nullptr); + +real getDiffAndPrint(real newCost1, + real newCost2, + real callbackCount, + char fill, + string testLayerName, + string name, + real step, + real delta); + +/** + * @brief verify that sequentially running forward() one timestamp at one time + * has same result as running forward() with one whole sequence + * + * @param testLayer[in/out] testLayer + * @param dataLayers[in/out] dataLayers + * @param datas[in/out] data of dataLayers + */ +void testState(LayerPtr testLayer, + vector& dataLayers, + vector& datas); + +/** + * @brief verify that sequentially running forward() with short sequences one + * time has same result as running forward() with long sequences. + * + * @param testLayer[in/out] testLayer + * @param dataLayers[in/out] dataLayers + * @param datas[in/out] data of dataLayers + */ +void testBatchState(LayerPtr testLayer, + vector& dataLayers, + vector& datas); + +/** + * @brief Generate a perturbation so that it is roughly aligned with the + * gradient direction. This is to make sure that change along this + * direction will make cost increase (or decrease) in a meaningful + * way so that the finite difference can be used to approximate the + * directional dirivative well. 
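+ *        Intuitively, to first order cost(w + p) - cost(w) is approximately
+ *        sum_i(oldGrad[i] * p[i]), so a perturbation p roughly aligned with
+ *        the gradient keeps this first-order term away from zero.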
+ * + * @param oldGrad[in] input gradient + * newGrad[out] output gradient + * dim dimension of oldGrad/newGrad + * + * @return sum_i(oldGrad[i] * newGrad[i]) + */ +double genPerturbation(const real* oldGrad, real* newGrad, size_t dim); + +void initWeight(MatrixPtr& weights); + +void initBatchState(LayerPtr dataLayer, + LayerPtr testLayer, + LayerStatePtr state, + bool useGpu); + +/** + * @brief initialize the dataLayer by its inputType + * + * @param testConf[in] test config + * dataLayers[out] dataLayers + * datas[out] initialized data of dataLayers + * layerMap[out] layerMap + */ +void initDataLayer(TestConfig testConf, + std::vector* dataLayers, + vector* datas, + LayerMap* layerMap, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu); + +/** + * @brief initialize the parameter of testLayer + * + * @param testConf[in/out] test config + * layerMap[out] layerMap + * parameters[out] parameters of testLayer + * testLayer[out] testLayer + */ +void initTestLayer(TestConfig testConf, + LayerMap* layerMap, + std::vector* parameters, + LayerPtr* testLayer); + +/** + * @brief Test whether the layer's forward calculation is stable by adding + * perturbation to its parameters + * + * @param testConf[in] test config + * weights[in] weights of testLayer + * state[in] state of testLayer + * cost[in] input cost + * callbackCount[in] number of done callback + * maxDiff[in/out] max of all previous diff + * testLayer[in/out] testLayer + * parameters[in/out] parameters of testLayer + */ +void testPerturbParameter(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, + std::vector* parameters); + +/** + * @brief Test whether the layer's forward calculation is stable by adding + * perturbation to its input layers + * + * @param testConf[in] test config + * weights[in] weights of testLayer + * state[in] state of testLayer + * cost[in] input cost + * callbackCount[in] number of done callback + * maxDiff[in/out] max of all previous diff + * testLayer[in/out] testLayer + * dataLayers[in/out] dataLayers + */ +void testPerturbInput(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, + std::vector dataLayers); + +void testLayerGradKernel(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight = false, + float epsilon = 0.02); + +void testLayerGrad(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight = false, + float epsilon = 0.02); + +void testProjectionGrad(ProjectionConfig conf, + InputType inputType, + size_t parameterSize, + size_t batchSize, + bool useGpu, + bool testState = false, + int biasSize = 0, + bool sharedBias = false); + +void testOperatorGrad(TestConfig& config, + OperatorConfig& operatorConf, + size_t batchSize, + bool useGpu, + bool testState = false); + +} // namespace paddle diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.cpp b/paddle/legacy/gserver/tests/MKLDNNTester.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b550ba9c72d85830dbf12485a6a645a6b5360026 --- /dev/null +++ b/paddle/legacy/gserver/tests/MKLDNNTester.cpp @@ -0,0 +1,580 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNTester.h" +#include "paddle/legacy/gserver/layers/MKLDNNBase.h" +#include "paddle/legacy/gserver/layers/MKLDNNLayer.h" +#include "paddle/legacy/trainer/Trainer.h" + +namespace paddle { + +// init data layer and test layer of both dnn and reference +void MKLDNNTester::reset(const TestConfig& dnn, + const TestConfig& ref, + size_t batchSize) { + const bool trans = false; + const bool useGpu = false; + + // clear + configs_.clear(); + layerNames_.clear(); + dataLayers_.clear(); + datas_.clear(); + layerMaps_.clear(); + parameters_.clear(); + testLayers_.clear(); + + // resize + configs_.resize(NUM); + layerNames_.resize(NUM); + dataLayers_.resize(NUM); + datas_.resize(NUM); + layerMaps_.resize(NUM); + parameters_.resize(NUM); + testLayers_.resize(NUM); + + // reset configs and layer names + configs_[DNN] = dnn; + configs_[REF] = ref; + layerNames_[DNN] = "mkldnn"; // the first is mkldnn layer + layerNames_[REF] = "reference"; // second is reference layer + + // reset others + for (size_t i = 0; i < NUM; ++i) { + configs_[i].layerConfig.set_name(layerNames_[i]); + initDataLayer(configs_[i], + &(dataLayers_[i]), + &(datas_[i]), + &(layerMaps_[i]), + layerNames_[i], + batchSize, + trans, + useGpu); + initTestLayer( + configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i])); + } + refLayer_ = testLayers_[REF]; + dnnLayer_ = testLayers_[DNN]; + EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size()); + EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); + setInputImgSize(); + + // for comparison with Paddle reference results, + // need manually add cpu device output for test + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); + if (dnnLayer) { + dnnLayer->addOutputArgument(CPU_DEVICE); + } +} + +void MKLDNNTester::setInputImgSize() { + for (size_t n = 0; n < dataLayers_.size(); ++n) { + for (size_t i = 0; i < dataLayers_[n].size(); ++i) { + // TODO(TJ): fix me when concat and elewise ready + dataLayers_[n][i]->getOutput().setFrameHeight(ih_); + dataLayers_[n][i]->getOutput().setFrameWidth(iw_); + } + } +} + +// init randome parameters of ref, and copy to mkldnn +void MKLDNNTester::randomWgtDatas() { + EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size()); + const bool isBN = refLayer_->getType() == "batch_norm"; + for (size_t i = 0; i < parameters_[REF].size(); ++i) { + const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); + const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE); + parameters_[REF][i]->randomize(); + if (isBN && i == 2) { + // this param is moving average in batch norm, which must larger than 0 + real offset = fabs(refValue->getMin()) + 1.0; + refValue->add(offset); + } + dnnValue->copyFrom(*refValue); + + VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName(); + printVector(dnnValue); + } +} + +// random botdata of ref layer and copy same to mkldnn +void MKLDNNTester::randomBotDatas() { + CHECK_EQ(dataLayers_.size(), NUM); + for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { + dataLayers_[REF][i]->getOutputValue()->randomizeUniform(); + 
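+    // copy the same random input into the MKLDNN branch below, so both the
+    // reference path and the MKLDNN path are fed identical data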
dataLayers_[DNN][i]->getOutputValue()->copyFrom( + *(dataLayers_[REF][i]->getOutputValue())); + VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i; + printMatrix(dataLayers_[REF][i]->getOutputValue()); + } +} + +void MKLDNNTester::randomTopDiffs() { + refLayer_->getOutputGrad()->randomizeUniform(); + dnnLayer_->getOutput(CPU_DEVICE) + .grad->copyFrom(*(refLayer_->getOutputGrad())); + VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad"; + printMatrix(refLayer_->getOutputGrad()); +} + +void MKLDNNTester::checkForward() { + VLOG(MKLDNN_TESTS) << "Check Forward"; + printTopDatas(); + double delta = + compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue()); + EXPECT_LE(fabs(delta), eps_); +} + +void MKLDNNTester::checkBackwardData() { + VLOG(MKLDNN_TESTS) << "Check Backward Data"; + const bool isBN = refLayer_->getType() == "batch_norm"; + for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) { + const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad(); + const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad(); + VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i; + printMatrix(dnnDiff); + VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i; + printMatrix(refDiff); + + double delta = compareMatrix(refDiff, dnnDiff); + EXPECT_LE(fabs(delta), eps_); + if (isBN) { + // the other two inputs in batch norm are for moving mean and var + // do not have grad to compare + break; + } + } +} + +void MKLDNNTester::checkBackwardWgts() { + VLOG(MKLDNN_TESTS) << "Check Backward Weight"; + CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); + vector dnnWgts; // used to temply save mkldnn weights + saveWgt(parameters_[DNN], dnnWgts); + + MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(dnnLayer_); + if (dnnLayer) { + dnnLayer->convertWeightsToPaddle(); + } + for (size_t i = 0; i < parameters_[DNN].size(); ++i) { + const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE); + const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE); + VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value" + << parameters_[DNN][i]->getName(); + printVector(dnn); + VLOG(MKLDNN_ALL) << "Reference Result: weight value " + << parameters_[REF][i]->getName(); + printVector(ref); + + double delta = compareVector(ref, dnn); + EXPECT_LE(fabs(delta), eps_); + } + + VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre"; + restoreWgt(dnnWgts, parameters_[DNN]); +} + +void MKLDNNTester::saveWgt(const vector& from, + vector& to) { + const bool useGpu = false; + to.resize(from.size()); + for (size_t i = 0; i < to.size(); ++i) { + const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE); + to[i] = Vector::create(wgt->getSize(), useGpu); + to[i]->copyFrom(*wgt); + } +} + +void MKLDNNTester::restoreWgt(const vector& from, + vector& to) { + CHECK_EQ(from.size(), to.size()); + for (size_t i = 0; i < from.size(); ++i) { + const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE); + wgt->copyFrom(*from[i]); + } +} + +// clear parameters grad +void MKLDNNTester::clearWgtDiffs(size_t id) { + CHECK_LE(id, parameters_.size()); + for (size_t n = 0; n < parameters_.size(); ++n) { + if (id == n || id == parameters_.size()) { + for (size_t i = 0; i < parameters_[n].size(); ++i) { + const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT); + if (grad) { + grad->zeroMem(); + } + } + } + } +} + +void MKLDNNTester::clearBotDiffs(size_t id) { + CHECK_LE(id, dataLayers_.size()); + for (size_t n = 0; n < dataLayers_.size(); ++n) { + if (id == n || id == dataLayers_.size()) 
{ + // clear inputs layers of this specific layer + for (size_t i = 0; i < dataLayers_[n].size(); ++i) { + dataLayers_[n][i]->getOutputGrad()->zeroMem(); + } + } + } +} + +void MKLDNNTester::clearTopDatas(size_t id) { + CHECK_LE(id, testLayers_.size()); + for (size_t i = 0; i < testLayers_.size(); ++i) { + if (id == i || id == testLayers_.size()) { + testLayers_[i]->getOutputValue()->zeroMem(); + } + } +} + +void MKLDNNTester::printTopDatas() { + if (!log_) { + return; + } + + for (int n = 0; n < NUM; ++n) { + VLOG(MKLDNN_ALL) << testLayers_[n]->getType() + << " Forward Result: OutputValue"; + printMatrix(testLayers_[n]->getOutputValue()); + } +} + +void MKLDNNTester::printMatrix(const MatrixPtr& m) { + if (!log_) { + return; + } + + std::ostringstream ostr; + m->print(ostr); + VLOG(MKLDNN_ALL) << std::endl << ostr.str(); +} + +void MKLDNNTester::printVector(const VectorPtr& v) { + if (!log_) { + return; + } + + std::ostringstream ostr; + v->print(ostr, v->getSize()); + VLOG(MKLDNN_ALL) << std::endl << ostr.str(); +} + +double MKLDNNTester::getDelta(const real* refer, + const real* value, + size_t len, + const float failRate, + const float thres) { + double delta = 0, sum = 0; + int failCnt = 0; + const double eps = 1e-5; + double maxRatio = 0; + for (size_t i = 0; i < len; ++i) { + double ref = fabs(refer[i]); + double val = fabs(value[i]); + double diff = fabs(refer[i] - value[i]); + delta += diff; + sum += ref; + if (ref < eps && val < eps) { // both values are very small + continue; + } + double ratio = diff / ref; + if (ratio > thres) { + maxRatio = std::max(maxRatio, ratio); + failCnt++; + } + } + EXPECT_FALSE(std::isinf(sum)); + EXPECT_FALSE(std::isnan(sum)); + EXPECT_FALSE(std::isnan(delta)); + VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len + << ", delta: " << delta / sum << ", failCnt:" << failCnt; + double res = sum > eps ? delta / sum : eps; + return (failCnt / (float)len) > failRate ? 
maxRatio : res; +} + +double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) { + CHECK_EQ(m1->getElementCnt(), m2->getElementCnt()); + return getDelta(m1->getData(), m2->getData(), m1->getElementCnt()); +} + +double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) { + CHECK_EQ(v1->getSize(), v2->getSize()); + return getDelta(v1->getData(), v2->getData(), v1->getSize()); +} + +void MKLDNNTester::runOnce() { + // test forward + randomBotDatas(); + dnnLayer_->forward(passType_); + refLayer_->forward(passType_); + checkForward(); + + if (passType_ == PASS_TEST) { + return; + } + + // test backward + // simple updater + UpdateCallback updateCallback = [](Parameter* para) { + auto& grad = para->getBuf(PARAMETER_GRADIENT); + auto& value = para->getBuf(PARAMETER_VALUE); + real lr = 1e-2; + value->add(*grad, lr); + grad->zeroMem(); + }; + randomTopDiffs(); + dnnLayer_->backward(updateCallback); + refLayer_->backward(updateCallback); + checkBackwardData(); + checkBackwardWgts(); + + // clear buffers + // ref code will addto the diff, dnn code will writeto it + // and clearTopDatas(REF) should be coverd by ref layers + clearBotDiffs(REF); + clearWgtDiffs(REF); + // it is necessary to clear bottom diffs when only activation is dnn type + if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) { + clearBotDiffs(DNN); + } +} + +void MKLDNNTester::run(const TestConfig& dnn, + const TestConfig& ref, + size_t batchSize, + size_t inputImgH, + size_t inputImgW, + PassType passType, + bool printDetails, + size_t iter, + float epsilon) { + CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 || + dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) + << "should be MKLDNN layer or MKLDNN activation"; + if (dnn.layerConfig.type() == ref.layerConfig.type()) { + VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " + << dnn.layerConfig.active_type() << " vs " + << ref.layerConfig.active_type(); + } else { + VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " + << dnn.layerConfig.type() << " vs " + << ref.layerConfig.type(); + } + + ih_ = inputImgH; + iw_ = inputImgW; + passType_ = passType; + log_ = printDetails; + iter_ = iter; + eps_ = epsilon; + + // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight + reset(dnn, ref, batchSize); + randomWgtDatas(); + clearWgtDiffs(); + clearBotDiffs(); + for (size_t i = 0; i < iter_; ++i) { + VLOG(MKLDNN_TESTS) << "Check Iteration " << i; + runOnce(); + } + + if (parameters_[DNN].empty()) { + // has no paramters + return; + } + + // After run some iterations, the mkldnn weight has been stored in dnnLayer + // and we can also get the mkldnn weight parameter header format. + // Weight parameter should always be index 0 (and bias index 1). 
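+  // If the two header formats differ, the code below saves the current
+  // weights, resets both layers, marks the MKLDNN parameter with the MKLDNN
+  // header format, restores the saved weights and reruns the same iterations,
+  // so initialization from MKLDNN-format weights is checked as well.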
+ // TODO(TJ): should also consider mean and var format when batchnorm ready + int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat(); + int refWgtFmt = parameters_[REF][0]->getHeaderFormat(); + if (dnnWgtFmt == refWgtFmt) { + // weight format are equal, so no need check more + return; + } + + // then save the weights and restart again + vector dnnWgts, refWgts; + CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size()); + saveWgt(parameters_[DNN], dnnWgts); + saveWgt(parameters_[REF], refWgts); + + // restart again with dnn weight format + reset(dnn, ref, batchSize); + // TODO(TJ): should also considerate mean and var format when batchnorm ready + parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt); + + // restore wgt + restoreWgt(dnnWgts, parameters_[DNN]); + restoreWgt(refWgts, parameters_[REF]); + clearWgtDiffs(); + clearBotDiffs(); + + for (size_t i = 0; i < iter_; ++i) { + VLOG(MKLDNN_TESTS) << "Check Iteration " << i; + runOnce(); + } +} + +void MKLDNNTester::initArgument(DataIn& data, + const std::string& configPath, + const size_t iter) { + TrainerConfigHelper config(configPath); + size_t batchSize = config.getOptConfig().batch_size(); + data.inArgs.resize(iter); + data.outGrads.resize(iter); + data.paraValues.clear(); + for (const auto& layer_name : config.getModelConfig().input_layer_names()) { + auto layer_config = std::find_if(config.getModelConfig().layers().begin(), + config.getModelConfig().layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.getModelConfig().layers().end()); + + size_t layerSize = layer_config->size(); + for (size_t i = 0; i < iter; ++i) { + Argument arg; + arg.value = Matrix::create(batchSize, layerSize, false, false); + arg.grad = Matrix::create(batchSize, layerSize, false, false); + arg.value->randomizeUniform(); + arg.value->add(-0.5); + arg.value->sigmoid(*arg.value); + arg.grad->zeroMem(); + arg.ids = VectorT::create(batchSize, false); + arg.ids->rand(layerSize); + generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); + data.inArgs[i].push_back(arg); + } + } + + for (const auto& layer_name : config.getModelConfig().output_layer_names()) { + auto layer_config = std::find_if(config.getModelConfig().layers().begin(), + config.getModelConfig().layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.getModelConfig().layers().end()); + + size_t layerSize = layer_config->size(); + for (size_t i = 0; i < iter; ++i) { + MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false); + grad->randomizeUniform(); + data.outGrads[i].push_back(grad); + } + } + + for (const auto& para_config : config.getModelConfig().parameters()) { + VectorPtr value = Vector::create(para_config.size(), false); + value->randnorm(0, 2); + data.paraValues.push_back(value); + } +} + +void MKLDNNTester::getOutResult(const std::string& configPath, + DataIn& in, + DataOut& out, + bool use_mkldnn, + size_t iter) { + FLAGS_use_gpu = false; + FLAGS_use_mkldnn = use_mkldnn; + *ThreadLocalRand::getSeed() = 1; + srand(1); + + Trainer trainer; + auto config = std::make_shared(configPath); + trainer.init(config, false); + auto gradientMachine = trainer.getGradientMachine(); + std::vector parameters = gradientMachine->getParameters(); + for (size_t i = 0; i < in.paraValues.size(); i++) { + parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); + } + UpdateCallback simpleUpdate = [](Parameter* para) 
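+  // plain SGD-style update, the same "simple updater" used in runOnce()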
{ + auto& grad = para->getBuf(PARAMETER_GRADIENT); + auto& value = para->getBuf(PARAMETER_VALUE); + real lr = 1e-2; + value->add(*grad, lr); + grad->zeroMem(); + }; + + vector outArgs; + gradientMachine->start(); + out.outValues.clear(); + out.paraValues.clear(); + for (size_t i = 0; i < iter; ++i) { + VLOG(MKLDNN_TESTS) << "runing iteration " << i; + gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN); + // save forward result + for (size_t k = 0; k < outArgs.size(); k++) { + const MatrixPtr& src = outArgs[k].value; + MatrixPtr dst = + Matrix::create(src->getHeight(), src->getWidth(), false, false); + if (typeid(*src) == typeid(MKLDNNMatrix)) { + MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast(src); + dnnSrc->copyTo(*dst); + } else { + dst->copyFrom(*src); + } + out.outValues.push_back(dst); + } + + // random backward input + for (size_t k = 0; k < outArgs.size(); k++) { + outArgs[k].grad->copyFrom(*in.outGrads[i][k]); + } + gradientMachine->backward(simpleUpdate); + } + gradientMachine->finish(); + + // save param value + for (size_t i = 0; i < in.paraValues.size(); i++) { + VectorPtr val = Vector::create( + parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false); + val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); + out.paraValues.push_back(val); + } +} + +void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) { + CHECK_EQ(ref.outValues.size(), dnn.outValues.size()); + CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size()); + for (size_t i = 0; i < ref.outValues.size(); i++) { + VLOG(MKLDNN_TESTS) << "compare value index: " << i; + EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps); + } + for (size_t i = 0; i < ref.paraValues.size(); i++) { + VLOG(MKLDNN_TESTS) << "compare param index: " << i; + EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps); + } +} + +void MKLDNNTester::runNetTest(const std::string& configPath, + size_t iter, + float eps) { + DataIn in; + initArgument(in, configPath, iter); + DataOut outCpu, outDnn; + VLOG(MKLDNN_TESTS) << "runing cpu network"; + getOutResult(configPath, in, outCpu, false, iter); + VLOG(MKLDNN_TESTS) << "runing mkldnn network"; + getOutResult(configPath, in, outDnn, true, iter); + + compareResult(outCpu, outDnn, eps); +} + +} // namespace paddle diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.h b/paddle/legacy/gserver/tests/MKLDNNTester.h new file mode 100644 index 0000000000000000000000000000000000000000..086846ce537857eb76ffca492246677eb7982a42 --- /dev/null +++ b/paddle/legacy/gserver/tests/MKLDNNTester.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "LayerGradUtil.h" +#include "paddle/legacy/gserver/layers/MKLDNNBase.h" +#include "paddle/legacy/gserver/layers/MKLDNNLayer.h" + +namespace paddle { + +/** + * @brief test the functionality of MKLDNNlayers and MKLDNNActivations + * refer to paddle original function + */ +class MKLDNNTester { + enum { + DNN = 0, // MKLDNN layer + REF = 1, // Reference layer + NUM = 2, // Number of total + }; + + struct DataIn { + std::vector> inArgs; + std::vector> outGrads; + std::vector paraValues; + }; + + struct DataOut { + std::vector outValues; + std::vector paraValues; + }; + + protected: + std::vector configs_; + vector layerNames_; + vector> dataLayers_; + vector> datas_; + vector layerMaps_; + vector> parameters_; + vector testLayers_; + LayerPtr refLayer_, dnnLayer_; + + /// run some iterations, all the result should pass + size_t iter_; + /// whether to print out the details + bool log_; + /// epsilon + float eps_; + /// input image size, default 1 + size_t ih_, iw_; + /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass) + PassType passType_; + + public: + explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) { + iter_ = iter; + eps_ = epsilon; + log_ = false; + passType_ = PASS_TRAIN; + } + + ~MKLDNNTester() {} + + public: + void run(const TestConfig& dnn, + const TestConfig& ref, + size_t batchSize, + size_t inputImgH = 1, + size_t inputImgW = 1, + PassType passType = PASS_TRAIN, + bool printDetails = false, + size_t iter = 3, + float epsilon = 1e-4); + static void runNetTest(const std::string& configPath, + size_t iter = 2, + float eps = 1e-4); + static void initArgument(DataIn& data, + const std::string& configPath, + size_t iter = 2); + static void getOutResult(const std::string& configPath, + DataIn& in, + DataOut& out, + bool use_mkldnn, + size_t iter = 2); + + private: + void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize); + void setInputImgSize(); + void runOnce(); + + void randomWgtDatas(); + void randomBotDatas(); + void randomTopDiffs(); + + void checkForward(); + void checkBackwardData(); + void checkBackwardWgts(); + + // clear specific layer, clear all when id equals NUM + void clearWgtDiffs(size_t id = NUM); + void clearBotDiffs(size_t id = NUM); + void clearTopDatas(size_t id = NUM); + + void printTopDatas(); + void printMatrix(const MatrixPtr& m); + void printVector(const VectorPtr& v); + + void saveWgt(const vector& from, vector& to); + void restoreWgt(const vector& from, vector& to); + + static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2); + static double compareVector(const VectorPtr& v1, const VectorPtr& v2); + static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4); + + /** + * Get delta percent + * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points + * return the max(diff/ref) + * else return sum(abs(diff)) / sum(abs(ref)) + * The return value should be smaller than eps when passing. 
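+ * For example, refer = {1, 2, 4} and value = {1.0001, 2, 4} give
+ * sum(abs(diff)) / sum(abs(ref)) = 0.0001 / 7 (about 1.4e-5) and no single
+ * point exceeds thres, so roughly 1.4e-5 is returned, well below eps.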
+ */ + static double getDelta(const real* refer, + const real* value, + size_t len, + const float failRate = 1e-3, + const float thres = 0.1); +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/legacy/gserver/tests/Sequence/dummy.list similarity index 100% rename from paddle/gserver/tests/Sequence/dummy.list rename to paddle/legacy/gserver/tests/Sequence/dummy.list diff --git a/paddle/gserver/tests/Sequence/tour_dict_phrase.dict b/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict similarity index 100% rename from paddle/gserver/tests/Sequence/tour_dict_phrase.dict rename to paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict diff --git a/paddle/gserver/tests/Sequence/tour_train_wdseg b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg similarity index 100% rename from paddle/gserver/tests/Sequence/tour_train_wdseg rename to paddle/legacy/gserver/tests/Sequence/tour_train_wdseg diff --git a/paddle/gserver/tests/Sequence/tour_train_wdseg.nest b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest similarity index 100% rename from paddle/gserver/tests/Sequence/tour_train_wdseg.nest rename to paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest diff --git a/paddle/legacy/gserver/tests/Sequence/train.list b/paddle/legacy/gserver/tests/Sequence/train.list new file mode 100644 index 0000000000000000000000000000000000000000..1109a2449252cb9bfcb10ece4cf9a96e655e5a25 --- /dev/null +++ b/paddle/legacy/gserver/tests/Sequence/train.list @@ -0,0 +1 @@ +legacy/gserver/tests/Sequence/tour_train_wdseg diff --git a/paddle/legacy/gserver/tests/Sequence/train.list.nest b/paddle/legacy/gserver/tests/Sequence/train.list.nest new file mode 100644 index 0000000000000000000000000000000000000000..a67df35024f456d517899f37272b0f74d822f03d --- /dev/null +++ b/paddle/legacy/gserver/tests/Sequence/train.list.nest @@ -0,0 +1 @@ +legacy/gserver/tests/Sequence/tour_train_wdseg.nest diff --git a/paddle/gserver/tests/__init__.py b/paddle/legacy/gserver/tests/__init__.py similarity index 100% rename from paddle/gserver/tests/__init__.py rename to paddle/legacy/gserver/tests/__init__.py diff --git a/paddle/gserver/tests/concat_dotmul_a.conf b/paddle/legacy/gserver/tests/concat_dotmul_a.conf similarity index 100% rename from paddle/gserver/tests/concat_dotmul_a.conf rename to paddle/legacy/gserver/tests/concat_dotmul_a.conf diff --git a/paddle/gserver/tests/concat_dotmul_b.conf b/paddle/legacy/gserver/tests/concat_dotmul_b.conf similarity index 100% rename from paddle/gserver/tests/concat_dotmul_b.conf rename to paddle/legacy/gserver/tests/concat_dotmul_b.conf diff --git a/paddle/gserver/tests/concat_fullmatrix_a.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf similarity index 100% rename from paddle/gserver/tests/concat_fullmatrix_a.conf rename to paddle/legacy/gserver/tests/concat_fullmatrix_a.conf diff --git a/paddle/gserver/tests/concat_fullmatrix_b.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf similarity index 100% rename from paddle/gserver/tests/concat_fullmatrix_b.conf rename to paddle/legacy/gserver/tests/concat_fullmatrix_b.conf diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/legacy/gserver/tests/concat_slice_a.conf similarity index 100% rename from paddle/gserver/tests/concat_slice_a.conf rename to paddle/legacy/gserver/tests/concat_slice_a.conf diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/legacy/gserver/tests/concat_slice_b.conf similarity index 100% rename from 
paddle/gserver/tests/concat_slice_b.conf rename to paddle/legacy/gserver/tests/concat_slice_b.conf diff --git a/paddle/gserver/tests/concat_table_a.conf b/paddle/legacy/gserver/tests/concat_table_a.conf similarity index 100% rename from paddle/gserver/tests/concat_table_a.conf rename to paddle/legacy/gserver/tests/concat_table_a.conf diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/legacy/gserver/tests/concat_table_b.conf similarity index 100% rename from paddle/gserver/tests/concat_table_b.conf rename to paddle/legacy/gserver/tests/concat_table_b.conf diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/legacy/gserver/tests/img_conv_a.conf similarity index 100% rename from paddle/gserver/tests/img_conv_a.conf rename to paddle/legacy/gserver/tests/img_conv_a.conf diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/legacy/gserver/tests/img_conv_b.conf similarity index 100% rename from paddle/gserver/tests/img_conv_b.conf rename to paddle/legacy/gserver/tests/img_conv_b.conf diff --git a/paddle/gserver/tests/img_conv_c.conf b/paddle/legacy/gserver/tests/img_conv_c.conf similarity index 100% rename from paddle/gserver/tests/img_conv_c.conf rename to paddle/legacy/gserver/tests/img_conv_c.conf diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/legacy/gserver/tests/img_conv_cudnn.py similarity index 100% rename from paddle/gserver/tests/img_conv_cudnn.py rename to paddle/legacy/gserver/tests/img_conv_cudnn.py diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/legacy/gserver/tests/img_conv_exconv.py similarity index 100% rename from paddle/gserver/tests/img_conv_exconv.py rename to paddle/legacy/gserver/tests/img_conv_exconv.py diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/legacy/gserver/tests/img_pool_a.conf similarity index 100% rename from paddle/gserver/tests/img_pool_a.conf rename to paddle/legacy/gserver/tests/img_pool_a.conf diff --git a/paddle/gserver/tests/img_pool_b.conf b/paddle/legacy/gserver/tests/img_pool_b.conf similarity index 100% rename from paddle/gserver/tests/img_pool_b.conf rename to paddle/legacy/gserver/tests/img_pool_b.conf diff --git a/paddle/gserver/tests/mkldnn_branch_net.conf b/paddle/legacy/gserver/tests/mkldnn_branch_net.conf similarity index 100% rename from paddle/gserver/tests/mkldnn_branch_net.conf rename to paddle/legacy/gserver/tests/mkldnn_branch_net.conf diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/legacy/gserver/tests/mkldnn_simple_net.conf similarity index 100% rename from paddle/gserver/tests/mkldnn_simple_net.conf rename to paddle/legacy/gserver/tests/mkldnn_simple_net.conf diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/legacy/gserver/tests/pyDataProvider.py similarity index 100% rename from paddle/gserver/tests/pyDataProvider.py rename to paddle/legacy/gserver/tests/pyDataProvider.py diff --git a/paddle/gserver/tests/pyDataProvider/pyDataProviderList b/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList similarity index 100% rename from paddle/gserver/tests/pyDataProvider/pyDataProviderList rename to paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList diff --git a/paddle/gserver/tests/pyDataProvider/trainer.conf b/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf similarity index 100% rename from paddle/gserver/tests/pyDataProvider/trainer.conf rename to paddle/legacy/gserver/tests/pyDataProvider/trainer.conf diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/legacy/gserver/tests/rnn_data_provider.py similarity index 
100% rename from paddle/gserver/tests/rnn_data_provider.py rename to paddle/legacy/gserver/tests/rnn_data_provider.py diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/legacy/gserver/tests/sequenceGen.py similarity index 100% rename from paddle/gserver/tests/sequenceGen.py rename to paddle/legacy/gserver/tests/sequenceGen.py diff --git a/paddle/legacy/gserver/tests/sequence_layer_group.conf b/paddle/legacy/gserver/tests/sequence_layer_group.conf new file mode 100644 index 0000000000000000000000000000000000000000..ad1b61d5821fd20135e61bb95abdea16d27a6a9a --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_layer_group.conf @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +with mixed_layer(size=hidden_dim * 4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory_group( + input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + +lstm_last = last_seq(input=lstm) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_lstm.conf b/paddle/legacy/gserver/tests/sequence_lstm.conf new file mode 100644 index 0000000000000000000000000000000000000000..6ab70e70713f31de31b5cd544cf132e7d0af0f2f --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_lstm.conf @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 +sparse_update = get_config_arg("sparse_update", bool, False) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, + size=word_dim, + param_attr=ParamAttr(sparse_update=sparse_update)) + +with mixed_layer(size=hidden_dim * 4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory( + input=lstm_input, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + +lstm_last = last_seq(input=lstm) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf new file mode 100644 index 0000000000000000000000000000000000000000..75c36b118979760e034f81e3127a748651f53347 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
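+# This config builds a nested (two-level) sequence model: an LSTM
+# recurrent_group runs over each sub-sequence, then last_seq, expand_layer and
+# average pooling move the result between the sequence levels before the final
+# softmax classifier.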
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list.nest', + test_list=None, + module='sequenceGen', + obj='process2', + args={"dict_file": dict_file}) + +settings(batch_size=2) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 256 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb_group = embedding_layer(input=data, size=word_dim) + + +# (lstm_input + lstm) is equal to lstmemory +def lstm_group(lstm_group_input): + with mixed_layer(size=hidden_dim * 4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) + + lstm_output = lstmemory_group( + input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation()) + return lstm_output + + +lstm_nest_group = recurrent_group( + input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group") +# hasSubseq ->(seqlastins) seq +lstm_last = last_seq( + input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE) + +# seq ->(expand) hasSubseq +lstm_expand = expand_layer( + input=lstm_last, + expand_as=emb_group, + expand_level=ExpandLevel.FROM_SEQUENCE) + +# hasSubseq ->(average) seq +lstm_average = pooling_layer( + input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.TO_SEQUENCE) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=lstm_average) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..bc3b22c2a946a62c7a9d3163d3863a090d63539c --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf @@ -0,0 +1,74 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_subseq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# This hierachical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn.conf + +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + return out + + inner_rnn_output = recurrent_group( + step=inner_step, + name="inner", + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + # "return last" won't work, because recurrent_group only support the input + # sequence type is same as return sequence type. + return inner_rnn_output + +out = recurrent_group( + name="outer", + step=outer_step, + input=SubsequenceInput(emb)) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf new file mode 100644 index 0000000000000000000000000000000000000000..165ab229897d32ce2cae1d483b3ffd81392a355a --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf @@ -0,0 +1,76 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_subseq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# This hierachical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn.conf + +def outer_step(wid, x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y, wid): + z = embedding_layer(input=wid, size=word_dim) + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer(input=[y, z, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + return out + + inner_rnn_output = recurrent_group( + step=inner_step, + name="inner", + input=[x, wid]) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + # "return last" should also work. But currently RecurrentGradientMachine + # does not handle it, and will report error: In hierachical RNN, all out + # links should be from sequences now. + return inner_rnn_output + +out = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(data), SubsequenceInput(emb)]) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..9a48b7f25c454b492d20e807f09f6d788af44681 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py @@ -0,0 +1,96 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_unequalength_subseq') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 2 + +speaker1 = data_layer(name="word1", size=dict_dim) +speaker2 = data_layer(name="word2", size=dict_dim) + +emb1 = embedding_layer(input=speaker1, size=word_dim) +emb2 = embedding_layer(input=speaker2, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_multi_unequalength_inputs.conf +def outer_step(x1, x2): + index = [0] + + def inner_step(ipt): + index[0] += 1 + i = index[0] + outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim) + + def inner_step_impl(y): + inner_mem = memory( + name="inner_rnn_state_" + y.name, + size=hidden_dim, + boot_layer=outer_mem) + out = fc_layer( + input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state_' + y.name) + return out + + encoder = recurrent_group( + step=inner_step_impl, name='inner_%d' % i, input=ipt) + last = last_seq(name="outer_rnn_state_%d" % i, input=encoder) + return encoder, last + + encoder1, sentence_last_state1 = inner_step(ipt=x1) + encoder2, sentence_last_state2 = inner_step(ipt=x2) + + encoder1_expand = expand_layer( + input=sentence_last_state1, expand_as=encoder2) + + return [encoder1_expand, encoder2] + + +encoder1_rep, encoder2_rep = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], + targetInlink=emb2) + +encoder1_last = last_seq(input=encoder1_rep) +encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) +context = mixed_layer( + input=[ + identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep) + ], + size=hidden_dim) + +rep = last_seq(input=context) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs( + classification_cost( + input=prob, label=data_layer( + name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_recurrent.py b/paddle/legacy/gserver/tests/sequence_recurrent.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c6a7935c28838fb12fc6e44d99dd59636bf7dd --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_recurrent.py @@ -0,0 +1,55 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent_group.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + +recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation()) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_recurrent_group.py b/paddle/legacy/gserver/tests/sequence_recurrent_group.py new file mode 100644 index 0000000000000000000000000000000000000000..b4638bd9075ff5cdd4a5ed1bc0e0d133f9a9ab86 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_recurrent_group.py @@ -0,0 +1,68 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from paddle.trainer_config_helpers import * + +######################## data source ################################ +dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict' +dict_file = dict() +for line_count, line in enumerate(open(dict_path, "r")): + dict_file[line.strip()] = line_count + +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) + +settings(batch_size=5) +######################## network configure ################################ +dict_dim = len(open(dict_path, 'r').readlines()) +word_dim = 128 +hidden_dim = 128 +label_dim = 3 + +# This config is designed to be equivalent with sequence_recurrent.py + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer( + input=data, size=word_dim, param_attr=ParamAttr(name="emb")) + + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + with mixed_layer( + name="rnn_state", + size=hidden_dim, + bias_attr=False, + act=SoftmaxActivation()) as out: + out += identity_projection(input=y) + out += full_matrix_projection( + input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__")) + return out + + +recurrent = recurrent_group(name="rnn", step=step, input=emb) + +recurrent_last = last_seq(input=recurrent) + +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: + output += full_matrix_projection(input=recurrent_last) + +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/legacy/gserver/tests/sequence_rnn.conf b/paddle/legacy/gserver/tests/sequence_rnn.conf new file mode 100644 index 0000000000000000000000000000000000000000..3133595c9ce4c25683c06d326a5ebe9d2bf13077 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn.conf @@ -0,0 +1,57 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=emb) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..921cef04dda0da396a79592b09d7a7e7177462d5 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py @@ -0,0 +1,84 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_mixed') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 2 +hidden_dim = 2 +label_dim = 2 + +data1 = data_layer(name="word1", size=dict_dim) +data2 = data_layer(name="word2", size=dict_dim) +label = data_layer(name="label", size=label_dim) + +encoding = embedding_layer(input=data2, size=word_dim) + +subseq = embedding_layer(input=data1, size=word_dim) +seq = embedding_layer(input=data2, size=word_dim) +nonseq = embedding_layer(input=label, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_mixed_inputs.conf +def outer_step(subseq, seq, nonseq, encoding): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + + def inner_step(subseq, seq, nonseq): + inner_mem = memory( + name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) + + out = fc_layer( + input=[subseq, seq, nonseq, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state') + return out + + decoder = recurrent_group( + step=inner_step, name='inner', input=[subseq, seq, nonseq]) + last = last_seq(name="outer_rnn_state", input=decoder) + context = simple_attention( + encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) + return context + + +out = recurrent_group( + name="outer", + step=outer_step, + input=[ + subseq, expand_layer( + seq, expand_as=subseq, + expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer( + nonseq, + expand_as=subseq, + expand_level=ExpandLevel.FROM_NO_SEQUENCE), + StaticInput(encoding) + ]) + +rep = last_seq(input=out) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..c7bcaf6c4b21272e1c95d6de7e69e4558d52b9c6 --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_mixed') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 2 +hidden_dim = 2 +label_dim = 2 + +data1 = data_layer(name="word1", size=dict_dim) +data2 = data_layer(name="word2", size=dict_dim) +label = data_layer(name="label", size=label_dim) + +encoding = embedding_layer(input=data2, size=word_dim) + + +# This hierarchical RNN is designed to be equivalent to the simple RNN in +# sequence_rnn_matched_inputs.conf +def outer_step(subseq, seq, nonseq, encoding): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + + def inner_step(data1, data2, label): + inner_mem = memory( + name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem) + + subseq = embedding_layer(input=data1, size=word_dim) + seq = embedding_layer(input=data2, size=word_dim) + nonseq = embedding_layer(input=label, size=word_dim) + + print_layer(input=[data1, seq, label, inner_mem]) + out = fc_layer( + input=[subseq, seq, nonseq, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='inner_rnn_state') + return out + + decoder = recurrent_group( + step=inner_step, name='inner', + input=[subseq, StaticInput(seq), nonseq]) + last = last_seq(name="outer_rnn_state", input=decoder) + context = simple_attention( + encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last) + return context + + +out = recurrent_group( + name="outer", + step=outer_step, + input=[data1, data2, StaticInput(label), StaticInput(encoding)]) + +rep = last_seq(input=out) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs(classification_cost(input=prob, label=label)) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf new file mode 100644 index 0000000000000000000000000000000000000000..bf4be779a23e081cef33ce2b2734ad91cfa33c0d --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf @@ -0,0 +1,58 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_seq') + + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 3 + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +def step(y, wid): + z = embedding_layer(input=wid, size=word_dim) + mem = memory(name="rnn_state", size=hidden_dim) + out = fc_layer(input=[y, z, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + return out + +out = recurrent_group( + name="rnn", + step=step, + input=[emb, data]) + +rep = last_seq(input=out) +prob = fc_layer(size=label_dim, + input=rep, + act=SoftmaxActivation(), + bias_attr=True) + +outputs(classification_cost(input=prob, + label=data_layer(name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..3612b49c2279874a378d4aaed81623f7d0d2ea2f --- /dev/null +++ b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py @@ -0,0 +1,76 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +######################## data source ################################ +define_py_data_sources2( + train_list='legacy/gserver/tests/Sequence/dummy.list', + test_list=None, + module='rnn_data_provider', + obj='process_unequalength_seq') + +settings(batch_size=2, learning_rate=0.01) +######################## network configure ################################ +dict_dim = 10 +word_dim = 8 +hidden_dim = 8 +label_dim = 2 + +speaker1 = data_layer(name="word1", size=dict_dim) +speaker2 = data_layer(name="word2", size=dict_dim) + +emb1 = embedding_layer(input=speaker1, size=word_dim) +emb2 = embedding_layer(input=speaker2, size=word_dim) + +# This hierachical RNN is designed to be equivalent to the RNN in +# sequence_nest_rnn_multi_unequalength_inputs.conf + + +def step(x1, x2): + def calrnn(y): + mem = memory(name='rnn_state_' + y.name, size=hidden_dim) + out = fc_layer( + input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name='rnn_state_' + y.name) + return out + + encoder1 = calrnn(x1) + encoder2 = calrnn(x2) + return [encoder1, encoder2] + + +encoder1_rep, encoder2_rep = recurrent_group( + name="stepout", step=step, input=[emb1, emb2]) + +encoder1_last = last_seq(input=encoder1_rep) +encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep) +context = mixed_layer( + input=[ + identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep) + ], + size=hidden_dim) + +rep = last_seq(input=context) +prob = fc_layer( + size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True) + +outputs( + classification_cost( + input=prob, label=data_layer( + name="label", size=label_dim))) diff --git a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f468d229a889e02bf79baa29576c638acbd8eb08 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_bool(thread_local_rand_use_global_seed); + +void testActivation(const string& act) { + LOG(INFO) << "test activation: " << act; + size_t size = 10; + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type(act); + config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + act + "_activation", + 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } +} + +TEST(Activation, activation) { + auto types = ActivationFunction::getAllRegisteredTypes(); + std::set excluded{"sequence_softmax"}; + for (auto type : types) { + if (excluded.count(type)) continue; + testActivation(type); + } +} + +void testSequenceSoftmaxAct(bool hasSubseq) { + LOG(INFO) << "test activation: sequence softmax"; + + const size_t size = 1; + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sequence_softmax"); + config.inputDefs.push_back( + {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 1, + 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "sequence_softmax", + 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } +} + +TEST(SequenceSoftmaxActivation, activation) { + for (auto hasSubseq : {false, true}) { + LOG(INFO) << "hasSubseq = " << hasSubseq; + testSequenceSoftmaxAct(hasSubseq); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_BatchNorm.cpp b/paddle/legacy/gserver/tests/test_BatchNorm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e21fa16074406645be88eeb454d743531f825041 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_BatchNorm.cpp @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/legacy/cuda/include/hl_batch_norm.h" +#include "paddle/legacy/math/tests/TensorCheck.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); + +// Test that the batchNormLayer can be followed by a ConvLayer +TEST(Layer, batchNorm) { + FLAGS_use_gpu = false; + TestConfig configBN; + const int CHANNELS = 6272; + const int IMG_SIZE = 1; + configBN.layerConfig.set_type("batch_norm"); + configBN.layerConfig.set_name("bn"); + configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); + configBN.layerConfig.set_active_type("relu"); + configBN.biasSize = CHANNELS; + configBN.inputDefs.push_back({INPUT_DATA, + "layer_0", + /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, + /* paraSize= */ CHANNELS}); + + configBN.inputDefs.push_back( + {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); + configBN.inputDefs.back().isStatic = true; + configBN.inputDefs.push_back( + {INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); + configBN.inputDefs.back().isStatic = true; + + LayerInputConfig* input = configBN.layerConfig.add_inputs(); + configBN.layerConfig.add_inputs(); + configBN.layerConfig.add_inputs(); + + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(CHANNELS); + img_conf->set_img_size(IMG_SIZE); + + // Setting up conv-layer config + TestConfig config; + config.biasSize = 64; + config.layerConfig.set_type("exconv"); + config.layerConfig.set_num_filters(64); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800}); + input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(5); + conv->set_filter_size_y(5); + conv->set_channels(128); + conv->set_padding(1); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(7); + conv->set_output_x(3); + config.layerConfig.set_size(conv->output_x() * conv->output_x() * + config.layerConfig.num_filters()); + config.layerConfig.set_name("conv"); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer(configBN, + &dataLayers, + &datas, + &layerMap, + "batch_norm", + 100, + false, + false); + // test layer initialize + std::vector parameters; + LayerPtr bnLayer; + initTestLayer(configBN, &layerMap, ¶meters, &bnLayer); + + std::vector parameters2; + LayerPtr convLayer; + initTestLayer(config, &layerMap, ¶meters2, &convLayer); + + bnLayer->forward(PASS_GC); + convLayer->forward(PASS_GC); + + CHECK_EQ(static_cast(convLayer->getOutputValue()->getHeight()), 100); + CHECK_EQ(static_cast(convLayer->getOutputValue()->getWidth()), 576); +} + +#ifdef PADDLE_WITH_CUDA +void batchNormInference(int n, int c, int h, int w) { + MatrixPtr input = std::make_shared(n, c * h * w); + MatrixPtr cudnnOut = std::make_shared(n, c * h * w); + MatrixPtr cudaOut = std::make_shared(n, c * h * w); + MatrixPtr cudnnCheck = std::make_shared(n, c * h * w); + MatrixPtr cudaCheck = std::make_shared(n, c * h * w); + 
input->randomizeUniform(); + cudnnOut->zeroMem(); + cudaOut->zeroMem(); + + MatrixPtr scale = std::make_shared(1, c); + scale->randomizeUniform(); + MatrixPtr bias = std::make_shared(1, c); + bias->randomizeUniform(); + + MatrixPtr movingMean = std::make_shared(1, c); + movingMean->randomizeUniform(); + + MatrixPtr movingVar = std::make_shared(1, c); + movingVar->randomizeUniform(); + movingVar->clip(0.01, 50); + + hl_tensor_descriptor ioDesc; + hl_tensor_descriptor bnDesc; + hl_create_tensor_descriptor(&ioDesc); + hl_create_tensor_descriptor(&bnDesc); + hl_tensor_reshape(ioDesc, n, c, h, w); + hl_tensor_reshape(bnDesc, 1, c, 1, 1); + + double EPS = 1E-5; + hl_batch_norm_forward_inference(ioDesc, + input->getData(), + ioDesc, + cudnnOut->getData(), + bnDesc, + scale->getData(), + bias->getData(), + movingMean->getData(), + movingVar->getData(), + EPS); + + hl_batch_norm_cuda_inference(input->getData(), + cudaOut->getData(), + scale->getData(), + bias->getData(), + movingMean->getData(), + movingVar->getData(), + EPS, + n, + c, + h, + w); + + cudnnCheck->copyFrom(*cudnnOut); + cudaCheck->copyFrom(*cudaOut); + autotest::TensorCheckErr(*cudnnCheck, *cudaCheck); + + hl_destroy_tensor_descriptor(ioDesc); + hl_destroy_tensor_descriptor(bnDesc); +} + +TEST(BatchNorm, Inference) { + batchNormInference(33, 267, 1, 1); + batchNormInference(19, 105, 4, 4); +} +#endif + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1dafd1de4d82f1d306626090c30cf9203fa24dd0 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/LinearChainCRF.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +static inline bool getNextSequence(std::vector& seq, int numClasses) { + for (auto& v : seq) { + if (++v < numClasses) { + return true; + } + v = 0; + } + return false; +} + +// log(exp(x) + exp(y)) +static inline real logSum(real x, real y) { + real maxValue = std::max(x, y); + if (std::isinf(maxValue)) { + return -std::numeric_limits::infinity(); + } else { + return maxValue + log(exp(x - maxValue) + exp(y - maxValue)); + } +} + +static inline std::vector genRandLabels(int numClasses, int length) { + std::vector labels(length); + for (int i = 0; i < length; ++i) { + labels[i] = rand() % numClasses; // NOLINT + } + return labels; +} + +TEST(CRFLayer, cost) { + const int numClasses = 4; + CpuVector para(numClasses * (numClasses + 2)); + real* a = para.getData(); + real* b = para.getData() + numClasses; + real* w = para.getData() + 2 * numClasses; + LinearChainCRF crf(4, para.getData()); + for (int length : {1, 2, 3, 10}) { + for (int tries = 0; tries < 10; ++tries) { + CpuMatrix x(length, numClasses); + x.randomizeUniform(); + para.randnorm(0, 2); + + std::vector goldenLabels = genRandLabels(numClasses, length); + + real cost = crf.forward(x.getData(), goldenLabels.data(), length); + + real logZ = -std::numeric_limits::infinity(); + real logNominator = -std::numeric_limits::infinity(); + std::vector testResult(length, 0); + do { + real score = a[testResult.front()]; + score += x.getElement(0, testResult.front()); + for (int k = 1; k < length; ++k) { + score += x.getElement(k, testResult[k]) + + w[numClasses * testResult[k - 1] + testResult[k]]; + } + score += b[testResult.back()]; + logZ = logSum(logZ, score); + + if (goldenLabels == testResult) { + logNominator = score; + } + } while (getNextSequence(testResult, numClasses)); + + real trueCost = -logNominator + logZ; + + real diff = fabs(trueCost - cost); + diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost); + VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff + << std::endl; + if (typeid(real) == typeid(double)) { // NOLINT + EXPECT_LE(diff, 1e-10); + } else { + EXPECT_LE(diff, 5e-3); + } + } + } +} + +inline real epsilon() { return typeid(real) == typeid(double) ? 
1e-10 : 0.06; } + +TestConfig initTestConfig(size_t numClasses, bool withWeight) { + TestConfig config; + config.layerConfig.set_type("crf"); + config.layerConfig.set_size(numClasses); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, + "layer_0", + numClasses, + numClasses * (numClasses + 2)}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back( + {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0}); + config.layerConfig.add_inputs(); + + if (withWeight) { + config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0}); + config.layerConfig.add_inputs(); + } + + return config; +} + +TEST(Layer, CRFLayer) { + size_t numClasses = 10; + for (int tries = 0; tries < 5; ++tries) { + TestConfig config = initTestConfig(numClasses, /* withWeight= */ false); + for (int length : {1, 3, 100}) { + // Not support GPU now + testLayerGrad(config, + "crf", + length, + /* trans= */ false, + /* useGpu= */ false, + /* useWeight= */ false, + epsilon()); + } + } +} + +TEST(Layer, CRFLayerUseWeight) { + size_t numClasses = 10; + for (int tries = 0; tries < 5; ++tries) { + TestConfig config = initTestConfig(numClasses, /* withWeight= */ true); + for (int length : {1, 3, 100}) { + // Not support GPU now + testLayerGrad(config, + "crf", + length, + /* trans= */ false, + /* useGpu= */ false, + /* useWeight= */ false, + epsilon()); + } + } +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CompareSparse.cpp b/paddle/legacy/gserver/tests/test_CompareSparse.cpp new file mode 100644 index 0000000000000000000000000000000000000000..11b633a5885180ae227f6e93330117b567d4a4ab --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CompareSparse.cpp @@ -0,0 +1,228 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/legacy/trainer/Trainer.h" + +#include +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +static const string& configFile1 = "legacy/gserver/tests/sequence_lstm.conf"; + +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_int32(seed); +DECLARE_int32(num_passes); +DECLARE_int32(saving_period); + +DECLARE_int32(num_gradient_servers); +DECLARE_int32(port); +DECLARE_bool(local); +DECLARE_bool(use_old_updater); +DECLARE_bool(parallel_nn); +DECLARE_string(config_args); +DEFINE_double(max_diff_ratio, + 0.0f, + "max diff ratio allowed for parameters value"); + +int gNumDevices = 0; + +std::vector trainerOnePassTest(const string& configFile, + bool sparseUpdate, + int trainerCount = 1, + bool useGpu = false) { + FLAGS_use_gpu = useGpu; + FLAGS_config = configFile; + FLAGS_trainer_count = trainerCount; + FLAGS_config_args = sparseUpdate ? 
"sparse_update=1" : "sparse_update=0"; + + LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount + << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate; + srand(FLAGS_seed); + *ThreadLocalRand::getSeed() = FLAGS_seed; + ThreadLocalRandomEngine::get().seed(FLAGS_seed); + if (useGpu) { + CHECK_LE(trainerCount, gNumDevices); + } + + std::vector> pservers; + if (!FLAGS_local) { + int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; + pservers.resize(numPorts); + + for (int i = 0; i < numPorts; ++i) { + pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); + pservers[i]->init(); + pservers[i]->start(); + } + } + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlagConfig()); + trainer.train(); + return trainer.getGradientMachine()->getParameters(); +} + +std::vector& getDenseParameters() { + static std::vector denseParameters; + if (denseParameters.empty()) { + // use dense training as base + FLAGS_local = true; + denseParameters = trainerOnePassTest(configFile1, false); + } + + return denseParameters; +} + +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + double maxDiffRatio) { + double maxDiff = 0; + double maxValue = 0; + for (size_t i = 0; i < len; ++i) { + double diff = fabs(A[i] - B[i]); + maxValue = std::max(maxValue, std::max(fabs(A[i]), fabs(B[i]))); + maxDiff = std::max(maxDiff, diff); + } + EXPECT_LE(maxDiff / maxValue, maxDiffRatio); + LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue + << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n"; +} + +void compareValue(const vector& parametersA, + const vector& parametersB, + double maxDiffRatio = 0.0) { + LOG(INFO) << "\n\n--------------------------------" + << " Check Gradient Machine Parameters:" + << " -------------------------------------\n"; + for (size_t i = 0; i < parametersA.size(); ++i) { + ParameterPtr parameterA, parameterB; + parameterA = parametersA[i]; + parameterB = parametersB[i]; + + CpuVector paraA(parameterA->getSize()); + CpuVector paraB(parameterB->getSize()); + paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); + paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); + + LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() + << " ; size : " << paraA.getSize() << " ------------"; + checkBuffer(paraA.getData(), + "para_A", + paraB.getData(), + "para_B", + paraA.getSize(), + maxDiffRatio); + } +} + +TEST(compareSparse, cpu) { + FLAGS_local = 1; // disable remote sparse update in parameter config + std::vector parameters = trainerOnePassTest(configFile1, true); + compareValue(getDenseParameters(), parameters); +} + +TEST(compareSparse, remote_cpu) { + FLAGS_local = 0; // will enable remote sparse update + FLAGS_ports_num_for_sparse = 5; + std::vector parameters = trainerOnePassTest(configFile1, true); + compareValue(getDenseParameters(), parameters); +} + +TEST(compareSparse, cpu10_local_vs_remote) { + FLAGS_local = 1; // disable remote sparse update in parameter config + std::vector localParameters = + trainerOnePassTest(configFile1, true, 2); + + FLAGS_local = 0; // will enable remote sparse update + FLAGS_ports_num_for_sparse = 5; + std::vector remoteParameters = + trainerOnePassTest(configFile1, true, 2); + + compareValue(localParameters, remoteParameters); +} + +TEST(compareSparse, multiGradientMachine) { + int numGpu; +#ifdef PADDLE_TYPE_DOUBLE + double eps = 1e-8; +#else + double eps = 1e-4; +#endif + numGpu = hl_get_device_count(); + for (bool local : 
{false, true}) { + FLAGS_local = local; + FLAGS_ports_num_for_sparse = 5; + for (bool useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) continue; +#endif + FLAGS_parallel_nn = useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; + int trainerCount = useGpu ? numGpu : 2; + std::vector parameters = + trainerOnePassTest(configFile1, true, trainerCount, useGpu); + compareValue(getDenseParameters(), parameters, eps); + } + } + FLAGS_parallel_nn = false; +} + +TEST(compareSparse, NeuralNetwork) { +#ifdef PADDLE_TYPE_DOUBLE + double eps = 1e-8; +#else + double eps = 1e-4; +#endif + for (bool local : {false, true}) { + FLAGS_local = local; + FLAGS_ports_num_for_sparse = 5; + for (bool useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) continue; +#endif + FLAGS_parallel_nn = useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; + int trainerCount = 1; + std::vector parameters = + trainerOnePassTest(configFile1, true, trainerCount, useGpu); + compareValue(getDenseParameters(), parameters, useGpu ? eps : 0); + } + } + FLAGS_parallel_nn = false; +} + +int main(int argc, char** argv) { + // FIXME(tonyyang-svail): + // Turn off this test due CI failure: + // https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430 + return 0; + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + initPython(argc, argv); + + gNumDevices = hl_get_device_count(); + FLAGS_num_passes = 1; // train one pass + FLAGS_saving_period = 100000; // do not save parameter + + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e19c34abbd8a84660a9e79bcbf602437bfc92832 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include "paddle/legacy/trainer/Trainer.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_int32(gpu_id); + +DECLARE_bool(local); +DECLARE_bool(use_gpu); + +DECLARE_string(config); +DECLARE_string(nics); + +DEFINE_bool(need_high_accuracy, + false, + "whether need to run in double accuracy"); +DEFINE_double( + max_diff_ratio, + 0.0f, + "max diff ratio allowed for outputs and parameters (value/gradient)"); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_int32(seed); + +static const string& config_file_a = + "legacy/gserver/tests/sequence_recurrent.py"; +static const string& config_file_b = + "legacy/gserver/tests/sequence_recurrent_group.py"; + +struct ComData { + vector outArgs; + vector parameters; +}; + +void calcGradient(ComData& data, const string configFile) { + FLAGS_config = configFile; + + FLAGS_local = true; + FLAGS_use_gpu = false; + + FLAGS_nics = ""; + + *ThreadLocalRand::getSeed() = FLAGS_seed; + srand(FLAGS_seed); + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlagConfig(), false); + + data.parameters = trainer.getGradientMachine()->getParameters(); + + DataBatch dataBatch; + int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + + trainer.getDataProvider()->reset(); + trainer.getDataProvider()->setSkipShuffle(); + trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); + + CHECK(dataBatch.getSize()) << "No data from data provider"; + vector& inArgs = dataBatch.getStreams(); + + trainer.getGradientMachine()->start(); + trainer.getGradientMachine()->forwardBackward( + inArgs, &data.outArgs, PASS_TRAIN); + + trainer.getGradientMachine()->finish(); +} + +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { + int nNum = 0; + real maxVal = 0; + for (size_t i = 0; i < len; ++i) { + maxVal = std::max(maxVal, std::max(A[i], B[i])); + } + real maxDiff = 0; + for (size_t i = 0; i < len; ++i) { + real diff = fabs(A[i] - B[i]); + maxDiff = std::max(maxDiff, diff); + if (diff > maxVal * FLAGS_max_diff_ratio) { + nNum++; + VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << " " + << desB << " : " << B[i] << " diff=" << diff; + } + } + EXPECT_EQ(0, nNum); + LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n"; +} + +void compareGradient(ComData& comDataA, ComData& comDataB) { + vector outArgsA = comDataA.outArgs; + vector outArgsB = comDataB.outArgs; + + for (size_t i = 0; i < outArgsA.size(); ++i) { + CpuMatrix matA(outArgsA[i].value->getHeight(), + outArgsA[i].value->getWidth()); + CpuMatrix matB(outArgsB[i].value->getHeight(), + outArgsB[i].value->getWidth()); + + matA.copyFrom(*outArgsA[i].value); + matB.copyFrom(*outArgsB[i].value); + + LOG(INFO) << "\n--------------------------------" + << " Check Network Output_" << i << ":" + << " -------------------------------------\n"; + checkBuffer(matA.getData(), + "network A output", + matB.getData(), + "network B output", + matA.getElementCnt(), + matA.getWidth()); + } + + vector& parametersA = comDataA.parameters; + vector& parametersB = comDataB.parameters; + + LOG(INFO) << "\n\n--------------------------------" + << " Check Gradient Machine Parameters:" + << " -------------------------------------\n"; + for (size_t i = 0; i < parametersA.size(); ++i) { + ParameterPtr parameterA, parameterB; + parameterA = parametersA[i]; + parameterB = parametersB[i]; + + CpuVector paraA(parameterA->getSize()); + CpuVector 
paraB(parameterB->getSize()); + paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE)); + paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE)); + + LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() + << " ; size : " << paraA.getSize() << " ------------"; + checkBuffer(paraA.getData(), + "Network A", + paraB.getData(), + "Network B", + paraA.getSize()); + + CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); + CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT)); + + LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() + << " ; size : " << gradA.getSize() << " -----------"; + checkBuffer(gradA.getData(), + "Network A", + gradB.getData(), + "Network B", + gradA.getSize()); + } +} + +TEST(Trainer, create) { + ComData dataA; + calcGradient(dataA, config_file_a); + LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; + + ComData dataB; + calcGradient(dataB, config_file_b); + LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; + + compareGradient(dataA, dataB); +} + +int main(int argc, char** argv) { + FLAGS_thread_local_rand_use_global_seed = true; + paddle::initMain(argc, argv); + testing::InitGoogleTest(&argc, argv); + initPython(argc, argv); + +#ifndef PADDLE_TYPE_DOUBLE + if (FLAGS_need_high_accuracy) { + LOG(INFO) << "skip test due to it's need high accuracy"; + return 0; + } + if (FLAGS_max_diff_ratio == 0.0f) { + FLAGS_max_diff_ratio = 1e-5; + LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio + << " in low accuracy mode"; + } +#else + if (FLAGS_max_diff_ratio == 0.0f) { + FLAGS_max_diff_ratio = 1e-10; + LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio + << " in high accuracy mode"; + } +#endif + + int ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/paddle/legacy/gserver/tests/test_ConvTrans.cpp b/paddle/legacy/gserver/tests/test_ConvTrans.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4ea0a3d379b010fcb6ccb91a28e653a53cfe66d8 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_ConvTrans.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); + +// Test that the convTrans forward is the same as conv backward +TEST(Layer, convTransLayerFwd) { + // Setting up conv-trans layer + TestConfig configt; + configt.biasSize = 3; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(3); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false); + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, ¶meters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->forward(PASS_GC); + + // Setting up conv-layer config + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type("exconv"); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); + input = config.layerConfig.add_inputs(); + conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(3); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_x() * + config.layerConfig.num_filters()); + config.layerConfig.set_name("conv"); + + // data layer initialize + std::vector dataLayers2; + LayerMap layerMap2; + vector datas2; + initDataLayer( + config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false); + // test layer initialize + std::vector parameters2; + LayerPtr convLayer; + initTestLayer(config, &layerMap2, ¶meters2, &convLayer); + + // Sync convLayer and convtLayer parameter + convLayer->getBiasParameter()->zeroMem(); + convLayer->getParameters()[0] + ->getBuf(PARAMETER_VALUE) + ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); + + // Set convLayer 
outputGrad as convTransLayer input value + convLayer->forward(PASS_GC); + convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); + + vector callbackFlags(parameters2.size(), 0); + auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; + convLayer->backward(callback); + + // Check that the convLayer backward is the same as convTransLayer forward + checkMatrixEqual(convtLayer->getOutputValue(), + dataLayers2[0]->getOutputGrad()); +} + +// Do one forward pass of convTrans layer and check to see if its output +// matches the given result +void doOneConvtTest(size_t imgSize, + size_t output_x, + size_t stride, + size_t padding, + size_t filter_size, + MatrixPtr& result) { + TestConfig configt; + configt.biasSize = 1; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(1); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(filter_size); + conv->set_filter_size_y(filter_size); + conv->set_channels(1); + conv->set_padding(padding); + conv->set_padding_y(padding); + conv->set_stride(stride); + conv->set_stride_y(stride); + conv->set_groups(1); + conv->set_filter_channels(1); + conv->set_img_size(imgSize); + conv->set_output_x(output_x); + + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false); + dataLayers[0]->getOutputValue()->zeroMem(); + dataLayers[0]->getOutputValue()->add(1.0); + + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, ¶meters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->getParameters()[0]->zeroMem(); + convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); + convtLayer->forward(PASS_GC); + + checkMatrixEqual(convtLayer->getOutputValue(), result); +} + +TEST(Layer, convTransLayerFwd2) { + MatrixPtr result; + result = Matrix::create(1, 5 * 5, false, false); + result->zeroMem(); + result->add(1.0); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 1, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 5, + result); + + real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 4, + result); + + real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData2); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 1, + /* filter_size */ 5, + result); + + real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, + 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; + result->setData(resultData3); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 0, + /* filter_size */ 3, + result); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git 
a/paddle/legacy/gserver/tests/test_ConvUnify.cpp b/paddle/legacy/gserver/tests/test_ConvUnify.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d4ca158352d9e4bf859b31b7c7410518bdc20ac6 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_ConvUnify.cpp @@ -0,0 +1,315 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); + +// Do one forward pass of ConvLayer using either exconv or cudnn_conv +MatrixPtr doOneConvTest(size_t imgSize, + size_t output_x, + size_t stride, + size_t padding, + size_t filter_size, + size_t channel, + size_t numfilters, + size_t groups, + MatrixPtr& inputData, + real* param, + bool useGpu, + bool isDeconv = false) { + TestConfig config; + config.biasSize = numfilters; + string layerType; + if (useGpu) { + layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv"; + } else { + layerType = (isDeconv) ? 
"exconvt" : "exconv"; + } + config.layerConfig.set_type(layerType); + config.layerConfig.set_num_filters(numfilters); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + size_t weightSize = channel * filter_size * filter_size * + config.layerConfig.num_filters() / groups; + if (isDeconv) { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize}); + config.layerConfig.set_size(imgSize * imgSize * + config.layerConfig.num_filters()); + } else { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); + config.layerConfig.set_size(output_x * output_x * + config.layerConfig.num_filters()); + } + + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(filter_size); + conv->set_filter_size_y(filter_size); + conv->set_channels(channel); + conv->set_padding(padding); + conv->set_padding_y(padding); + conv->set_stride(stride); + conv->set_stride_y(stride); + conv->set_groups(groups); + conv->set_img_size(imgSize); + conv->set_output_x(output_x); + + if (isDeconv) { + conv->set_filter_channels(numfilters / groups); + } else { + conv->set_filter_channels(channel / groups); + } + + config.layerConfig.set_name("conv"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu); + dataLayers[0]->getOutputValue()->zeroMem(); + dataLayers[0]->getOutputValue()->copyFrom(*inputData); + + // test layer initialize + std::vector parameters; + LayerPtr convLayer; + initTestLayer(config, &layerMap, ¶meters, &convLayer); + convLayer->getBiasParameter()->zeroMem(); + convLayer->getParameters()[0]->zeroMem(); + convLayer->getParameters()[0] + ->getBuf(PARAMETER_VALUE) + ->copyFrom(param, weightSize); + convLayer->forward(PASS_GC); + + return convLayer->getOutputValue(); +} + +TEST(Layer, convParaUnified) { +#ifdef PADDLE_WITH_CUDA + MatrixPtr input, resultCpu, resultGpu; + + /// TEST1 for conv /// + input = Matrix::create(1, 4 * 4, false, false); + real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + + input->setData(inputData); + + resultCpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST1 for deconv /// + input = Matrix::create(1, 2 * 2, false, false); + real inputDataT[] = {1, 2, 3, 4}; + input->setData(inputDataT); + + resultCpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for conv /// + input = Matrix::create(1, 3 * 3 * 2, false, false); 
+ real inputData2[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1}; + + input->setData(inputData2); + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST3 for conv /// + real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for deconv /// + input = Matrix::create(1, 2 * 2 * 2, false, false); + real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8}; + input->setData(inputData2T); + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST3 for deconv /// + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..34eb0dedffeba46c662a0e69ce9ba82f474a8358 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp @@ -0,0 +1,352 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT + +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +const size_t MAX_SEQ_NUM = 23; +const size_t MAX_SEQ_LEN = 50; +const size_t MAX_BEAM_SIZE = 27; + +const size_t SEED = (size_t)(time(NULL)); + +struct SingleBeamExpansion { + vector seqStartPos; + vector subSeqStartPos; + vector candidateScores; + + // TODO(caoying): store this into Argument.ids + vector selectedIndices; + + vector groundTruth; + vector inBeam; + vector rowIdxInBeam; + vector colIdxInBeam; + + void resetGroundTruth(size_t n) { + groundTruth.clear(); + groundTruth.resize(n, -1); + + inBeam.clear(); + inBeam.resize(n, 0); + + rowIdxInBeam.clear(); + rowIdxInBeam.resize(n, -1); + + colIdxInBeam.clear(); + colIdxInBeam.resize(n, -1); + } +}; + +inline float randFloat() { + return static_cast(rand()) / static_cast(RAND_MAX); +} + +void genRand(real* numbers, size_t n) { + default_random_engine generator; + uniform_real_distribution distribution(0.0, 1.0); + for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator); +} + +vector randSampling(real range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0.); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + sort(begin(num), end(num)); + return num; +} + +void genCandidateScores(bool hasSubseq, + size_t beamSize, + SingleBeamExpansion& prevBeam, + SingleBeamExpansion& curBeam) { + vector& seqStartPos = curBeam.seqStartPos; + seqStartPos.resize(1, 0); + vector& subSeqStartPos = curBeam.subSeqStartPos; + subSeqStartPos.resize(1, 0); + + srand(SEED); + if (prevBeam.selectedIndices.size()) { + if (prevBeam.subSeqStartPos.size() > 1) { + int seqIdx = 1; + // samples in previous beam are nested sequences. + for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) { + for (size_t j = 0; j < beamSize; ++j) { + if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break; + subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) + + subSeqStartPos.back()); + } + if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) { + seqStartPos.push_back(subSeqStartPos.back()); + seqIdx++; + } + } + } else { + for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) { + if (i && i % beamSize == 0) { + seqStartPos.push_back(subSeqStartPos.back()); + if (i == prevBeam.selectedIndices.size()) break; + } + if (prevBeam.selectedIndices[i] == -1.) continue; + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + } + } + } else { + // the first beam expansion + int seqNum = 1 + (rand() % MAX_SEQ_NUM); + for (int i = 0; i < seqNum; ++i) { + if (hasSubseq) { + for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j) + subSeqStartPos.push_back(subSeqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + seqStartPos.push_back(subSeqStartPos.back()); + } else { + seqStartPos.push_back(seqStartPos.back() + + (1 + (rand() % MAX_SEQ_LEN))); + } + } + } + + size_t totalSeqNum = hasSubseq ? 
subSeqStartPos.back() : seqStartPos.back(); + curBeam.candidateScores.resize(totalSeqNum, 0.); + genRand(curBeam.candidateScores.data(), totalSeqNum); +} + +void genSelectedIndices(size_t beamSize, + vector& seqStartPos, + vector& selectedIndices) { + size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1); + selectedIndices.resize(selectedIdsCount, -1.); + + for (size_t i = 0; i < seqStartPos.size() - 1; ++i) { + int seqLen = seqStartPos[i + 1] - seqStartPos[i]; + int n = min(seqLen, static_cast(beamSize)); + vector ids = randSampling(seqLen, n); + memcpy(selectedIndices.data() + i * beamSize, + ids.data(), + sizeof(real) * ids.size()); + } +} + +void genGroundTruth(vector& beamExpansions, + size_t beamSize) { + SingleBeamExpansion& beam = beamExpansions[1]; + size_t seqNum = beam.seqStartPos.size() - 1; + for (size_t i = 2; i < beamExpansions.size(); ++i) + CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1); + + srand(SEED); + + // initialize the first beam. + beam.resetGroundTruth(seqNum); + for (size_t i = 0; i < seqNum; ++i) { + if (randFloat() > 0.5) { + /* + * force the randomly generated label falls in the beam by chance 0.5. + * otherwise, when sequence length is relatively long and beam size is + * relatively small, the gold sequences falls off the beam at in the + * first search. + */ + real* begPos = beam.selectedIndices.data() + i * beamSize; + beam.colIdxInBeam[i] = + rand() % count_if(begPos, begPos + beamSize, [](const real& val) { + return val != -1.; + }); + beam.groundTruth[i] = + beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]]; + beam.inBeam[i] = 1; + } else { + int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]); + beam.groundTruth[i] = label; + + real* begPos = beam.selectedIndices.data() + i * beamSize; + real* endPos = begPos + beamSize; + real* lblPos = find(begPos, endPos, real(label)); + if (lblPos != endPos) { + beam.inBeam[i] = 1; + beam.colIdxInBeam[i] = lblPos - begPos; + } + } + beam.rowIdxInBeam[i] = i; + } + + // iterate over each beam expansions + for (size_t i = 2; i < beamExpansions.size(); ++i) { + SingleBeamExpansion& curBeam = beamExpansions[i]; + SingleBeamExpansion& prevBeam = beamExpansions[i - 1]; + curBeam.resetGroundTruth(seqNum); + + // iterate over each sequence + for (size_t j = 0; j < seqNum; ++j) { + if (!prevBeam.inBeam[j]) continue; + + // gold sequence falls in the beam in previous search. + real* begPos = prevBeam.selectedIndices.data(); + int offset = + prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j]; + curBeam.rowIdxInBeam[j] = count_if( + begPos, begPos + offset, [](const real& val) { return val != -1.; }); + + if (randFloat() > 0.5) { + // force the randomly generated label falls in the beam by chance 0.5. 
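// (With probability 0.5 the label is copied from one of the candidates already
// selected into the beam, so inBeam is guaranteed; in the else-branch below the
// label is drawn uniformly from the sub-sequence and is only marked inBeam if it
// happens to coincide with a selected candidate.)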
+ + real* start = + curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; + int n = rand() % count_if(start, start + beamSize, [](const real& val) { + return val != -1.; + }); + curBeam.colIdxInBeam[j] = n; + curBeam.groundTruth[j] = *(start + n); + curBeam.inBeam[j] = 1; + } else { + CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1, + curBeam.subSeqStartPos.size() - 1); + int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]]; + int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1]; + CHECK_GT(size_t(end), size_t(start)); + int label = rand() % (end - start); + + curBeam.groundTruth[j] = label; + real* findBeg = + curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize; + real* lblPos = + find(findBeg, findBeg + beamSize, static_cast(label)); + if (lblPos != (findBeg + beamSize)) { + curBeam.inBeam[j] = 1; + curBeam.colIdxInBeam[j] = lblPos - findBeg; + } + } + } + } +} + +void genOneBeam(size_t beamSize, + bool hasSubseq, + SingleBeamExpansion& prevBeam, + SingleBeamExpansion& curBeam) { + genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam); + genSelectedIndices(beamSize, + hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos, + curBeam.selectedIndices); +} + +void genRandomBeamExpansion(size_t expansionCount, + size_t beamSize, + vector& beamExpansions) { + beamExpansions.clear(); + beamExpansions.resize(expansionCount + 1); + + // beamExpansions[0] is reserved. + for (size_t i = 1; i <= expansionCount; ++i) + genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]); + genGroundTruth(beamExpansions, beamSize); +} + +void testCrossEntropyOverBeam(bool useGpu, + size_t beamSize, + vector& beams) { + TestConfig config; + config.layerConfig.set_type("cross_entropy_over_beam"); + + size_t seqNum = 0; + for (size_t i = 1; i < beams.size(); ++i) { + const SingleBeamExpansion& beam = beams[i]; + // create scores for all the candidates + MatrixPtr candidateScorePtr = + Matrix::create(beam.candidateScores.size(), 1, false, false); + candidateScorePtr->copyFrom(beam.candidateScores.data(), + beam.candidateScores.size()); + + ostringstream paramName; + paramName << "candidate_scores_" << i; + + if (beam.subSeqStartPos.size() > 1) { + seqNum = beam.subSeqStartPos.size() - 1; + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + paramName.str(), + candidateScorePtr, + beam.seqStartPos, + beam.subSeqStartPos}); + } else { + seqNum = beam.seqStartPos.size() - 1; + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + paramName.str(), + candidateScorePtr, + beam.seqStartPos}); + } + config.layerConfig.add_inputs(); + + // create indices for the selected candidates + MatrixPtr selectedCandidates = + Matrix::create(seqNum, beamSize, false, false); + selectedCandidates->copyFrom(beam.selectedIndices.data(), + beam.selectedIndices.size()); + paramName.clear(); + paramName << "selected_candidates_" << i; + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates}); + config.layerConfig.add_inputs(); + + // create the ground truth + paramName.clear(); + paramName << "label_" << i; + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth}); + config.layerConfig.add_inputs(); + } + + testLayerGrad( + config, "cross_entropy_over_beam", seqNum, false, useGpu, false); +} + +TEST(Layer, CrossEntropyOverBeam) { + LOG(INFO) << "SEED = " << SEED; + const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE; + LOG(INFO) << "beamSize = " << beamSize; + + // TODO(caoying): test with random beam 
expansions. + const size_t expansionCount = 3; + vector beams; + genRandomBeamExpansion(expansionCount, beamSize, beams); + + for (bool useGpu : {false, true}) + testCrossEntropyOverBeam(useGpu, beamSize, beams); +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(SEED); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/legacy/gserver/tests/test_DetectionOutput.cpp similarity index 100% rename from paddle/gserver/tests/test_DetectionOutput.cpp rename to paddle/legacy/gserver/tests/test_DetectionOutput.cpp diff --git a/paddle/legacy/gserver/tests/test_Evaluator.cpp b/paddle/legacy/gserver/tests/test_Evaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8aab50d23e56e449d86f22a315c45432253cdd07 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_Evaluator.cpp @@ -0,0 +1,267 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/trainer/Trainer.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +enum InputType { + INPUT_DATA, // dense vector + INPUT_LABEL, // id + INPUT_DATA_TARGET, // dense vector, but no gradient + INPUT_SEQUENCE_DATA, + INPUT_SEQUENCE_LABEL, + INPUT_SPARSE_NON_VALUE_DATA +}; + +struct InputDef { + InputType inputType; + string name; + size_t dim; +}; + +struct TestConfig { + EvaluatorConfig evaluatorConfig; + std::vector inputDefs; + bool testAccumulate; + TestConfig() : testAccumulate(true) {} +}; + +void testEvaluator(TestConfig testConf, + string testEvaluatorName, + size_t batchSize, + bool useGpu) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) return; +#endif + FLAGS_use_gpu = useGpu; + testConf.evaluatorConfig.set_name(testEvaluatorName); + LOG(INFO) << " evaluator_type=" << testConf.evaluatorConfig.type() + << " useGpu=" << useGpu; + + std::vector arguments; + for (size_t i = 0; i < testConf.inputDefs.size(); ++i) { + Argument data; + size_t dim = testConf.inputDefs[i].dim; + switch (testConf.inputDefs[i].inputType) { + case INPUT_DATA: + case INPUT_SEQUENCE_DATA: + case INPUT_DATA_TARGET: + data.value = Matrix::create(batchSize, dim, false, useGpu); + data.value->randomizeUniform(); + + // make sure output > 0 && output < 1 + data.value->add(-0.5); + data.value->sigmoid(*data.value); + break; + case INPUT_LABEL: + case INPUT_SEQUENCE_LABEL: + data.ids = VectorT::create(batchSize, useGpu); + data.ids->rand(dim); // now rand number can be 0 to inputDefs[i].dim. 
+ break; + case INPUT_SPARSE_NON_VALUE_DATA: + data.value = makeRandomSparseMatrix(batchSize, + dim, + /* withValue= */ false, + useGpu); + break; + default: + LOG(FATAL) << " unknown inputType "; + return; + } + + ICpuGpuVectorPtr sequenceStartPositions; + if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA || + testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) { + if (!sequenceStartPositions) { + generateSequenceStartPositions(batchSize, sequenceStartPositions); + } + data.sequenceStartPositions = sequenceStartPositions; + } + + arguments.push_back(data); + } + + Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig); + double totalScore = 0.0; + testEvaluator->start(); + totalScore += testEvaluator->evalImp(arguments); + testEvaluator->updateSamplesNum(arguments); + testEvaluator->finish(); + LOG(INFO) << *testEvaluator; + + std::vector names; + testEvaluator->getNames(&names); + paddle::Error err; + for (auto& name : names) { + auto value = testEvaluator->getValue(name, &err); + ASSERT_TRUE(err.isOK()); + LOG(INFO) << name << " " << value; + auto tp = testEvaluator->getType(name, &err); + ASSERT_TRUE(err.isOK()); + ASSERT_EQ(testConf.evaluatorConfig.type(), tp); + } + + double totalScore2 = 0.0; + if (testConf.testAccumulate) { + testEvaluator->start(); + totalScore2 += testEvaluator->evalImp(arguments); + testEvaluator->finish(); + EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5); + } +} + +void testEvaluatorAll(TestConfig testConf, + string testEvaluatorName, + size_t batchSize) { + testEvaluator(testConf, testEvaluatorName, batchSize, true); + testEvaluator(testConf, testEvaluatorName, batchSize, false); +} + +TEST(Evaluator, detection_map) { + TestConfig config; + config.evaluatorConfig.set_type("detection_map"); + config.evaluatorConfig.set_overlap_threshold(0.5); + config.evaluatorConfig.set_background_id(0); + config.evaluatorConfig.set_ap_type("Integral"); + config.evaluatorConfig.set_evaluate_difficult(0); + + config.inputDefs.push_back({INPUT_DATA, "output", 7}); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6}); + config.evaluatorConfig.set_evaluate_difficult(false); + testEvaluatorAll(config, "detection_map", 100); + + config.evaluatorConfig.set_evaluate_difficult(true); + testEvaluatorAll(config, "detection_map", 100); +} + +TEST(Evaluator, classification_error) { + TestConfig config; + config.evaluatorConfig.set_type("classification_error"); + config.evaluatorConfig.set_top_k(5); + + config.inputDefs.push_back({INPUT_DATA, "output", 50}); + config.inputDefs.push_back({INPUT_LABEL, "label", 50}); + testEvaluatorAll(config, "classification_error", 100); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + testEvaluatorAll(config, "classification_error_weight", 100); + + // multi binary labels + config.inputDefs.clear(); + config.inputDefs.push_back({INPUT_DATA, "output", 100}); + config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 100}); + // Not support GPU + testEvaluator(config, "classification_error_multi_binary_label", 50, false); + + config.evaluatorConfig.set_classification_threshold(0.4); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + // Not support GPU + testEvaluator( + config, "classification_error_weight_multi_binary_label", 50, false); +} + +TEST(Evaluator, sum) { + TestConfig config; + config.evaluatorConfig.set_type("sum"); + + // sum of output + config.inputDefs.push_back({INPUT_DATA, "output", 10}); + testEvaluatorAll(config, "sum_output", 200); + 
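// Adding a one-dimensional "weight" input below switches the evaluator to its
// sample-weighted variant (exercised as "sum_output_weight").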
config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + testEvaluatorAll(config, "sum_output_weight", 200); + + // sum of label + config.inputDefs.clear(); + config.inputDefs.push_back({INPUT_LABEL, "label", 10}); + testEvaluatorAll(config, "sum_label", 200); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + testEvaluatorAll(config, "sum_label_weight", 200); +} + +TEST(Evaluator, last_column_sum) { + TestConfig config; + config.evaluatorConfig.set_type("last-column-sum"); + + config.inputDefs.push_back({INPUT_DATA, "output", 50}); + testEvaluatorAll(config, "last-column-sum", 200); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + testEvaluatorAll(config, "last-column-sum_weight", 200); +} + +TEST(Evaluator, last_column_auc) { + TestConfig config; + config.evaluatorConfig.set_type("last-column-auc"); + + config.inputDefs.push_back({INPUT_DATA, "output", 2}); + config.inputDefs.push_back({INPUT_LABEL, "label", 2}); + testEvaluatorAll(config, "last-column-auc", 500); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + testEvaluatorAll(config, "last-column-auc_weight", 200); +} + +TEST(Evaluator, precision_recall) { + TestConfig config; + config.evaluatorConfig.set_type("precision_recall"); + + config.inputDefs.push_back({INPUT_DATA, "output", 10}); + config.inputDefs.push_back({INPUT_LABEL, "label", 10}); + testEvaluatorAll(config, "precision_recall", 200); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + testEvaluatorAll(config, "precision_recall_weight", 200); + + LOG(INFO) << "positive_label = 5"; + config.evaluatorConfig.set_positive_label(5); + testEvaluatorAll(config, "precision_recall_weight", 200); + + // multi binary labels + config.inputDefs.clear(); + config.evaluatorConfig.set_positive_label(-1); + config.inputDefs.push_back({INPUT_DATA, "output", 10}); + config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 10}); + // Not support GPU + testEvaluator(config, "precision_recall_multi_binary_label", 100, false); + + LOG(INFO) << "classification_threshold = 0.4"; + config.evaluatorConfig.set_classification_threshold(0.4); + config.inputDefs.push_back({INPUT_DATA, "weight", 1}); + // Not support GPU + testEvaluator( + config, "precision_recall_weight_multi_binary_label", 100, false); +} + +TEST(Evaluator, ctc_error_evaluator) { + TestConfig config; + config.evaluatorConfig.set_type("ctc_edit_distance"); + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32}); + config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1}); + testEvaluatorAll(config, "ctc_error_evaluator", 100); +} + +int main(int argc, char** argv) { + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_Expand.cpp b/paddle/legacy/gserver/tests/test_Expand.cpp similarity index 100% rename from paddle/gserver/tests/test_Expand.cpp rename to paddle/legacy/gserver/tests/test_Expand.cpp diff --git a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e15b4e5038cddda00acdd06b7748984b03094e6e --- /dev/null +++ b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp @@ -0,0 +1,164 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/utils/GlobalConstants.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_bool(thread_local_rand_use_global_seed); + +vector randSampling(int range, int n) { + CHECK_GE(range, n); + vector num(range); + iota(begin(num), end(num), 0); + if (range == n) return num; + + random_shuffle(begin(num), end(num)); + num.resize(n); + return num; +} + +void genRandomSeqInfo(vector& seqStartPosition, + vector& subSeqStartPosition) { + const int maxSeqNum = 100; + // generate random start position information + int seqNum = 1 + (rand() % maxSeqNum); + seqStartPosition.resize(seqNum + 1, 0); + subSeqStartPosition.resize(1, 0); + + for (int i = 0; i < seqNum; ++i) { + int subSeqLen = 1 + (rand() % maxSeqNum); + for (int j = 0; j < subSeqLen; ++j) + subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen); + seqStartPosition[i + 1] = subSeqStartPosition.back(); + } +} + +void genRandomGroundTruth(real* values, + vector>& groundTruth, + vector& startPos, + size_t beamSize) { + groundTruth.resize(startPos.size() - 1, vector(beamSize, -1)); + for (size_t i = 0; i < startPos.size() - 1; ++i) { + int seqLen = startPos[i + 1] - startPos[i]; + vector pos = + randSampling(seqLen, min(static_cast(beamSize), seqLen)); + for (size_t j = 0; j < pos.size(); ++j) { + groundTruth[i][j] = pos[j]; + values[startPos[i] + pos[j]] = 1.; + } + } +} + +void checkLayerOut(vector> groundTruth, + real* layerOut, + size_t beamSize) { + for (size_t i = 0; i < groundTruth.size(); ++i) { + int begPos = i * beamSize; + vector tmp(layerOut + begPos, layerOut + begPos + beamSize); + sort(begin(tmp), end(tmp)); + sort(begin(groundTruth[i]), end(groundTruth[i])); + for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]); + } +} + +TEST(Layer, kmaxSeqScoreLayer) { + const size_t maxBeamSize = 100; + size_t beamSize = 1 + (rand() % maxBeamSize); + + vector seqStartPosition; + vector subSeqStartPosition; + genRandomSeqInfo(seqStartPosition, subSeqStartPosition); + MatrixPtr inValue = + Matrix::create(subSeqStartPosition.back(), 1, false, false); + + std::vector mode = {false}; +#ifdef PADDLE_WITH_CUDA + mode.push_back(true); +#endif + + for (auto hasSubseq : {false, true}) { + vector> groundTruth; + inValue->randomizeUniform(); + genRandomGroundTruth(inValue->getData(), + groundTruth, + hasSubseq ? 
subSeqStartPosition : seqStartPosition, + beamSize); + + for (auto useGpu : mode) { + TestConfig config; + config.layerConfig.set_type("kmax_seq_score"); + config.layerConfig.set_beam_size(beamSize); + + if (hasSubseq) { + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, + "scores", + inValue, + seqStartPosition, + subSeqStartPosition}); + } else { + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition}); + } + config.layerConfig.add_inputs(); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + config, + &dataLayers, + &datas, + &layerMap, + "kmax_seq_score", + 100 /* actually this parameter is unused in self-defined input*/, + false, + useGpu); + // test layer initialize + std::vector parameters; + LayerPtr kmaxSeqScoreLayer; + FLAGS_use_gpu = useGpu; + initTestLayer(config, &layerMap, ¶meters, &kmaxSeqScoreLayer); + kmaxSeqScoreLayer->forward(PASS_TRAIN); + + const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue(); + CHECK_EQ(outValue->getHeight(), + hasSubseq ? subSeqStartPosition.size() - 1 + : seqStartPosition.size() - 1); + CHECK_EQ(outValue->getWidth(), beamSize); + checkLayerOut(groundTruth, outValue->getData(), beamSize); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand((size_t)(time(NULL))); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_LayerGrad.cpp b/paddle/legacy/gserver/tests/test_LayerGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..979cf8ee673291d66f8704f2deda6c7160f4b228 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_LayerGrad.cpp @@ -0,0 +1,2532 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/math/MathUtils.h" + +#include "LayerGradUtil.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(prev_batch_state); + +TEST(Operator, dot_mul) { + TestConfig config; + config.layerConfig.set_size(10); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); + operatorConf.set_type("dot_mul"); + operatorConf.set_dotmul_scale(-1); + + testOperatorGrad(config, operatorConf, 100, false, false); +} + +TEST(Projection, context) { + for (auto contextStart : {-5, -3, -1, 0, 3}) { + for (auto contextLength : {1, 2, 5, 7}) { + for (auto batchSize : {1, 2, 5, 20}) { + for (auto trainablePadding : {false, true}) { + LOG(INFO) << " contextStart=" << contextStart + << " contextLength=" << contextLength + << " batchSize=" << batchSize + << " trainablePadding=" << trainablePadding; + ProjectionConfig conf; + conf.set_type("context"); + conf.set_input_size(10); + conf.set_context_start(contextStart); + conf.set_context_length(contextLength); + conf.set_trainable_padding(trainablePadding); + conf.set_output_size(conf.context_length() * conf.input_size()); + int pad = + std::max(0, -conf.context_start()) + + std::max(0, conf.context_start() + conf.context_length() - 1); + for (auto useGpu : {false, true}) { + testProjectionGrad( + conf, + INPUT_SEQUENCE_DATA, + trainablePadding ? 
conf.input_size() * pad : 0, + batchSize, + useGpu, + contextStart + contextLength <= 1); // = testState + } + } + } + } + } +} + +TEST(Projection, trans_fc) { + ProjectionConfig conf; + conf.set_type("trans_fc"); + conf.set_input_size(50); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1000, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, fc) { + ProjectionConfig conf; + conf.set_type("fc"); + conf.set_input_size(10); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, dot_mul) { + ProjectionConfig conf; + conf.set_type("dot_mul"); + conf.set_input_size(20); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 20, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, table) { + ProjectionConfig conf; + conf.set_type("table"); + conf.set_input_size(10); + conf.set_output_size(20); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_LABEL, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, identity) { + ProjectionConfig conf; + conf.set_type("identity"); + conf.set_input_size(10); + conf.set_output_size(10); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, slice) { + ProjectionConfig conf; + conf.set_type("slice"); + conf.set_input_size(100); + SliceConfig& slice1 = *conf.add_slices(); + slice1.set_start(10); + slice1.set_end(20); + SliceConfig& slice2 = *conf.add_slices(); + slice2.set_start(50); + slice2.set_end(70); + conf.set_output_size(30); + for (auto useGpu : {false, true}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 10, + useGpu); + } +} + +TEST(Projection, scaling) { + ProjectionConfig conf; + conf.set_type("scaling"); + conf.set_input_size(10); + conf.set_output_size(10); + for (auto useGpu : {false}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1, + /* batchSize */ 100, + useGpu); + } +} + +void testProjectionConv(size_t groups, bool isDeconv) { + const int NUM_FILTERS = 18; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 2; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + +#if CUDNN_VERSION >= 6000 + const int DILATION = 2; +#else + const int DILATION = 1; +#endif + + ProjectionConfig conf; + if (isDeconv) { + conf.set_type("convt"); + } else { + conf.set_type("conv"); + } + conf.set_num_filters(NUM_FILTERS); + + ConvConfig* conv = conf.mutable_conv_conf(); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_dilation(DILATION); + conv->set_dilation_y(DILATION); + conv->set_groups(groups); + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + } + conv->set_img_size(IMAGE_SIZE); + int output_x = outputSize(conv->img_size(), + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int output_y = outputSize(conv->img_size(), + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + 
conv->stride_y(), + /* caffeMode */ true); + conv->set_output_x(output_x); + conv->set_output_y(output_y); + LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x + << "; output_y: " << output_y; + if (isDeconv) { + int deconv_image_x = imageSize(output_x, + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int deconv_image_y = imageSize(output_y, + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); + + LOG(INFO) << " deconv_image_x: " << deconv_image_x + << "; deconv_image_y: " << deconv_image_y; + conf.set_input_size(output_x * output_y * CHANNELS); + conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); + } else { + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(output_x * output_y * NUM_FILTERS); + } + + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * + FILTER_SIZE_Y / groups, + /* batchSize */ 100, + true, + false, + NUM_FILTERS, + true); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Projection, conv) { + /// test ConvProjection + testProjectionConv(1, false); + testProjectionConv(3, false); + /// test ConvTransProjection + testProjectionConv(1, true); + testProjectionConv(3, true); +} +#endif + +TEST(Layer, BilinearInterpLayer) { + TestConfig config; + config.layerConfig.set_type("bilinear_interp"); + config.biasSize = 0; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + + LayerInputConfig* input = config.layerConfig.add_inputs(); + BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); + ImageConfig* image = bilinear->mutable_image_conf(); + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); + + for (auto useGpu : {false, true}) { + for (auto outSize : {32, 64}) { + bilinear->set_out_size_x(outSize); + bilinear->set_out_size_y(outSize); + testLayerGrad(config, "bilinear_interp", 10, false, useGpu); + } + } +} + +TEST(Layer, concat) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("concat"); + config.layerConfig.set_size(15); + config.layerConfig.set_active_type("sigmoid"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "concat", 100, false, useGpu); + } +} + +TEST(Layer, AddtoLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(10); + config.layerConfig.set_active_type("sigmoid"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "addto", 100, false, useGpu); + } +} + +TEST(Layer, CTCLayer) { + TestConfig config; + config.layerConfig.set_type("ctc"); + config.layerConfig.set_norm_by_times(false); + config.layerConfig.set_size(10); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "ctc", + 100, + /* trans */ false, /* useGpu */ + useGpu); + } +} + 
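// All TEST cases in this file go through testLayerGrad, which compares the
// layer's analytic gradients with numerically estimated ones. The following is
// only a minimal, self-contained sketch of that central-difference idea;
// fSketch and checkGradSketch are illustrative helpers, not part of the Paddle
// test harness.

// fSketch(x) = x * x, whose analytic gradient is 2 * x.
inline double fSketch(double x) { return x * x; }

// Returns true when the numeric gradient (f(x + eps) - f(x - eps)) / (2 * eps)
// matches the analytic gradient up to maxDiffRatio, playing the same role that
// FLAGS_checkgrad_eps plays in the real tests.
inline bool checkGradSketch(double x,
                            double eps = 1e-5,
                            double maxDiffRatio = 1e-5) {
  double numeric = (fSketch(x + eps) - fSketch(x - eps)) / (2.0 * eps);
  double analytic = 2.0 * x;
  double diff = numeric > analytic ? numeric - analytic : analytic - numeric;
  double scale = analytic >= 0 ? analytic : -analytic;
  if (scale < 1.0) scale = 1.0;
  return diff <= maxDiffRatio * scale;
}
// Example: checkGradSketch(3.0) holds for any reasonable eps.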
+TEST(Layer, cosSimLayer) { + TestConfig config; + config.layerConfig.set_type("cos"); + config.layerConfig.set_size(1); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "cos", 100, false, useGpu); + } +} + +TEST(Layer, CosSimVecMatLayer) { + TestConfig config; + config.layerConfig.set_type("cos_vm"); + config.layerConfig.set_size(5); // output size + config.layerConfig.set_cos_scale(2.0); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "cos_vm", 100, false, useGpu); + } +} + +void testDepthwiseConvLayer(const string& type, bool useGpu) { + TestConfig config; + config.biasSize = 32; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(32); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(3); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(16); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_img_size_y(8); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "depthwise_conv", 100, false, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02); +} + +TEST(Layer, depthwiseConvLayer) { + // 'depthwise_conv' is a sepecial case of 'exconv' whose + // groups size equals to the input channels size. 
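// (testDepthwiseConvLayer above sets channels == groups == 16, so
// filter_channels is 1 and each group convolves exactly one input channel,
// which is what makes the convolution depthwise.)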
+ testDepthwiseConvLayer("exconv", /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + testDepthwiseConvLayer("exconv", /* useGpu= */ true); +#endif +} + +void testConvLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + int dilation = 2; + if (type == "cudnn_conv") { +#if CUDNN_VERSION >= 6000 + dilation = 2; +#else + dilation = 1; +#endif + } + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(2); + conv->set_channels(3); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_dilation(dilation); + conv->set_dilation_y(dilation); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_img_size_y(16); + conv->set_output_x(outputSize(conv->img_size(), + (conv->filter_size() - 1) * dilation + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + (conv->filter_size_y() - 1) * dilation + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "conv", 100, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, convLayer) { + testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true); + testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true); +#endif +} + +void testConvTransLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + config.biasSize = 3; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(3); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + + config.layerConfig.set_size(conv->img_size() * conv->img_size() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "convTrans", 100, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, convTransLayer) { + for (auto useGpu : {false, true}) { + testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); + } +#ifdef PADDLE_WITH_CUDA + testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); +#endif +} + +TEST(Layer, blockExpandLayer) { + TestConfig config; + config.biasSize = 0; + 
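// blockexpand behaves like im2col: it slides a block_x by block_y window over
// the input image and emits one sequence element per window position, so the
// layer size set below is block_x * block_y * channels for each position.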
config.layerConfig.set_type("blockexpand"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + BlockExpandConfig* blockExpand = input->mutable_block_expand_conf(); + blockExpand->set_img_size_x(64); + blockExpand->set_img_size_y(32); + blockExpand->set_channels(3); + blockExpand->set_padding_x(0); + blockExpand->set_padding_y(0); + blockExpand->set_block_x(4); + blockExpand->set_block_y(32); + blockExpand->set_stride_x(2); + blockExpand->set_stride_y(2); + blockExpand->set_output_x(outputSize(blockExpand->img_size_x(), + blockExpand->block_x(), + blockExpand->padding_x(), + blockExpand->stride_x(), + /* caffeMode */ false)); + blockExpand->set_output_y(outputSize(blockExpand->img_size_y(), + blockExpand->block_y(), + blockExpand->padding_y(), + blockExpand->stride_y(), + /* caffeMode */ false)); + config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() * + blockExpand->channels()); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "blockexpand", 100, false, useGpu); + } +} + +TEST(Layer, maxoutLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("maxout"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + MaxOutConfig* maxout = input->mutable_maxout_conf(); + ImageConfig* image = maxout->mutable_image_conf(); + + image->set_img_size(32); + image->set_img_size_y(32); + image->set_channels(4); + maxout->set_groups(2); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "maxout", 10, false, useGpu); + } +} + +void testFcLayer(string format, size_t nnz) { + TestConfig config; + config.biasSize = 1024; + config.layerConfig.set_type("fc"); + config.layerConfig.set_size(1024); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_drop_rate(0.1); + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)}); + config.layerConfig.add_inputs(); + + LOG(INFO) << config.inputDefs[0].sparse.sparse << " " + << config.inputDefs[0].sparse.format; + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "fc", + 100, + /* trans */ false, + useGpu, + /* weight */ true); + } +} + +TEST(Layer, fcLayer) { + testFcLayer("", 1024 * 1024 * 2); + testFcLayer("csc", 1024 * 10); + testFcLayer("csr", 1024 * 10); +} + +TEST(Layer, SelectiveFullyConnectedLayer) { + TestConfig config; + size_t nin = 16; + size_t nout = 256; + config.layerConfig.set_type("selective_fc"); + config.layerConfig.set_size(nout); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_has_selected_colums(true); + config.layerConfig.set_selective_fc_pass_generation(false); + config.biasSize = nout; + + config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back( + {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)}); + config.layerConfig.add_inputs(); + + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ false, + false); +#ifdef PADDLE_WITH_CUDA + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ true, + false); +#endif +} + +TEST(Layer, DataNormLayer) { + TestConfig config; + config.layerConfig.set_type("data_norm"); + config.layerConfig.set_size(20); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100}); + config.inputDefs.back().isStatic = true; 
+ config.layerConfig.add_inputs(); + + for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) { + config.layerConfig.set_data_norm_strategy(strategy); + // The parameters are static, so not support GPU now + testLayerGrad(config, + "data_norm", + 200, + /* trans */ false, + /* useGpu */ false); + } +} + +TEST(Layer, hsigmoidLayer) { + TestConfig config; + config.layerConfig.set_type("hsigmoid"); + config.layerConfig.set_num_classes(5); + config.layerConfig.set_size(1); + config.biasSize = config.layerConfig.num_classes() - 1; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "hsigmoid", + 100, + /* trans */ false, + /* useGpu */ useGpu); + } +} + +TEST(Layer, multi_cross) { + TestConfig config; + config.layerConfig.set_type("multi-class-cross-entropy"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad( + config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu); + } +} + +TEST(Layer, multi_binary_label_sparse_mat) { + TestConfig config; + config.layerConfig.set_type("multi_binary_label_cross_entropy"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "multi_binary_label_cross_entropy", + 100, + /* trans */ false, + useGpu); + } +} + +TEST(layer, multi_binary_label_id) { + TestConfig config; + config.layerConfig.set_type("multi_binary_label_cross_entropy"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "multi_binary_label_cross_entropy", + 100, + /* trans */ false, + useGpu); + } +} + +TEST(Layer, multi_cross_with_selfnorm) { + TestConfig config; + config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm"); + config.layerConfig.set_softmax_selfnorm_alpha(0.1); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + // Not support GPU now + testLayerGrad(config, + "multi_class_cross_entropy_with_selfnorm", + 100, + /* trans */ false, + /* useGpu */ false); +} + +TEST(Layer, multi_cross_soft) { + TestConfig config; + config.layerConfig.set_type("soft_binary_class_cross_entropy"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "soft_binary_class_cross_entropy", + 100, + /* trans */ false, + useGpu); + } +} + +TEST(Layer, square_error) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + + 
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu); + } +} + +TEST(Layer, sparse_square_error) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + // "GpuSparseMatrix" as label is not supported + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, + /* useGpu */ false); +} + +TEST(Layer, sparse_float_square_error) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + // "GpuSparseMatrix" as label is not supported + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, + /* useGpu */ false); +} + +TEST(Layer, square_error_weighted) { + TestConfig config; + config.layerConfig.set_type("square_error"); + config.biasSize = 0; + config.testAccumulate = false; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu); + } +} + +TEST(Layer, huber_regression_loss) { + TestConfig config; + config.layerConfig.set_type("huber_regression"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto delta : {1, 3, 5}) { + config.layerConfig.set_delta(delta); + testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu); + } + } +} + +TEST(Layer, huber_two_class) { + TestConfig config; + config.layerConfig.set_type("huber_classification"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu); + } +} + +void testExpandLayer(string trans_type, bool hasSubseq) { + TestConfig config; + config.layerConfig.set_type("expand"); + + config.inputDefs.push_back( + {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 10, + 0}); + config.inputDefs.push_back( + {hasSubseq ? 
INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_1", + 10, + 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.set_trans_type(trans_type); + LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq; + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "expand", 30, false, useGpu); + } +} + +TEST(Layer, ExpandLayer) { + testExpandLayer("non-seq", false); // non-seq expand to seq + testExpandLayer("non-seq", true); // non-seq expand to hasSubseq + testExpandLayer("seq", true); // seq expand to hasSubseq +} + +void testDegradeLayer(bool hasSubseq, + string layer_type, + string trans_type, + int stride) { + TestConfig config; + config.layerConfig.set_type(layer_type); + config.layerConfig.set_size(10); + config.layerConfig.set_seq_pool_stride(stride); + config.biasSize = 0; + + config.inputDefs.push_back( + {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 10, + 0}); + config.layerConfig.add_inputs(); + config.layerConfig.set_trans_type(trans_type); + + auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) { + for (auto useGpu : {false, true}) { + testLayerGrad(config, layer_type, 100, false, useGpu); + } + }; + + if (layer_type == "average") { + for (auto strategy : {"average", "sum", "squarerootn"}) { + LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type + << " average_strategy=" << strategy + << " seq_pool_stride=" << stride; + config.layerConfig.set_average_strategy(strategy); + testDegradeLayerGrad(config, layer_type); + } + } else { + LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type + << " seq_pool_stride=" << stride; + testDegradeLayerGrad(config, layer_type); + } +} + +TEST(Layer, MaxLayer) { + testDegradeLayer(false, "max", "non-seq", -1); // seq max to non-seq + testDegradeLayer(false, + "max", + "non-seq", + 5); // seq max to a shorten seq, stride window = 5 + testDegradeLayer(true, "max", "non-seq", -1); // hasSubseq max to non-seq + testDegradeLayer(true, "max", "seq", -1); // hasSubseq max to seq +} + +TEST(Layer, SequenceLastInstanceLayer) { + testDegradeLayer(false, + "seqlastins", + "non-seq", + -1); // seq seqlastins to non-seq + testDegradeLayer(false, + "seqlastins", + "non-seq", + 5); // seq seqlastins to a shorten seq, stride window = 5 + testDegradeLayer(true, + "seqlastins", + "non-seq", + -1); // hasSubseq seqlastins to non-seq + testDegradeLayer(true, + "seqlastins", + "seq", + -1); // hasSubseq seqlastins to seq +} + +TEST(Layer, AverageLayer) { + testDegradeLayer(false, "average", "non-seq", -1); // seq average to non-seq + testDegradeLayer(false, + "average", + "non-seq", + 5); // seq average to a shorten seq, stride window = 5 + testDegradeLayer(true, + "average", + "non-seq", + -1); // hasSubseq average to non-seq + testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq +} + +TEST(Layer, SequenceConcatLayer) { + TestConfig config; + config.layerConfig.set_type("seqconcat"); + config.layerConfig.set_size(10); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "seqconcat", 100, false, useGpu); + } +} + +TEST(Layer, SequenceReshapeLayer) { + TestConfig config; + config.layerConfig.set_type("seqreshape"); + 
config.layerConfig.set_size(10); + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "seqreshape", 100, false, useGpu); + } +} + +TEST(Layer, ConvShiftLayer) { + TestConfig config; + config.layerConfig.set_type("conv_shift"); + config.layerConfig.set_size(10); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + // Not support GPU now + testLayerGrad(config, "conv_shift", 100, false, false); +} + +TEST(Layer, PowerLayer) { + TestConfig config; + config.layerConfig.set_type("power"); + config.layerConfig.set_size(10); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "power", 100, false, useGpu); + } +} + +TEST(Layer, ConvexCombinationLayer) { + TestConfig config; + config.layerConfig.set_type("convex_comb"); + config.layerConfig.set_size(20); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "convex_comb", 100, false, useGpu); + } +} + +TEST(Layer, InterpolationLayer) { + TestConfig config; + config.layerConfig.set_type("interpolation"); + config.layerConfig.set_size(10); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "interpolation", 100, false, useGpu); + } +} + +TEST(Layer, DotProdLayer) { + TestConfig config; + config.layerConfig.set_type("dot_prod"); + config.layerConfig.set_size(1); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "dot_prod", 10, false, useGpu); + } +} + +TEST(Layer, OuterProdLayer) { + TestConfig config; + config.layerConfig.set_type("out_prod"); + config.layerConfig.set_size(100); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "out_prod", 100, false, useGpu); + } +} + +TEST(Layer, SlopeInterceptLayer) { + TestConfig config; + config.layerConfig.set_type("slope_intercept"); + config.layerConfig.set_size(10); + config.layerConfig.set_slope(1.0); + config.layerConfig.set_intercept(0.1); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "slope_intercept", 100, false, useGpu); + } +} + +TEST(Layer, ScalingLayer) { + TestConfig config; + config.layerConfig.set_type("scaling"); + config.layerConfig.set_size(10); + 
config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "scaling", 100, false, useGpu); + } +} + +void testNormLayer(const string& normType, bool trans, bool useGpu) { + TestConfig config; + config.layerConfig.set_type("norm"); + config.layerConfig.set_active_type("relu"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type(normType); + norm->set_channels(16); + norm->set_size(5); + norm->set_scale(0.001); + norm->set_pow(0.75); + norm->set_blocked(0); + norm->set_img_size(14); + norm->set_img_size_y(7); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + if (norm->norm_type() == "cmrnorm" || + norm->norm_type() == "cmrnorm-projection") { + norm->set_scale(norm->scale() / norm->size()); + } else { + norm->set_scale(norm->scale() / (norm->size() * norm->size())); + } + + config.layerConfig.set_size(norm->output_x() * norm->output_y() * + norm->channels()); + config.biasSize = 0; + + testLayerGrad(config, "norm", 100, trans, useGpu); +} + +TEST(Layer, NormLayer) { + testNormLayer("cmrnorm-projection", + /* trans= */ false, /* useGpu= */ + true); + testNormLayer("cmrnorm-projection", + /* trans= */ false, /* useGpu= */ + false); +} + +void setPoolConfig(TestConfig* config, + PoolConfig* pool, + const string& poolType) { + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool"); + (*config).layerConfig.set_num_filters(16); + + int kw = 3, kh = 3; + int pw = 0, ph = 0; + int sw = 2, sh = 2; + pool->set_pool_type(poolType); + pool->set_channels(16); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_start(0); + pool->set_padding(pw); + pool->set_padding_y(ph); + pool->set_stride(sw); + pool->set_stride_y(sh); + + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); +} + +void testPoolLayer(const string& poolType, + bool trans, + bool useGpu, + bool excludeMode = true) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_img_size(14); + pool->set_img_size_y(14); + pool->set_exclude_mode(excludeMode); + setPoolConfig(&config, pool, poolType); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + testLayerGrad(config, "pool", 100, trans, useGpu); +} + +#ifdef PADDLE_WITH_CUDA +void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_size_y(4); + pool->set_stride_y(3); + pool->set_img_size(10); + pool->set_img_size_y(20); + setPoolConfig(&config, pool, poolType); + pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) / + ((float)pool->stride_y()) + + 1.5); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + testLayerGrad(config, "pool", 100, trans, useGpu); +} +#endif 
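+
+// Note on the sizes used above: each pool test's input dimension is
+// channels * img_size_y * img_size as configured in setPoolConfig /
+// testPoolLayer (16 * 14 * 14 = 3136) and testPoolLayer2 (16 * 20 * 10 = 3200),
+// and the layer size is output_x * output_y * channels. In testPoolLayer2 the
+// earlier set_size_y(4) / set_stride_y(3) are overwritten by setPoolConfig
+// (size_y = 3, stride_y = 2), so the explicit output_y expression evaluates to
+// (20 - 0 - 3) / 2.0 + 1.5 = 10 before truncation to int.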
+ +TEST(Layer, PoolLayer) { + testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ false, + /* excludeMode= */ false); + testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); + testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false); + +#ifdef PADDLE_WITH_CUDA + testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ true, + /* excludeMode= */ false); + testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true); + testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2("cudnn-avg-incl-pad-pool", + /* trans= */ false, + /* useGpu= */ true); + testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); +#endif +} + +void setPool3DConfig(TestConfig* config, + PoolConfig* pool, + const string& poolType) { + // filter size + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 3; + const int FILTER_SIZE_Y = 3; + const int FILTER_SIZE_Z = 3; + const int CHANNELS = 16; + + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool3d"); + (*config).layerConfig.set_num_filters(NUM_FILTERS); + + int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z; + int pw = 0, ph = 0, pd = 0; + int sw = 2, sh = 2, sd = 2; + + pool->set_pool_type(poolType); + pool->set_pool_type("avg"); + pool->set_channels(CHANNELS); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_size_z(kd); + pool->set_padding(0); + pool->set_padding_y(0); + pool->set_padding_z(0); + pool->set_stride(sw); + pool->set_stride_y(sh); + pool->set_stride_z(sd); + pool->set_start(0); + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); + pool->set_output_z(od); +} + +void testPool3DLayer(const string& poolType, bool trans, bool useGpu) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + const int IMAGE_SIZE = 9; + const int IMAGE_SIZE_Y = 9; + const int IMAGE_SIZE_Z = 9; + + pool->set_img_size(IMAGE_SIZE); + pool->set_img_size_y(IMAGE_SIZE_Y); + pool->set_img_size_z(IMAGE_SIZE_Z); + + setPool3DConfig(&config, pool, poolType); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + testLayerGrad(config, "pool3d", 100, trans, useGpu); +} + +TEST(Layer, Pool3DLayer) { + testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false); + testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true); + testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true); +#endif +} + +void testSppLayer(const string& poolType, + const int pyramidHeight, + bool trans, + bool useGpu) { + TestConfig config; + config.layerConfig.set_type("spp"); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); + LayerInputConfig* input = 
config.layerConfig.add_inputs(); + SppConfig* sppConfig = input->mutable_spp_conf(); + sppConfig->set_pool_type(poolType); + sppConfig->set_pyramid_height(pyramidHeight); + ImageConfig* imageConfig = sppConfig->mutable_image_conf(); + imageConfig->set_channels(16); + imageConfig->set_img_size(10); + imageConfig->set_img_size_y(20); + int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); + config.layerConfig.set_size(outputSize * imageConfig->channels()); + testLayerGrad(config, "spp", 100, trans, useGpu); +} + +TEST(Layer, SpatialPyramidPoolLayer) { + for (auto useGpu : {false, true}) { + for (auto pyramidHeight : {1, 2, 3}) { + testSppLayer("avg-projection", pyramidHeight, false, useGpu); + testSppLayer("max-projection", pyramidHeight, false, useGpu); + } + } +} + +TEST(Layer, rankCostLayer) { + TestConfig config; + config.layerConfig.set_type("rank-cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "rank-cost", 100, false, useGpu); + } +} + +TEST(Layer, sumCostLayer) { + TestConfig config; + config.layerConfig.set_type("sum_cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "sum_cost", 100, false, useGpu); + } +} + +TEST(Layer, weightedRankCostLayer) { + TestConfig config; + config.layerConfig.set_type("rank-cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu); + } +} + +TEST(Layer, TensorLayer) { + TestConfig config; + config.layerConfig.set_type("tensor"); + config.layerConfig.set_size(10); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = config.layerConfig.size(); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "tensor", 100, false, useGpu); + } +} + +TEST(Layer, RecurrentLayer) { + TestConfig config; + config.layerConfig.set_type("recurrent"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("tanh"); + config.biasSize = 4; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto reversed : {false, true}) { + config.layerConfig.set_reversed(reversed); + config.testState = !reversed; + testLayerGrad( + config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0); + } + } +} + +TEST(Layer, LstmLayer) { + TestConfig config; + config.layerConfig.set_type("lstmemory"); + config.layerConfig.set_size(4); + 
config.layerConfig.set_active_type("tanh"); + config.layerConfig.set_active_state_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 28; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto reversed : {false, true}) { + config.layerConfig.set_reversed(reversed); + config.testState = !reversed; + testLayerGrad( + config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02); + } + } + for (auto useGpu : {true}) { + config.testBatchState = true; + config.layerConfig.set_reversed(false); + testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu); + } +} + +TEST(Layer, MDLstmLayer) { + TestConfig config; + config.layerConfig.set_type("mdlstmemory"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_state_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 4 * 9; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5}); + config.layerConfig.add_inputs(); + config.layerConfig.add_directions(true); + config.layerConfig.add_directions(true); + + for (auto useGpu : {false, true}) { + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + config.layerConfig.set_directions(0, bool(i)); + config.layerConfig.set_directions(1, bool(j)); + testLayerGrad(config, "mdlstmemory", 100, false, useGpu); + } + } + } +} + +TEST(Layer, ParameterReluLayer) { + auto testParameterReluLayer = [&](size_t inputSize, size_t channels) { + TestConfig config; + config.layerConfig.set_type("prelu"); + config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels}); + config.layerConfig.add_inputs(); + config.layerConfig.set_size(inputSize); + config.layerConfig.set_partial_sum(inputSize / + channels); // size of feature map + for (auto useGpu : {false, true}) { + testLayerGrad(config, "prelu", 100, false, useGpu); + } + }; + + testParameterReluLayer(192, 1); + testParameterReluLayer(192, 3); + testParameterReluLayer(192, 192); +} + +TEST(Layer, ResizeLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("resize"); + config.layerConfig.set_size(64); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "resize", 100, false, useGpu); + } +} + +TEST(Layer, RotateLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("rotate"); + const int CHANNEL = 2; + const int HEIGHT = 8; + const int WIDTH = 4; + const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL; + config.layerConfig.set_size(INPUT_SIZE); + config.layerConfig.set_height(HEIGHT); + config.layerConfig.set_width(WIDTH); + config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "rotate", 100, false, useGpu); + } +} + +TEST(Layer, NCELayer) { + TestConfig config; + size_t numClasses = 4; + config.layerConfig.set_type("nce"); + config.layerConfig.set_size(1); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_num_classes(numClasses); + config.biasSize = numClasses; + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses}); + config.inputDefs.push_back( + {INPUT_LABEL, "label", /* dim= */ 
numClasses, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto withWeight : {false, true}) { + if (withWeight) { + config.inputDefs.push_back( + {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + } + + for (auto isIdLabel : {false, true}) { + config.inputDefs[1] = { + isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, + "label", + /* dim= */ numClasses, + /* paraSize= */ 0}; + + for (auto withDist : {false, true}) { + config.layerConfig.clear_neg_sampling_dist(); + if (withDist) { + double sum = 0; + for (size_t i = 0; i < numClasses; ++i) { + real p = rand(); // NOLINT use rand_r + config.layerConfig.add_neg_sampling_dist(p); + sum += p; + } + for (size_t i = 0; i < numClasses; ++i) { + real p = config.layerConfig.neg_sampling_dist(i) / sum; + config.layerConfig.set_neg_sampling_dist(i, p); + } + } + LOG(INFO) << "NCELayer " + << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight + << " withDist=" << withDist; + // Not support GPU now + testLayerGrad(config, + "nce", + 100, + /* trans= */ false, + /* useGpu */ false); + } + } + } +} + +TEST(Layer, GatedRecurrentLayer) { + TestConfig config; + config.layerConfig.set_type("gated_recurrent"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 12; + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + for (auto reversed : {false, true}) { + config.layerConfig.set_reversed(reversed); + config.testState = !reversed; + testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu); + } + } +} + +TEST(Layer, GruStepLayer) { + TestConfig config; + config.layerConfig.set_type("gru_step"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 12; + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu); + } +} + +TEST(Layer, LstmStepLayer) { + TestConfig config; + config.layerConfig.set_type("lstm_step"); + config.layerConfig.set_size(4); + config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_state_type("sigmoid"); + config.layerConfig.set_active_gate_type("sigmoid"); + config.biasSize = 12; + config.testAccumulate = false; + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu); + } +} + +void testBatchNormLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + const int CHANNELS = 10; + const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y; + config.layerConfig.set_type(type); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = CHANNELS; + 
config.inputDefs.push_back({INPUT_DATA, + "layer_0", + /* dim= */ size, + /* paraSize= */ CHANNELS}); + + config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + + LayerInputConfig* input = config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(CHANNELS); + img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); + + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, + /* useWeight */ true); +} + +TEST(Layer, BatchNormalizationLayer) { + testBatchNormLayer("batch_norm", false, false); +#ifdef PADDLE_WITH_CUDA + testBatchNormLayer("batch_norm", false, true); + if (hl_get_cudnn_lib_version() >= int(4000)) { + testBatchNormLayer("cudnn_batch_norm", false, true); + } +#endif +} + +void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + const int CHANNELS = 10; + const int IMG_SIZE = 16; + const int IMG_SIZE_Y = 8; + const int IMG_SIZE_Z = 8; + size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z; + config.layerConfig.set_type(type); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + config.biasSize = CHANNELS; + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + /* dim= */ size, + /* paraSize= */ CHANNELS}); + + config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS}); + config.inputDefs.back().isStatic = true; + + LayerInputConfig* input = config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(CHANNELS); + img_conf->set_img_size(IMG_SIZE); + img_conf->set_img_size_y(IMG_SIZE_Y); + img_conf->set_img_size_z(IMG_SIZE_Z); + + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, + /* useWeight */ true); +} + +TEST(Layer, testBatchNorm3DLayer) { + testBatchNorm3DLayer("batch_norm", false, false); +#ifdef PADDLE_WITH_CUDA + testBatchNorm3DLayer("batch_norm", false, true); + if (hl_get_cudnn_lib_version() >= int(4000)) { + testBatchNorm3DLayer("cudnn_batch_norm", false, true); + } +#endif +} + +void testConvOperator(bool isDeconv) { + TestConfig config; + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 3; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + const int IMAGE_SIZE_Y = 9; + OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); + if (isDeconv) { + operatorConf.set_type("convt"); + } else { + operatorConf.set_type("conv"); + } + ConvConfig* conv = operatorConf.mutable_conv_conf(); + operatorConf.set_num_filters(NUM_FILTERS); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_img_size(IMAGE_SIZE); + conv->set_img_size_y(IMAGE_SIZE_Y); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + 
conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + conv->output_x() * conv->output_y() * CHANNELS, + 0}); + config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + NUM_FILTERS); + } + + config.inputDefs.push_back( + {INPUT_DATA, + "layer_1", + FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, + 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false); +} + +TEST(Operator, conv) { + testConvOperator(/*isDeconv*/ true); + testConvOperator(/*isDeconv*/ false); +} + +TEST(Layer, FeatureMapExpandLayer) { + TestConfig config; + config.layerConfig.set_type("featmap_expand"); + const int CHANNELS = 10; + const int INPUT_SIZE = 100; + config.layerConfig.set_size(INPUT_SIZE * CHANNELS); + config.layerConfig.set_num_filters(CHANNELS); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, + "layer_0", + /* dim= */ INPUT_SIZE, + /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + for (auto asRowVec : {false, true}) { + config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec"); + testLayerGrad(config, + "featmap_expand", + /*batch_size*/ 100, + /* trans= */ false, + useGpu, + /* useWeight */ true); + } + } +} + +TEST(Layer, MultiplexLayer) { + TestConfig config; + const int LAYER_SIZE = 100; + config.layerConfig.set_type("multiplex"); + config.layerConfig.set_size(LAYER_SIZE); + + config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0}); + config.inputDefs.push_back( + {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu); + } +} + +TEST(Layer, PadLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("pad"); + + int c = 4; + int h = 31; + int w = 36; + size_t size = c * h * w; + config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PadConfig* pad = input->mutable_pad_conf(); + ImageConfig* image = pad->mutable_image_conf(); + + image->set_channels(c); + image->set_img_size(h); + image->set_img_size_y(w); + pad->add_pad_c(1); + pad->add_pad_c(2); + pad->add_pad_h(2); + pad->add_pad_h(3); + pad->add_pad_w(3); + pad->add_pad_w(5); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "pad", 10, false, useGpu); + } +} + +TEST(Layer, CrossChannelNormLayer) { + TestConfig config; + config.paramInitialMean = 1.; + config.paramInitialStd = 0.; + config.layerConfig.set_type("norm"); + config.layerConfig.set_size(100); + LayerInputConfig* input = config.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cross-channel-norm"); + norm->set_channels(10); + norm->set_size(100); + norm->set_scale(0); + 
norm->set_pow(0); + norm->set_blocked(0); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false); + } +} + +TEST(Layer, smooth_l1) { + TestConfig config; + config.layerConfig.set_type("smooth_l1"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "smooth_l1", 100, false, useGpu, false); + } +} + +TEST(Layer, multibox_loss) { + TestConfig config; + config.layerConfig.set_type("multibox_loss"); + config.biasSize = 0; + LayerInputConfig* input = config.layerConfig.add_inputs(); + MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf(); + multiboxLoss->set_num_classes(21); + multiboxLoss->set_input_num(1); + multiboxLoss->set_overlap_threshold(0.5); + multiboxLoss->set_neg_pos_ratio(3); + multiboxLoss->set_neg_overlap(0.5); + multiboxLoss->set_background_id(0); + multiboxLoss->set_height(3); + multiboxLoss->set_width(3); + + size_t gtNum = 1; + MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false); + labelValue->randomizeUniform(); + labelValue->add(-0.5); + labelValue->sigmoid(*labelValue); + real* labelData = labelValue->getData(); + size_t labelWidth = labelValue->getWidth(); + for (size_t i = 0; i < gtNum; ++i) { + *(labelData + i * labelWidth) = std::rand() % 20 + 1; + *(labelData + i * labelWidth + 1) = 0.400259; + *(labelData + i * labelWidth + 2) = 0.377857; + *(labelData + i * labelWidth + 3) = 0.525712; + *(labelData + i * labelWidth + 4) = 0.519368; + } + vector seqStartPositions(gtNum + 1, 0); + for (size_t i = 1; i <= gtNum; ++i) { + seqStartPositions[i] = i; + } + + // Ensure at lease one matched bbox + MatrixPtr priorValue = Matrix::create(1, 72, false, false); + priorValue->randomizeUniform(); + priorValue->add(-0.5); + priorValue->sigmoid(*priorValue); + real* priorData = priorValue->getData(); + *(priorData) = 0.424811; + *(priorData + 1) = 0.397059; + *(priorData + 2) = 0.538905; + *(priorData + 3) = 0.447091; + *(priorData + 4) = 0.425720; + *(priorData + 5) = 0.515228; + *(priorData + 6) = 0.519452; + *(priorData + 7) = 0.591065; + + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}}); + config.inputDefs.push_back( + {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions}); + config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0}); + config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "multibox_loss", 1, false, useGpu, false); + } +} + +TEST(Layer, TransLayer) { + TestConfig config; + const int height = 128; + const int width = 256; + config.layerConfig.set_type("trans"); + config.layerConfig.set_size(width); + + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "trans", height, /* trans= */ false, useGpu); + } +} + +TEST(Layer, RowConvLayer) { + const int context = 3; + const int size = 512; + + TestConfig config; + config.layerConfig.set_type("row_conv"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + + 
config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, CropLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  config.layerConfig.set_axis(2);
+  config.layerConfig.add_offset(0);
+  config.layerConfig.add_offset(0);
+
+  // config input_1
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
+  input = config.layerConfig.add_inputs();
+  img = input->mutable_image_conf();
+  img->set_channels(2);
+  img->set_img_size(8);
+
+  // config crop layer
+  config.layerConfig.set_type("crop");
+  config.layerConfig.set_name("cropLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "crop", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, roi_pool) {
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+  roiPoolConf->set_pooled_width(7);
+  roiPoolConf->set_pooled_height(7);
+  roiPoolConf->set_spatial_scale(1. / 16);
+  roiPoolConf->set_width(14);
+  roiPoolConf->set_height(14);
+
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roiData = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i) {
+    roiData[i * roiDim + 0] = std::rand() % batchSize;
+    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
+    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
+    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, SwitchOrderLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  img->set_img_size_y(16);
+
+  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
+  reshape->add_height_axis(0);
+  reshape->add_height_axis(1);
+  reshape->add_height_axis(2);
+  reshape->add_width_axis(3);
+
+  // config switch_order layer
+  config.layerConfig.set_type("switch_order");
+  config.layerConfig.set_name("switchOrderLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
+  }
+}
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+TEST(Layer, SubNestedSequenceLayer) {
+  // layer size is not crucial for this layer,
+  // so use a small layer size in unittest
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
+TEST(Layer, ClipLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("clip");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ClipConfig* layerConf = input->mutable_clip_conf();
+  double p1 = std::rand() / (double)RAND_MAX;
+  double p2 = std::rand() / (double)RAND_MAX;
+  layerConf->set_min(std::min(p1, p2));
+  layerConf->set_max(std::max(p1, p2));
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
+void test3DConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+
config.layerConfig.set_shared_biases(true); + + // Setting up conv3D-trans layer + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + + conv->set_channels(CHANNELS); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_filter_size_z(FILTER_SIZE_Z); + conv->set_padding(0); + conv->set_padding_y(0); + conv->set_padding_z(0); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_stride_z(2); + conv->set_img_size(IMAGE_SIZE); + conv->set_img_size_y(IMAGE_SIZE_Y); + conv->set_img_size_z(IMAGE_SIZE_Z); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + conv->set_output_y(outputSize(conv->img_size_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true)); + conv->set_output_z(outputSize(conv->img_size_z(), + conv->filter_size_z(), + conv->padding_z(), + conv->stride_z(), + /* caffeMode */ true)); + + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + conv->output_z() * NUM_FILTERS); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + config.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z, + conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z * + NUM_FILTERS}); + + testLayerGrad(config, "conv3D", 10, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, test3DConvLayer) { + test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true); +#endif +} + +void test3DDeConvLayer(const string& type, bool trans, bool useGpu) { + // filter size + const int NUM_FILTERS = 6; + // const int CHANNELS = 3; + const int FILTER_SIZE = 3; + const int FILTER_SIZE_Y = 3; + const int FILTER_SIZE_Z = 3; + + // input image + const int CHANNELS = 3; + const int IMAGE_SIZE = 4; + const int IMAGE_SIZE_Y = 6; + const int IMAGE_SIZE_Z = 6; + + // Setting up conv-trans layer + TestConfig config; + config.biasSize = NUM_FILTERS; + config.layerConfig.set_type("deconv3d"); + config.layerConfig.set_num_filters(NUM_FILTERS); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + + conv->set_channels(CHANNELS); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_filter_size_z(FILTER_SIZE_Z); + conv->set_padding(0); + conv->set_padding_y(0); + conv->set_padding_z(0); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_stride_z(2); + conv->set_output_x(IMAGE_SIZE); + conv->set_output_y(IMAGE_SIZE_Y); + conv->set_output_z(IMAGE_SIZE_Z); + + conv->set_img_size(imageSize(conv->output_x(), + conv->filter_size(), + conv->padding(), + conv->stride(), + true)); + conv->set_img_size_y(imageSize(conv->output_y(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + true)); + conv->set_img_size_z(imageSize(conv->output_z(), + conv->filter_size_z(), + conv->padding_z(), + conv->stride_z(), + true)); + config.layerConfig.set_size(conv->img_size() * conv->img_size_y() * + conv->img_size_z() * NUM_FILTERS); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); 
+ config.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z, + conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z * + NUM_FILTERS}); + + testLayerGrad(config, "deconv3D", 10, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, test3DDeConvLayer) { + test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false); +#ifdef PADDLE_WITH_CUDA + test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true); +#endif +} + +TEST(Layer, ScaleShiftLayer) { + // FIXME: Disable ScaleShiftLayer because it is not stable. + // https://github.com/PaddlePaddle/Paddle/issues/7781 + return; + // const size_t batchSize = 16; + // const size_t size = 32; + // TestConfig config; + // config.layerConfig.set_type("scale_shift"); + // config.layerConfig.set_size(size); + // config.biasSize = 1; + // config.inputDefs.push_back( + // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); + // config.layerConfig.add_inputs(); + // for (auto useGpu : {false, true}) { + // testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); + // } +} + +TEST(Layer, ScaleSubRegionLayer) { + const size_t batchSize = 64; + const size_t size = 4096; + TestConfig config; + config.layerConfig.set_type("scale_sub_region"); + config.inputDefs.push_back({INPUT_DATA, "input", size, 0}); + MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false); + auto* data = indicesV->getData(); + for (size_t i = 0; i < batchSize; ++i) { + data[i * 2] = 2; + data[i * 2 + 1] = 4; + data[i * 2 + 2] = 16; + data[i * 2 + 3] = 32; + data[i * 2 + 4] = 16; + data[i * 2 + 5] = 32; + } + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ScaleSubRegionConfig* scaleSubRegionConf = + input->mutable_scale_sub_region_conf(); + ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf(); + imgConf->set_img_size(32); + imgConf->set_img_size_y(32); + imgConf->set_channels(4); + scaleSubRegionConf->set_value(2.0); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false); + } +} + +TEST(Layer, L2DistanceLayer) { + TestConfig config; + config.layerConfig.set_type("l2_distance"); + config.layerConfig.set_size(1); + config.biasSize = 0; + + const size_t input_dim = 27; + const size_t batch_size = 11; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0}); + config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "l2_distance", batch_size, false, useGpu); + } +} + +void testFactorizationMachineLayer(InputType type, bool useGpu) { + const int FACTOR_SIZE = 10; + TestConfig config; + config.layerConfig.set_type("factorization_machine"); + config.layerConfig.set_factor_size(FACTOR_SIZE); + config.layerConfig.set_size(1); + config.biasSize = 0; + config.inputDefs.push_back({type, "layer_0", 128, 1280}); + config.layerConfig.add_inputs(); + testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); +} + +TEST(Layer, FactorizationMachineLayer) { + for (auto useGpu : {false, true}) { + testFactorizationMachineLayer(INPUT_DATA, useGpu); + } + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, 
false);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7082c1363a4cdadfd0e4a4497c20ae5c513bc7f1
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
+#include "paddle/legacy/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static inline bool getNextSequence(vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+TEST(LinearChainCRF, decoding) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+      vector<int> decodingResult(length);
+      vector<int> bestResult(length);
+      vector<int> testResult(length, 0);
+      crf.decode(x.getData(), &decodingResult[0], length);
+      real bestScore = -std::numeric_limits<real>::max();
+      do {
+        real score = a[testResult.front()] + b[testResult.back()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        if (score > bestScore) {
+          bestScore = score;
+          bestResult = testResult;
+        }
+      } while (getNextSequence(testResult, numClasses));
+      for (int k = 0; k < length; ++k) {
+        EXPECT_EQ(decodingResult[k], bestResult[k]);
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/gserver/tests/test_MKLDNN.cpp b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c79ccd1956c5c68e5c97c2a185230b8ea9c3dea0
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
@@ -0,0 +1,448 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include +#include +#include +#include "MKLDNNTester.h" +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/activations/MKLDNNActivation.h" +#include "paddle/legacy/math/MathUtils.h" + +using namespace paddle; // NOLINT + +DECLARE_bool(thread_local_rand_use_global_seed); +DECLARE_bool(use_gpu); +DECLARE_bool(use_mkldnn); + +#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC) \ + MKLDNNTester tester; \ + for (auto bs : {DESC.bs, 1}) { \ + tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \ + } + +#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \ + TestConfig ref = DNN_CONFIG; \ + ref.layerConfig.set_type(REF_TYPE); \ + RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC) + +struct testFcDesc { + int bs; + int ic; + int ih, iw; // oh == ow == 1 + int oc; +}; + +static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) { + cfg.layerConfig.set_type("mkldnn_fc"); + cfg.layerConfig.set_active_type("relu"); + cfg.layerConfig.set_size(pm.oc); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)}); + cfg.layerConfig.add_inputs(); +} + +void testFcLayer(const testFcDesc& pm) { + TestConfig dnnConfig; + getMKLDNNFcConfig(dnnConfig, pm); + for (auto biasSize : {pm.oc, 0}) { + dnnConfig.biasSize = biasSize; + RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm) + } +} + +TEST(MKLDNNLayer, FcLayer) { + /* bs, ic, ih, iw, oc */ + testFcLayer({2, 2, 1, 1, 3}); + testFcLayer({3, 7, 1, 1, 19}); + testFcLayer({8, 16, 13, 13, 32}); + testFcLayer({4, 12, 13, 13, 18}); + testFcLayer({2, 64, 16, 16, 32}); + testFcLayer({15, 3, 16, 16, 6}); +} + +struct testConvDesc { + int bs, gp; + int ic, ih, iw; + int oc, oh, ow; + int fh, fw; + int ph, pw; + int sh, sw; + int dh, dw; +}; + +static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) { + cfg.layerConfig.set_type("mkldnn_conv"); + cfg.layerConfig.set_active_type("relu"); + cfg.layerConfig.set_num_filters(pm.oc); + cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow); + cfg.layerConfig.set_shared_biases(true); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_groups(pm.gp); + conv->set_img_size(pm.iw); + conv->set_img_size_y(pm.ih); + conv->set_output_x(pm.ow); + conv->set_output_y(pm.oh); + conv->set_filter_size(pm.fw); + conv->set_filter_size_y(pm.fh); + conv->set_channels(pm.ic); + conv->set_padding(pm.pw); + conv->set_padding_y(pm.ph); + conv->set_stride(pm.sw); + conv->set_stride_y(pm.sh); + conv->set_dilation(pm.dw); + conv->set_dilation_y(pm.dh); + conv->set_caffe_mode(true); + conv->set_filter_channels(conv->channels() / conv->groups()); + CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels()) + << "it is indivisible"; + + int fh = (pm.fh - 1) * pm.dh + 1; + int fw = (pm.fw - 1) * pm.dw + 1; + int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true); + int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true); + CHECK_EQ(ow, pm.ow) << "output size check failed"; + CHECK_EQ(oh, pm.oh) << "output size check failed"; +} + +void testConvLayer(const testConvDesc& pm) { + TestConfig dnnConfig; + getMKLDNNConvConfig(dnnConfig, pm); + for (auto biasSize : {pm.oc, 0}) { + dnnConfig.biasSize = biasSize; + RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm) + } 
+} + +TEST(MKLDNNLayer, ConvLayer) { + /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */ + testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1}); + testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1}); + testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1}); + // with groups + testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1}); + testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1}); +} + +struct testPoolDesc { + int bs, ic; // input channel and output channel are the same + int ih, iw; + int oh, ow; + int fh, fw; + int ph, pw; + int sh, sw; +}; + +static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) { + cfg.layerConfig.set_type("mkldnn_pool"); + cfg.layerConfig.set_active_type("relu"); + cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + pool->set_pool_type("avg-projection"); + pool->set_channels(pm.ic); + pool->set_img_size(pm.iw); + pool->set_img_size_y(pm.ih); + pool->set_output_x(pm.ow); + pool->set_output_y(pm.oh); + pool->set_size_x(pm.fw); + pool->set_size_y(pm.fh); + pool->set_padding(pm.pw); + pool->set_padding_y(pm.ph); + pool->set_stride(pm.sw); + pool->set_stride_y(pm.sh); + + int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false); + int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false); + CHECK_EQ(ow, pm.ow) << "output size check failed"; + CHECK_EQ(oh, pm.oh) << "output size check failed"; +} + +void testPoolLayer(const testPoolDesc& pm) { + TestConfig dnnConfig; + getMKLDNNPoolConfig(dnnConfig, pm); + LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0); + PoolConfig* pool = input->mutable_pool_conf(); + for (auto type : {"max-projection", "avg-projection"}) { + pool->set_pool_type(type); + RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm) + } +} + +TEST(MKLDNNLayer, PoolLayer) { + /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */ + testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2}); + testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2}); + testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2}); + testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2}); + testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2}); + testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1}); + testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1}); + testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2}); +} + +struct testBatchNormDesc { + int bs; + int ic; + int ih, iw; +}; + +static void getMKLDNNBatchNormConfig(TestConfig& cfg, + const testBatchNormDesc& pm) { + cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw); + cfg.layerConfig.set_type("mkldnn_batch_norm"); + cfg.biasSize = pm.ic; + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw), + /* size of weight= */ size_t(pm.ic)}); + cfg.inputDefs.push_back( + {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)}); + cfg.inputDefs.back().isStatic = true; + cfg.inputDefs.push_back({INPUT_DATA, 
"layer_2_moving_var", 1, size_t(pm.ic)}); + cfg.inputDefs.back().isStatic = true; + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + cfg.layerConfig.set_active_type("relu"); + cfg.layerConfig.add_inputs(); + cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(pm.ic); + img_conf->set_img_size_y(pm.ih); + img_conf->set_img_size(pm.iw); +} + +void testBatchNormLayer(const testBatchNormDesc& pm) { + TestConfig dnnConfig; + getMKLDNNBatchNormConfig(dnnConfig, pm); + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("batch_norm"); + // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1 + VLOG(MKLDNN_TESTS) << "check train phase"; + dnnConfig.layerConfig.set_use_global_stats(false); + refConfig.layerConfig.set_use_global_stats(false); + MKLDNNTester tester; + tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN); + // for PASS_TEST, check use_global_stats true and false, and batchsize 1 + VLOG(MKLDNN_TESTS) << "check test phase"; + for (auto useGS : {false, true}) { + dnnConfig.layerConfig.set_use_global_stats(useGS); + refConfig.layerConfig.set_use_global_stats(useGS); + MKLDNNTester tester; + for (auto bs : {pm.bs, 1}) { + tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST); + } + } +} + +TEST(MKLDNNLayer, BatchNormLayer) { + testBatchNormLayer({4, 10, 6, 6}); + testBatchNormLayer({16, 32, 16, 16}); + testBatchNormLayer({4, 16, 8, 10}); +} + +struct testLRNDesc { + int bs, ic, ih, iw; + float scale, pow; + int localSize; +}; + +void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { + cfg.layerConfig.set_type("mkldnn_lrn"); + cfg.layerConfig.set_active_type("relu"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_channels(pm.ic); + norm->set_size(pm.localSize); + norm->set_scale(pm.scale); + norm->set_pow(pm.pow); + norm->set_blocked(0); + norm->set_img_size(pm.iw); + norm->set_img_size_y(pm.ih); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + cfg.layerConfig.set_size(layerSize); + cfg.biasSize = 0; +} + +void testLRNLayer(const testLRNDesc& pm) { + TestConfig dnnConfig; + getMKLDNNLRNConfig(dnnConfig, pm); + // mkldnn_lrn <==> norm with cmrnorm-projection type + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("norm"); + LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cmrnorm-projection"); + norm->set_scale(norm->scale() / norm->size()); + RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) +} + +TEST(MKLDNNLayer, LRNLayer) { + testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); + testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); + testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); +} + +struct testImageDesc { + int bs, ic, ih, iw; +}; + +static void getAddtoConfig(TestConfig& cfg, + const testImageDesc& pm, + const size_t nInputs = 1) { + cfg.biasSize = 0; + cfg.layerConfig.set_type("addto"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.layerConfig.set_size(layerSize); + cfg.layerConfig.set_active_type("relu"); + for (size_t i = 0; i < nInputs; ++i) { + std::stringstream ss; + ss << "layer_" << i; + cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = 
input->mutable_image_conf(); + img_conf->set_channels(pm.ic); + img_conf->set_img_size_y(pm.ih); + img_conf->set_img_size(pm.iw); + } +} + +void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) { + CHECK_GE(nInputs, 1UL); + TestConfig dnnConfig; + getAddtoConfig(dnnConfig, pm, nInputs); + dnnConfig.layerConfig.set_type("mkldnn_addto"); + for (auto withBias : {false, true}) { + dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0; + RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm) + } +} + +TEST(MKLDNNLayer, AddtoLayer) { + testAddtoLayer({16, 5, 14, 14}, 1); + testAddtoLayer({8, 10, 8, 8}, 2); + testAddtoLayer({4, 12, 1, 1}, 3); +} + +static void getMKLDNNConcatConfig(TestConfig& cfg, + const std::vector& inputs) { + CHECK_GE(inputs.size(), 2UL) << "at least two inputs"; + int oc = inputs[0].ic; + for (size_t i = 1; i < inputs.size(); ++i) { + CHECK_EQ(inputs[i].bs, inputs[0].bs); + CHECK_EQ(inputs[i].ih, inputs[0].ih); + CHECK_EQ(inputs[i].iw, inputs[0].iw); + oc += inputs[i].ic; + } + cfg.biasSize = 0; + cfg.layerConfig.set_type("mkldnn_concat"); + cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw); + cfg.layerConfig.set_active_type("relu"); + for (size_t i = 0; i < inputs.size(); ++i) { + std::stringstream ss; + ss << "layer_" << i; + cfg.inputDefs.push_back( + {INPUT_DATA, + ss.str(), + (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw, + 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + ImageConfig* img_conf = input->mutable_image_conf(); + img_conf->set_channels(inputs[i].ic); + img_conf->set_img_size_y(inputs[i].ih); + img_conf->set_img_size(inputs[i].iw); + } +} + +void testConcatLayer(const std::vector& inputs) { + TestConfig dnnConfig; + getMKLDNNConcatConfig(dnnConfig, inputs); + RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0]) +} + +TEST(MKLDNNLayer, ConcatLayer) { + testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}}); + testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}}); +} + +void testActivation(std::string actType, const testImageDesc& pm) { + // TODO(TJ): remove me when paddle support elu activation + if (actType == "mkldnn_elu") { + return; + } + const std::string compareTypes[] = {actType, actType.erase(0, 7)}; + TestConfig cfg; + getAddtoConfig(cfg, pm); + TestConfig ref = cfg; + cfg.layerConfig.set_active_type(compareTypes[0]); + ref.layerConfig.set_active_type(compareTypes[1]); + RUN_MKLDNN_TEST(cfg, ref, pm) +} + +TEST(MKLDNNActivation, Activations) { + auto types = MKLDNNActivation::getAllRegisteredTypes(); + for (auto type : types) { + /* bs, c, h, w*/ + testActivation(type, {16, 64, 32, 32}); + testActivation(type, {2, 8, 1, 1}); + } +} + +DECLARE_string(config_args); +TEST(MKLDNNNet, net) { + std::vector cases = {"simple", "branch"}; + for (auto name : cases) { + std::string config = "./legacy/gserver/tests/mkldnn_" + name + "_net.conf"; + for (auto channels : {2, 32}) { + std::ostringstream oss; + oss << "channels=" << channels; + FLAGS_config_args = oss.str(); + MKLDNNTester::runNetTest(config); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + FLAGS_use_gpu = false; + FLAGS_use_mkldnn = true; + initMain(argc, argv); + initPython(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..2bc261b4a87ce7f1f4ce1c936ee4151d75e17f3f --- /dev/null +++ b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; + +void setPoolConfig(TestConfig* config, + PoolConfig* pool, + const string& poolType) { + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool"); + (*config).layerConfig.set_num_filters(1); + + int kw = 3, kh = 3; + int pw = 0, ph = 0; + int sw = 2, sh = 2; + pool->set_pool_type(poolType); + pool->set_channels(1); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_start(0); + pool->set_padding(pw); + pool->set_padding_y(ph); + pool->set_stride(sw); + pool->set_stride_y(sh); + + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); +} + +void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat, + const string& poolType, + bool use_gpu, + MatrixPtr& maskMat) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_img_size(5); + pool->set_img_size_y(5); + setPoolConfig(&config, pool, poolType); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + config.layerConfig.set_name("MaxPoolWithMask"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + + initDataLayer(config, + &dataLayers, + &datas, + &layerMap, + "MaxPoolWithMask", + 1, + false, + use_gpu); + + dataLayers[0]->getOutputValue()->copyFrom(*inputMat); + + FLAGS_use_gpu = use_gpu; + std::vector parameters; + LayerPtr maxPoolingWithMaskOutputLayer; + initTestLayer(config, &layerMap, ¶meters, &maxPoolingWithMaskOutputLayer); + maxPoolingWithMaskOutputLayer->forward(PASS_GC); + + checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value, + maskMat); +} + +TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { + bool useGpu = false; + MatrixPtr inputMat; + MatrixPtr maskMat; + real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1, + 0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8, + 0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0}; + real maskData[] = {12, 4, 22, 24}; + + inputMat = Matrix::create(1, 25, false, useGpu); + maskMat = Matrix::create(1, 4, false, useGpu); + inputMat->setData(inputData); + maskMat->setData(maskData); + doOneMaxPoolingWithMaskOutputTest( + inputMat, "max-pool-with-mask", useGpu, maskMat); +#ifdef PADDLE_WITH_CUDA + useGpu = true; + inputMat = Matrix::create(1, 25, false, useGpu); + maskMat = Matrix::create(1, 4, false, useGpu); + inputMat->copyFrom(inputData, 25); + maskMat->copyFrom(maskData, 4); + 
doOneMaxPoolingWithMaskOutputTest( + inputMat, "max-pool-with-mask", useGpu, maskMat); +#endif +} diff --git a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..25b1a1191d0100c8ee625d3f5f36d1513164b23b --- /dev/null +++ b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include + +#undef PADDLE_DISABLE_TIMER +#include "paddle/legacy/utils/Stat.h" + +#include "paddle/legacy/gserver/layers/MultinomialSampler.h" +#include "paddle/legacy/utils/Util.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +class MultinomialSamplerTester : public MultinomialSampler { + public: + MultinomialSamplerTester(real* prob, int size) + : MultinomialSampler(prob, size) {} + + template + int testGen(Rand1 rand1) { + return gen1(rand1); + } +}; + +TEST(MultinomialSampler, gen) { + int numGrids = 1024 * 1024; + int size = 1024 * 4; + default_random_engine reng; + + for (size_t iter = 0; iter < 256; ++iter) { + uniform_int_distribution rand(1, numGrids / size * 1.8); + vector prob; + int sum = 0; + for (int i = 0; i < size; ++i) { + prob.push_back(rand(reng)); + sum += prob.back(); + } + + CHECK_LE(sum, numGrids); + prob.back() += numGrids - sum; + + vector counts(size); + MultinomialSamplerTester sampler(&prob[0], size); + counts.assign(size, 0); + { + double s = (double)size / (double)numGrids; + REGISTER_TIMER("MultinomialSampler"); + for (double i = 0; i < numGrids; ++i) { + int ret = sampler.testGen([i, s]() { return s * i; }); + if (ret < 0 || ret >= size) { + EXPECT_GE(ret, 0); + EXPECT_LT(ret, size); + break; + } + ++counts[ret]; + } + } + for (int i = 0; i < size; ++i) { + if (prob[i] != counts[i]) { + EXPECT_EQ(prob[i], counts[i]); + LOG(INFO) << iter; + break; + } + } + } +} + +void benchmarkRandom() { + int n = 1024 * 1024; + + int sum; + double sum1; + + sum = 0; + unsigned int seed = 1; + { + REGISTER_TIMER("crand"); + for (int i = 0; i < n; ++i) { + sum += rand_r(&seed) % 1000; + } + } + LOG(INFO) << "sum=" << sum; + + default_random_engine reng; + uniform_int_distribution rand(1, 1000); + sum = 0; + { + REGISTER_TIMER("stdrand"); + for (int i = 0; i < n; ++i) { + sum += rand(reng); + } + } + LOG(INFO) << "sum=" << sum; + + sum = 0; + { + REGISTER_TIMER("default_random_engine"); + for (int i = 0; i < n; ++i) { + sum += reng(); + } + } + LOG(INFO) << "sum=" << sum; + + uniform_real_distribution rand1(0, 1); + sum1 = 0; + { + REGISTER_TIMER("stdrand1"); + for (int i = 0; i < n; ++i) { + sum1 += rand1(reng); + } + } + LOG(INFO) << "sum1=" << sum1; + + sum1 = 0; + { + real a = 1.0f / (real)RAND_MAX; + REGISTER_TIMER("crand1"); + for (int i = 0; i < n; ++i) { + sum1 += a * rand_r(&seed); + } + } + LOG(INFO) << "sum1=" << sum1; +} + +int main(int argc, char** argv) { + initMain(argc, argv); + 
testing::InitGoogleTest(&argc, argv); + benchmarkRandom(); + int ret = RUN_ALL_TESTS(); + globalStat.printSegTimerStatus(); + return ret; +} diff --git a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9f9f3e61be11fa33ab37e27065fdf275f86453a --- /dev/null +++ b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp @@ -0,0 +1,294 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#undef PADDLE_DISABLE_TIMER +#include +#include +#include +#include + +#include "paddle/legacy/trainer/Trainer.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_int32(gpu_id); +DECLARE_double(checkgrad_eps); +DEFINE_bool(use_label, true, "input label or sequence label"); +DEFINE_bool(static_para, false, "static parameter"); + +struct DataIn { + std::vector inArgs; + std::vector outGrads; + std::vector paraValues; +}; + +struct DataOut { + std::vector outValues; + std::vector paraGrads; +}; + +void initArgument(DataIn& data, + const std::string& configPath, + bool useGpu = FLAGS_use_gpu) { + TrainerConfigHelper config(configPath); + size_t batchSize = config.getOptConfig().batch_size(); + + for (const auto& layer_name : config.getModelConfig().input_layer_names()) { + auto layer_config = std::find_if(config.getModelConfig().layers().begin(), + config.getModelConfig().layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.getModelConfig().layers().end()); + + size_t layerSize = layer_config->size(); + Argument arg; + arg.value = Matrix::create(batchSize, layerSize, false, useGpu); + arg.grad = Matrix::create(batchSize, layerSize, false, useGpu); + arg.value->randomizeUniform(); + arg.value->add(-0.5); + arg.value->sigmoid(*arg.value); + arg.grad->zeroMem(); + if (FLAGS_use_label) { + arg.ids = VectorT::create(batchSize, useGpu); + arg.ids->rand(layerSize); + } + generateSequenceStartPositions(batchSize, arg.sequenceStartPositions); + data.inArgs.push_back(arg); + } + + for (const auto& layer_name : config.getModelConfig().output_layer_names()) { + auto layer_config = std::find_if(config.getModelConfig().layers().begin(), + config.getModelConfig().layers().end(), + [=](const LayerConfig& layer_config) { + return layer_config.name() == layer_name; + }); + CHECK(layer_config != config.getModelConfig().layers().end()); + + size_t layerSize = layer_config->size(); + MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu); + grad->randomizeUniform(); + data.outGrads.push_back(grad); + } + + for (const auto& para_config : config.getModelConfig().parameters()) { + VectorPtr value = Vector::create(para_config.size(), useGpu); + value->randnorm(0, 2); + data.paraValues.push_back(value); + } +} + +void calcGradient(DataIn& in, DataOut& out, const 
std::string& configPath) { + *ThreadLocalRand::getSeed() = 0; + srand(0); + + Trainer trainer; + auto config = std::make_shared(configPath); + trainer.init(config, false); + + std::vector parameters; + vector outArgs; + + auto gradientMachine = trainer.getGradientMachine(); + parameters = gradientMachine->getParameters(); + if (FLAGS_static_para) { + for (size_t i = 0; i < parameters.size(); i++) { + parameters[i]->getBuf(PARAMETER_VALUE)->one(); + } + } else { + for (size_t i = 0; i < in.paraValues.size(); i++) { + parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]); + } + } + gradientMachine->start(); + gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN); + for (size_t i = 0; i < in.outGrads.size(); i++) { + // If the all the layers in the config have no parameters, also + // not set NeedGradient(), the outArgs[i] will be nullptr. + outArgs[i].grad->copyFrom(*in.outGrads[i]); + } + gradientMachine->backward(); + for (size_t i = 0; i < in.outGrads.size(); i++) { + MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(), + outArgs[i].value->getWidth(), + false, + false); + value->copyFrom(*outArgs[i].value); + out.outValues.push_back(value); + } + for (size_t i = 0; i < in.paraValues.size(); i++) { + VectorPtr grad = Vector::create( + parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false); + grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT)); + out.paraGrads.push_back(grad); + } + + for (int i = 0; i < 20; i++) { + REGISTER_TIMER("forward"); + gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN); + } + for (int i = 0; i < 20; i++) { + REGISTER_TIMER("backward"); + gradientMachine->backward(); + } + + gradientMachine->finish(); +} + +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { + int nNum = 0; + for (size_t i = 0; i < len; ++i) { + real diff = fabs(A[i] - B[i]); + if (diff > 0.0f && + diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) { + nNum++; + LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i] + << " " << desB << " : " << B[i]; + } + } + EXPECT_EQ(0, nNum); +} + +void compareGradient(DataOut& outA, DataOut& outB) { + LOG(INFO) << "------------------------------" + << " Check Network Output " + << "------------------------------"; + for (size_t i = 0; i < outA.outValues.size(); ++i) { + LOG(INFO) << "OUTPUT VALUE: " << i; + checkBuffer(outA.outValues[i]->getData(), + "network A output", + outB.outValues[i]->getData(), + "network B output", + outA.outValues[i]->getElementCnt(), + outA.outValues[i]->getWidth()); + } + + if (!FLAGS_static_para) { + LOG(INFO) << "------------------------------" + << " Check Parameters " + << "------------------------------"; + for (size_t i = 0; i < outA.paraGrads.size(); ++i) { + LOG(INFO) << "PARAMETER GRADIENT: " << i; + checkBuffer(outA.paraGrads[i]->getData(), + "Network A", + outB.paraGrads[i]->getData(), + "Network B", + outA.paraGrads[i]->getSize()); + } + } +} + +void compareNetwork(const std::string& config_file_a, + const std::string& config_file_b) { + DataIn in; + initArgument(in, config_file_a); + + DataOut dataA; + calcGradient(in, dataA, config_file_a); + LOG(INFO) << "forwardBackward of Network A is finished"; + globalStat.printSegTimerStatus(); + globalStat.reset(); + LOG(INFO) << "\n\n"; + + DataOut dataB; + calcGradient(in, dataB, config_file_b); + LOG(INFO) << "forwardBackward of the Network B is finished"; + globalStat.printSegTimerStatus(); + globalStat.reset(); + LOG(INFO) << "\n\n"; + + 
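+  // Descriptive note (added for clarity): both networks were fed the same
+  // inputs and the same initial parameter values, so their outputs and
+  // parameter gradients should agree within FLAGS_checkgrad_eps.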
compareGradient(dataA, dataB); +} + +TEST(Compare, concat_dotmul) { + std::string config_file_a = "./legacy/gserver/tests/concat_dotmul_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_dotmul_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +TEST(Compare, concat_fullmatrix) { + std::string config_file_a = "./legacy/gserver/tests/concat_fullmatrix_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_fullmatrix_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +TEST(Compare, concat_table) { + std::string config_file_a = "./legacy/gserver/tests/concat_table_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_table_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +TEST(Compare, concat_slice) { + std::string config_file_a = "./legacy/gserver/tests/concat_slice_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/concat_slice_b.conf"; + compareNetwork(config_file_a, config_file_b); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Compare, img_pool) { + std::string config_file_a = "./legacy/gserver/tests/img_pool_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/img_pool_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu = true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} + +TEST(Compare, img_conv) { + std::string config_file_a = "./legacy/gserver/tests/img_conv_a.conf"; + std::string config_file_b = "./legacy/gserver/tests/img_conv_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu = true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} + +// Test cudnn_conv and exconv give the same result +TEST(Compare, img_conv2) { + std::string config_file_a = "./legacy/gserver/tests/img_conv_cudnn.py"; + std::string config_file_b = "./legacy/gserver/tests/img_conv_exconv.py"; + bool useGpu = FLAGS_use_gpu; + double eps = FLAGS_checkgrad_eps; + FLAGS_use_gpu = true; + // Sometimes, this unit test will fail with 1e-2 + FLAGS_checkgrad_eps = 4e-2; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; + FLAGS_checkgrad_eps = eps; +} +#endif + +DEFINE_string(config_file_a, "", "config of one network to compare"); +DEFINE_string(config_file_b, "", "config of another network to compare"); +TEST(Compare, network) { + if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") { + compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + initPython(argc, argv); + int ret = RUN_ALL_TESTS(); + return ret; +} diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/legacy/gserver/tests/test_PriorBox.cpp similarity index 100% rename from paddle/gserver/tests/test_PriorBox.cpp rename to paddle/legacy/gserver/tests/test_PriorBox.cpp diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0209e6818a8340fe128146909b9e8ec610e310a3 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp @@ -0,0 +1,177 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include + +#include "paddle/legacy/gserver/dataproviders/PyDataProvider.h" +#include "paddle/legacy/utils/Util.h" + +#include "paddle/testing/TestUtil.h" + +using namespace std; // NOLINT +using namespace paddle; // NOLINT + +void simpleValueCheck(const vector& argumentList, bool useGpu); +void simpleSequenceCheck(const vector& argumentList, int sample_num); + +TEST(PyDataProvider, py_fill_slots) { + DataConfig config; + config.set_type("py"); + config.set_async_load_data(false); + config.set_load_data_module(std::string("pyDataProvider")); + config.set_load_data_object(std::string("SimpleDataProvider")); + config.clear_files(); + std::string dataFile = + "legacy/gserver/tests/pyDataProvider/pyDataProviderList"; + config.set_files(dataFile); +#ifndef PADDLE_WITH_CUDA + bool useGpu = false; +#else + bool useGpu = true; +#endif + unique_ptr dataProvider(DataProvider::create(config, useGpu)); + DataBatch dataBatch; + dataProvider->getNextBatchInternal(2, &dataBatch); + const std::vector& argumentList = dataBatch.getStreams(); + // Check size + EXPECT_EQ(argumentList.size(), 3UL); + EXPECT_EQ(argumentList[0].value->getWidth(), 3UL); + EXPECT_EQ(argumentList[0].value->getHeight(), 2UL); + EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL); + EXPECT_EQ(argumentList[1].value->getWidth(), 7UL); + EXPECT_EQ(argumentList[1].value->getHeight(), 2UL); + EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL); + EXPECT_EQ(argumentList[2].ids->getSize(), 2UL); + // Check value + simpleValueCheck(argumentList, useGpu); + // Check sequenceStartPositions + simpleSequenceCheck(argumentList, 2); +} + +TEST(PyDataProvider, py_fill_nest_slots) { + DataConfig config; + config.set_type("py"); + config.set_async_load_data(false); + config.set_load_data_module(std::string("pyDataProvider")); + config.set_load_data_object(std::string("SimpleNestDataProvider")); + config.clear_files(); + std::string dataFile = + "legacy/gserver/tests/pyDataProvider/pyDataProviderList"; + config.set_files(dataFile); + EXPECT_EQ(config.IsInitialized(), true); +#ifndef PADDLE_WITH_CUDA + bool useGpu = false; +#else + bool useGpu = true; +#endif + unique_ptr dataProvider(DataProvider::create(config, useGpu)); + DataBatch dataBatch; + dataProvider->getNextBatchInternal(2, &dataBatch); + const std::vector& argumentList = dataBatch.getStreams(); + // Check size + EXPECT_EQ(argumentList.size(), 3UL); + EXPECT_EQ(argumentList[0].value->getWidth(), 3UL); + EXPECT_EQ(argumentList[0].value->getHeight(), 4UL); + EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL); + EXPECT_EQ(argumentList[1].value->getWidth(), 7UL); + EXPECT_EQ(argumentList[1].value->getHeight(), 4UL); + EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL); + EXPECT_EQ(argumentList[2].ids->getSize(), 4UL); + // Check value + simpleValueCheck(argumentList, useGpu); + // Check sequenceStartPositions + simpleSequenceCheck(argumentList, 4); + // Check subSequenceStartPositions + EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL); + EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL); + 
EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL); + for (size_t i = 0; i < argumentList.size(); i++) { + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0); + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1); + if (i != 1) { + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2); + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4); + } else { + EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4); + } + } +} + +void simpleValueCheck(const vector& argumentList, bool useGpu) { + // Dense + real* data; + if (useGpu) { + MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(), + argumentList[0].value->getWidth(), + 0, + 0); + cpuMatrixPtr->copyFrom(*argumentList[0].value); + data = cpuMatrixPtr->getData(); + } else { + data = argumentList[0].value->getData(); + } + for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) { + EXPECT_EQ(*(data + i), (float)(i % 3 + 1)); + } + // Sparse without value + GpuSparseMatrixPtr matGpu; + CpuSparseMatrixPtr matCpu; + if (useGpu) { + matGpu = dynamic_pointer_cast(argumentList[1].value); + ASSERT_TRUE(matGpu != NULL); + } else { + data = argumentList[0].value->getData(); + matCpu = dynamic_pointer_cast(argumentList[1].value); + ASSERT_TRUE(matCpu != NULL); + } + for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) { + size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i); + EXPECT_EQ(colNum, (size_t)2); + const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i); + for (size_t j = 0; j < colNum; ++j) { + EXPECT_EQ((size_t)buf[j], (size_t)(j + 1)); + } + } + // Index + for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) { + EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL); + } +} + +void simpleSequenceCheck(const vector& argumentList, int sample_num) { + EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL); + EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL); + EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL); + for (size_t i = 0; i < argumentList.size(); i++) { + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0); + if (i != 1) { + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1); + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2), + sample_num); + } else { + EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), + sample_num); + } + } +} + +int main(int argc, char** argv) { + initMain(argc, argv); + initPython(argc, argv); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de313ba82cf2697c13d6eae17056240b6272ca1c --- /dev/null +++ b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp @@ -0,0 +1,409 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_NO_PYTHON +#include +#include +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Util.h" + +DEFINE_string(train_list, "unittest.list", "file list for unittest"); + +namespace paddle { +namespace unittest { +namespace pydp2 { +extern void setOnPoolFilledHook(const std::function &func); +extern void clearOnPoolFilledHook(); + +} // namespace pydp2 +} // namespace unittest +} // namespace paddle + +const paddle::real epsilon = 1e-5; + +static inline int64_t readDataBatch(paddle::DataBatch *batch, + const std::string &funcName, + int64_t batchSize = 65535) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object(funcName); + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->setSkipShuffle(); + provider->reset(); + return provider->getNextBatchInternal(batchSize, batch); +} + +TEST(PyDataProvider2, dense_no_seq) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_dense_no_seq"); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + + provider->setSkipShuffle(); // skip shuffle for unittest. + + paddle::DataBatch batch; + for (size_t pass = 0; pass < 2; ++pass) { // read 2 passes + provider->reset(); + int64_t num = provider->getNextBatchInternal(100, &batch); + ASSERT_NE(num, 0); + ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1); + ASSERT_EQ((size_t)batch.getSize(), (size_t)100); + // Check batch data. + for (size_t i = 0; i < 100; ++i) { + for (size_t j = 0; j < 200; ++j) { + paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0); + ASSERT_NEAR( + batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); + } + } + + num = provider->getNextBatchInternal(100, &batch); + ASSERT_NE(num, 0); + ASSERT_EQ(batch.getStreams().size(), (size_t)1); + ASSERT_EQ((size_t)batch.getSize(), (size_t)100); + // Check batch data. + for (size_t i = 0; i < 100; ++i) { + size_t ii = i + 100; + for (size_t j = 0; j < 200; ++j) { + paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0); + ASSERT_NEAR( + batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); + } + } + num = provider->getNextBatchInternal(100, &batch); + ASSERT_EQ(num, 0); + } +} + +TEST(PyDataProvider2, index_no_seq) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_index_no_seq"); + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + + provider->setSkipShuffle(); // skip shuffle for unittest. 
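+  // Descriptive note (added for clarity): each of the two passes below must
+  // return exactly 200 samples whose ids are 0..199 in order.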
+ paddle::DataBatch batch; + for (size_t pass = 0; pass < 2; ++pass) { + provider->reset(); + int64_t num = provider->getNextBatchInternal(10000, &batch); + CHECK_EQ(num, 200); + for (int i = 0; i < 200; ++i) { + CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]); + } + } +} + +TEST(PyDataProvider2, init_hook) { + paddle::PyObjectPtr pickle = paddle::py::import("pickle"); + paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__"))); + PyDict_SetItemString(globals.get(), "pickle", pickle.get()); + paddle::PyObjectPtr locals(PyDict_New()); + paddle::PyObjectPtr mdl(PyRun_String( + "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})", + Py_file_input, + globals.get(), + locals.get())); + CHECK_PY(mdl) << "Error!"; + paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps")); + CHECK_PY(dps) << "Error!"; + + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_init_hook"); + config.set_load_data_args(PyString_AsString(dps.get())); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->setSkipShuffle(); // skip shuffle for unittest. + provider->reset(); + paddle::DataBatch batch; + int64_t num = provider->getNextBatchInternal(100000, &batch); + ASSERT_EQ(num, 200); + auto &mat = batch.getStreams()[0].value; + ASSERT_EQ((size_t)mat->getWidth(), (size_t)20); + for (size_t i = 0; i < 200; ++i) { + for (size_t j = 0; j < 20; ++j) { + ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon); + } + } +} + +TEST(PyDataProvider2, sparse_no_value_no_seq) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_sparse_non_value_no_seq"); + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->setSkipShuffle(); + provider->reset(); + paddle::DataBatch batch; + int64_t num = provider->getNextBatchInternal(10000, &batch); + CHECK_EQ(num, 200); + auto csm = std::dynamic_pointer_cast( + batch.getStreams()[0].value); + CHECK(csm != nullptr); + for (int i = 0; i < 200; ++i) { + CHECK_EQ(csm->getColNum(i), (size_t)10); + int *cols = csm->getRowCols(i); + for (int j = 0; j < 10; ++j) { + CHECK_EQ(cols[j], (i + 1) * (j + 1)); + } + } +} + +TEST(PyDataProvider2, sparse_value_no_seq) { + paddle::DataBatch batch; + CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200); + auto csm = std::dynamic_pointer_cast( + batch.getStreams()[0].value); + CHECK(csm != nullptr); + for (int i = 0; i < 200; ++i) { + CHECK_EQ(csm->getColNum(i), (size_t)10); + int *cols = csm->getRowCols(i); + real *dat = csm->getRowValues(i); + for (int j = 0; j < 10; ++j) { + EXPECT_EQ(cols[j], (i + 1) * (j + 1)); + EXPECT_EQ(dat[j], real(j) / real(i + 1)); + } + } +} + +TEST(PyDataProvider2, index_seq) { + paddle::DataBatch batch; + CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200); + auto &arg = batch.getStreams()[0]; + CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2); + size_t tmp = 0; + for (size_t i = 0; i < 200; ++i) { // CHECK DATA CORRECT + for (size_t j = 0; j < i + 1; ++j) { + ASSERT_EQ((size_t)arg.ids->getData()[tmp], j); + ++tmp; + } + } + ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201); + tmp = 0; + for (size_t i = 0; i < 200; ++i) { + tmp += i; + ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp); 
+ } + tmp += 200; + ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp); +} + +TEST(PyDataProvider2, index_sub_seq) { + paddle::DataBatch batch; + ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200); + auto &arg = batch.getStreams()[0]; + size_t tmp = 0; + for (size_t i = 0; i < 200; ++i) { + for (size_t j = 0; j < i + 1; ++j) { + for (size_t k = 0; k < j + 1; ++k) { + CHECK_EQ((size_t)arg.ids->getData()[tmp++], k); + } + } + } + + CHECK_EQ(tmp, arg.ids->getSize()); + + ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201); + ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0); + ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0); + size_t idx = 1; + tmp = 0; + for (size_t i = 0; i < 200; ++i) { + for (size_t j = 0; j < i + 1; ++j) { + tmp += j + 1; + ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx], + (size_t)tmp); + ++idx; + } + ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp); + } +} + +TEST(PyDataProvider2, min_pool_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_min_pool_size"); + config.set_load_data_args(""); + size_t totalData = 1 << 14; + constexpr size_t batchSize = 100; + constexpr size_t minPoolSize = 1000; + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + + paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { + if (totalData > batchSize) { + CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize)); + } + }); + while (true) { + int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + totalData -= realBatchSize; + } else { + break; + } + } + paddle::unittest::pydp2::clearOnPoolFilledHook(); +} + +TEST(PyDataProvider2, can_over_batch_size) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_can_over_batch_size"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (realBatchSize) { + CHECK_LE(static_cast(realBatchSize), batchSize); + } else { + break; + } + } +} + +TEST(PyDataProvider2, input_order) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_input_order"); + config.set_load_data_args(""); + + paddle::ModelConfig modelConfig; + *modelConfig.add_input_layer_names() = "input1"; + *modelConfig.add_input_layer_names() = "input2"; + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, modelConfig, false)); + provider->reset(); + constexpr size_t batchSize = 100; + while (true) { + int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); + if (!realBatchSize) { + break; + } + ASSERT_EQ(batch.getStreams().size(), static_cast(2)); + for (int64_t i = 0; i < realBatchSize; ++i) { + ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); + ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); + } + } +} + 
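+// Descriptive note (added for clarity): the "test_check" provider is assumed
+// to drop invalid samples on the Python side, so every id that reaches C++
+// must stay below 10, as asserted below.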
+TEST(PyDataProvider2, test_check) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_check"); + config.set_load_data_args(""); + paddle::DataBatch batch; + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + while (true) { + int64_t realBatchSize = provider->getNextBatchInternal(100, &batch); + if (!realBatchSize) { + break; + } else { + auto &ivec = batch.getStream(0).ids; + for (size_t i = 0; i < ivec->getSize(); ++i) { + CHECK_LT(ivec->getData()[i], 10); + } + } + } +} + +TEST(PyDataProvider2, multiThread) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_dense_no_seq"); + config.set_async_load_data(true); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + paddle::DataBatch batch; + provider->getNextBatch(100, &batch); + provider->reset(); + provider.reset(); +} + +TEST(PyDataProvider2, minPoolSizeWithCache) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_min_pool_size_with_cache"); + config.set_async_load_data(true); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + + paddle::DataBatch batch; + + for (int i = 0; i < 10; ++i) { + provider->reset(); + int64_t sum = 0; + while (int64_t actualNum = provider->getNextBatch(100, &batch)) { + sum += actualNum; + } + ASSERT_EQ(1 << 20, sum); + } +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + paddle::initPython(argc, argv); + + std::ofstream fout(FLAGS_train_list); + CHECK(fout.is_open()); + fout << "stub file name" << std::endl; // in unittest, filename is not used. + fout.close(); + + return RUN_ALL_TESTS(); +} + +#endif diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/legacy/gserver/tests/test_PyDataProvider2.py similarity index 100% rename from paddle/gserver/tests/test_PyDataProvider2.py rename to paddle/legacy/gserver/tests/test_PyDataProvider2.py diff --git a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..153c3e7f36a30a70d0c5870144a0091b1e5f7237 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp @@ -0,0 +1,180 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +DECLARE_int32(seed); + +using namespace paddle; // NOLINT +using namespace std; // NOLINT +class TrainerForTest : public paddle::Trainer { + public: + void startTrain() { + GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); + gm.start(); + } + + void finishTrain() { + GradientMachine& gm = *this->trainerInternal_.getGradientMachine(); + gm.finish(); + } + + /** + * Get total dimension of all parameters. + * + * @return the total dimension of all parameters + */ + size_t getTotalParameterSize() const { + auto p = const_cast(this); + auto& params = p->getGradientMachine()->getParameters(); + return std::accumulate( + params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) { + return a + p->getSize(); + }); + } +}; + +void CalCost(const string& conf, + const string& dir, + real* cost, + int num_passes) { + auto config = std::make_shared(conf); + TrainerForTest trainer; + trainer.init(config); + mkDir(dir.c_str()); + config->setSaveDir(dir); + auto dataProvider = trainer.getDataProvider(); + int32_t batchSize = config->getOptConfig().batch_size(); + real learningRate = config->getOptConfig().learning_rate(); + real momentum = 0; + real decayRate = 0; + int64_t dim = trainer.getTotalParameterSize(); + CpuVector vecW(dim); + CpuVector vecGradient(dim); + CpuVector vecMomentum(dim); + + // vecW needs to be assigned, otherwise the variable is an uncertain value. + + *ThreadLocalRand::getSeed() = FLAGS_seed; + vecW.randnorm(0, 0.1); + vecMomentum.randnorm(0, 0.1); + + trainer.startTrain(); + for (int i = 0; i < num_passes; ++i) { + real totalCost = 0; + dataProvider->reset(); + while (true) { + DataBatch dataBatch; + int num = dataProvider->getNextBatch(batchSize, &dataBatch); + if (num == 0) break; + totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient); + sgdUpdate( + learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum); + } + cost[i] = totalCost; + } + trainer.finishTrain(); + rmDir(dir.c_str()); +} + +void test(const string& conf1, const string& conf2, double eps, bool useGpu) { + if (!paddle::version::isWithGpu() && useGpu) { + return; + } + FLAGS_use_gpu = useGpu; + int num_passes = 5; + real* cost1 = new real[num_passes]; + const string dir1 = "legacy/gserver/tests/t1"; + CalCost(conf1, dir1, cost1, num_passes); + + real* cost2 = new real[num_passes]; + const string dir2 = "legacy/gserver/tests/t2"; + CalCost(conf2, dir2, cost2, num_passes); + + for (int i = 0; i < num_passes; i++) { + LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i] + << ", cost2=" << cost2[i] + << ", diff=" << std::abs(cost1[i] - cost2[i]); + ASSERT_NEAR(cost1[i], cost2[i], eps); + } + delete[] cost1; + delete[] cost2; +} + +TEST(RecurrentGradientMachine, HasSubSequence) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_layer_group.conf", + "legacy/gserver/tests/sequence_nest_layer_group.conf", + 1e-5, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_rnn.conf", + "legacy/gserver/tests/sequence_nest_rnn.conf", + 1e-6, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_multi_input) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_rnn_multi_input.conf", + "legacy/gserver/tests/sequence_nest_rnn_multi_input.conf", + 1e-6, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { + for (bool useGpu : 
{false, true}) { + test("legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py", + "legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py", + 1e-6, + useGpu); + } +} + +TEST(RecurrentGradientMachine, rnn_mixed_input) { + for (bool useGpu : {false, true}) { + test("legacy/gserver/tests/sequence_rnn_mixed_inputs.py", + "legacy/gserver/tests/sequence_rnn_matched_inputs.py", + 1e-6, + useGpu); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + if (paddle::version::isWithPyDataProvider()) { + if (!paddle::version::isWithGpu()) { + FLAGS_use_gpu = false; + } + initMain(argc, argv); + initPython(argc, argv); + return RUN_ALL_TESTS(); + } else { + return 0; + } +} diff --git a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71198cb6a1d29433ed0e315378f5aee51b921766 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp @@ -0,0 +1,571 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" + +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT +DECLARE_bool(use_gpu); +DECLARE_bool(rnn_use_batch); +DECLARE_int32(fixed_seq_length); + +void checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void checkError(const CpuVector& vector1, const CpuVector& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int size = vector1.getSize(); + const real* data1 = vector1.getData(); + const real* data2 = vector2.getData(); + int count = 0; + for (int i = 0; i < size; i++) { + if (fabs(data1[i] - data2[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +LayerPtr creatDataLayer(string name, + size_t batchSize, + int layerSize, + bool useGpu) { + LayerConfig dataConfig; + dataConfig.set_name(name); + dataConfig.set_type("data"); + dataConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); + + Argument data; + data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu); + data.grad = 
Matrix::create(batchSize, layer->getSize(), false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + data.value->sigmoid(*data.value); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + return layer; +} + +ParameterPtr creatParameter(string name, + int pid, + size_t paraSize, + bool useGpu) { + ParameterConfig paraConfig; + paraConfig.set_name(name); + paraConfig.set_size(paraSize); + + ParameterPtr parameter = + std::make_shared(paraConfig, useGpu, /*initialize */ false); + parameter->enableType(PARAMETER_VALUE); + parameter->enableType(PARAMETER_GRADIENT); + parameter->randomize(); + parameter->setID(pid); + + return parameter; +} + +ParameterPtr creatParameterBias(string name, + int pid, + size_t paraSize, + bool useGpu) { + ParameterConfig paraConfig; + paraConfig.set_name(name); + paraConfig.set_size(paraSize); + paraConfig.set_initial_std(1); + + ParameterPtr parameter = + std::make_shared(paraConfig, useGpu, /*initialize */ true); + parameter->randomize(); + parameter->setID(pid); + + return parameter; +} + +LayerPtr initRecurrentLayer(LayerConfig layerConfig, + size_t batchSize, + int layerSize, + bool useGpu) { + FLAGS_use_gpu = useGpu; + LayerMap layerMap; + ParameterMap parameterMap; + LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu); + layerMap[dataLayer->getName()] = dataLayer; + + ParameterPtr para = + creatParameter("para_0", 0, layerSize * layerSize, useGpu); + parameterMap[para->getName()] = para; + + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkRecurrentLayer(LayerPtr testLayer) { + const VectorPtr& weightGrad = + (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad(); + CpuVector seqPara(weightGrad->getSize()); + CpuVector batPara(weightGrad->getSize()); + CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + + CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + outputGrad.randomizeUniform(); + + /* use sequence calculate */ + FLAGS_rnn_use_batch = false; + weightGrad->zero(); + inputGrad->zero(); + testLayer->forward(PASS_GC); + testLayer->getOutputGrad()->copyFrom(outputGrad); + testLayer->backward(); + seqPara.copyFrom(*weightGrad); + seqInputGrad.copyFrom(*inputGrad); + + /* use batch calculate */ + FLAGS_rnn_use_batch = true; + weightGrad->zero(); + inputGrad->zero(); + testLayer->forward(PASS_GC); + testLayer->getOutputGrad()->copyFrom(outputGrad); + testLayer->backward(); + batPara.copyFrom(*weightGrad); + batInputGrad.copyFrom(*inputGrad); + + /* check */ + checkError(seqInputGrad, batInputGrad); + checkError(seqPara, batPara); +} + +TEST(Layer, RecurrentLayer) { + LayerConfig layerConfig; + layerConfig.set_name("rnn"); + layerConfig.set_type("recurrent"); + layerConfig.set_active_type("tanh"); + for (auto layerSize : {1, 10, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 20, 100, 128}) { + for (auto useGpu : {false, 
true}) {
+        for (auto reversed : {false, true}) {
+          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
+                    << " useGpu=" << useGpu << " reversed=" << reversed;
+          layerConfig.set_size(layerSize);
+          layerConfig.set_reversed(reversed);
+          LayerPtr testLayer =
+              initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu);
+          checkRecurrentLayer(testLayer);
+        }
+      }
+    }
+  }
+}
+
+#define protected public
+#include "paddle/legacy/gserver/layers/GatedRecurrentLayer.h"
+#include "paddle/legacy/gserver/layers/LstmLayer.h"
+#include "paddle/legacy/gserver/layers/RecurrentLayer.h"
+template <class T>
+class TestRecurrentLayer {
+ public:
+  LayerConfig config_;
+  bool useGpu_;
+  bool useBatch_;
+  LayerPtr testLayer_;
+  LayerPtr dataLayer_;
+  ParameterPtr para_;
+  ParameterPtr bias_;
+  LayerMap layerMap_;
+  ParameterMap parameterMap_;
+  TestRecurrentLayer(const LayerConfig& config,
+                     bool useGpu,
+                     bool useBatch = false)
+      : config_(config), useGpu_(useGpu), useBatch_(useBatch) {}
+  void init(size_t batchSize) {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_ = Layer::create(config_);
+    if (typeid(T) == typeid(GatedRecurrentLayer)) {
+      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
+                                  batchSize,
+                                  config_.size() * 3,
+                                  useGpu_);
+      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
+                             0,
+                             config_.size() * config_.size() * 3,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
+    } else if (typeid(T) == typeid(LstmLayer)) {
+      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
+                                  batchSize,
+                                  config_.size() * 4,
+                                  useGpu_);
+      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
+                             0,
+                             config_.size() * config_.size() * 4,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
+    }
+    layerMap_[dataLayer_->getName()] = dataLayer_;
+    parameterMap_[para_->getName()] = para_;
+    parameterMap_[bias_->getName()] = bias_;
+
+    layerMap_[testLayer_->getName()] = testLayer_;
+    testLayer_->init(layerMap_, parameterMap_);
+    testLayer_->setNeedGradient(true);
+    (dynamic_cast<T*>(testLayer_.get()))->useBatch_ = useBatch_;
+  }
+  void forward() {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_->forward(PASS_GC);
+  }
+  void backward() {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_->backward(nullptr);
+  }
+};
+
+template <class T>
+void checkRecurrentLayer(LayerConfig layerConfig,
+                         size_t batchSize,
+                         bool cpuBatch,
+                         bool gpuBatch) {
+  TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch);
+  TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch);
+  testCpu.init(batchSize);
+  testGpu.init(batchSize);
+  auto checkError = [](
+      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
+    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
+    check.copyFrom(*gpu);
+    int height = cpu->getHeight();
+    int width = cpu->getWidth();
+    const real* data1 = cpu->getData();
+    const real* data2 = check.getData();
+    int count = 0;
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences >
+            1e-4) {
+          count++;
+        }
+      }
+    }
+    EXPECT_EQ(count, 0) << "[" << str << "]"
+                        << "There are " << count << " different elements.";
+  };
+  T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get());
+  T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get());
+
+  Argument& cpuInput = testCpu.dataLayer_->getOutput();
+  Argument& gpuInput = testGpu.dataLayer_->getOutput();
+  gpuInput.resizeAndCopyFrom(cpuInput, true);
+
+  const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
+  const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
+  gpuVec->copyFrom(*cpuVec);
+
+  const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE);
+  const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE);
+  gpuBiasVec->copyFrom(*cpuBiasVec);
+
+  /* check forward */
+  testCpu.forward();
+  testGpu.forward();
+
+  checkError(
+      cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue");
+
+  /* check backward */
+  cpuLayer->getOutputGrad()->randomizeUniform();
+  gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad());
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+
+  testCpu.backward();
+  testGpu.backward();
+
+  // check input grad
+  checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad");
+  // check weight grad
+  int numSequences = cpuInput.getNumSequences();
+  checkError(cpuLayer->weight_->getWGrad(),
+             gpuLayer->weight_->getWGrad(),
+             numSequences,
+             "weightGrad");
+  // check bias grad
+  checkError(cpuLayer->bias_->getWGrad(),
+             gpuLayer->bias_->getWGrad(),
+             numSequences,
+             "biasGrad");
+}
+
+TEST(Layer, GatedRecurrentLayer) {
+  LayerConfig layerConfig;
+  layerConfig.set_type("gated_recurrent");
+  layerConfig.set_active_type("sigmoid");
+  layerConfig.set_active_gate_type("sigmoid");
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+  layerConfig.set_bias_parameter_name("bias");
+
+  for (auto frameSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {false, true}) {
+        for (auto cpuBatch : {false, true}) {
+          for (auto gpuBatch : {false, true}) {
+            LOG(INFO) << " batchSize=" << batchSize
+                      << " frameSize=" << frameSize << " reversed=" << reversed
+                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
+            layerConfig.set_size(frameSize);
+            layerConfig.set_reversed(reversed);
+            checkRecurrentLayer<GatedRecurrentLayer>(
+                layerConfig, batchSize, cpuBatch, gpuBatch);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Layer, LstmLayer) {
+  LayerConfig layerConfig;
+  layerConfig.set_type("lstmemory");
+  layerConfig.set_active_type("relu");
+  layerConfig.set_active_state_type("tanh");
+  layerConfig.set_active_gate_type("sigmoid");
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+  layerConfig.set_bias_parameter_name("bias");
+
+  for (auto frameSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {false, true}) {
+        for (auto cpuBatch : {false, true}) {
+          for (auto gpuBatch : {false, true}) {
+            LOG(INFO) << " batchSize=" << batchSize
+                      << " frameSize=" << frameSize << " reversed=" << reversed
+                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
+            layerConfig.set_size(frameSize);
+            layerConfig.set_reversed(reversed);
+            checkRecurrentLayer<LstmLayer>(
+                layerConfig, batchSize, cpuBatch, gpuBatch);
+          }
+        }
+      }
+    }
+  }
+}
+
+#ifdef PADDLE_WITH_MKLML
+
+#include "paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h"
+
+LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
+                            bool reversed,
+                            int layerSize,
+                            LayerPtr dataLayer,
+                            ParameterPtr para,
+                            ParameterPtr bias = nullptr) {
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  parameterMap[para->getName()] = para;
+  if (bias) {
+
parameterMap[bias->getName()] = bias; + layerConfig.set_bias_parameter_name("bias_0"); + } + + layerConfig.set_size(layerSize); + layerConfig.set_reversed(reversed); + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkMKLPackedLayer(LayerConfig layerConfig1, + LayerConfig layerConfig2, + bool reversed, + int layerSize, + int batchSize, + bool useBatch1, + bool useBatch2) { + LayerPtr dataLayer; + ParameterPtr para, bias; + + if (layerConfig1.type() == "recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); + para = creatParameter("para_0", 0, layerSize * layerSize, false); + bias = nullptr; + } else if (layerConfig1.type() == "gated_recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); + para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); + bias = creatParameterBias("bias_0", 1, layerSize * 3, false); + } + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, para, bias); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para, bias); + + const VectorPtr& weightGrad = + (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); + CpuVector wgt_grad1(weightGrad->getSize()); + CpuVector wgt_grad2(weightGrad->getSize()); + CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); + + for (int i = 0; i < 2; i++) { + FLAGS_rnn_use_batch = useBatch1; + + testLayer1->forward(PASS_GC); + + FLAGS_rnn_use_batch = useBatch2; + testLayer2->forward(PASS_GC); + + testLayer1->getOutputGrad()->randomizeUniform(); + testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); + + weightGrad->zero(); + inputGrad->zero(); + FLAGS_rnn_use_batch = useBatch1; + testLayer1->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + weightGrad->zero(); + inputGrad->zero(); + FLAGS_rnn_use_batch = useBatch2; + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } +} + +TEST(MKLPackedLayer, RecurrentLayer) { + LayerConfig layerConfig1; + LayerConfig layerConfig2; + + layerConfig1.set_name("paddle-rnn"); + layerConfig1.set_type("recurrent"); + layerConfig1.set_active_type("relu"); + + layerConfig2.set_name("mkl-packed-rnn"); + layerConfig2.set_type("mkl_packed_recurrent"); + layerConfig2.set_active_type("relu"); + + FLAGS_use_gpu = false; + + for (auto layerSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {true, false}) { + for (auto paddle_use_batch : {true, false}) { + for (auto MKLPacked_use_batch : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize + << " batchSize=" << batchSize << " reversed=" << reversed + << " paddle_use_batch=" << paddle_use_batch + << " MKLPacked_use_batch=" << MKLPacked_use_batch; + + checkMKLPackedLayer(layerConfig1, + layerConfig2, 
+ reversed, + layerSize, + batchSize, + paddle_use_batch, + MKLPacked_use_batch); + } + } + } + } + } +} +#endif + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + if (!version::isWithGpu()) { + testing::GTEST_FLAG(filter) = "-Layer.*"; + } + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1975d9196d61dbb80667b2ba86c09d56bc568064 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp @@ -0,0 +1,471 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/FullyConnectedLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" +#include "paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h" +#include "paddle/legacy/math/CpuSparseMatrix.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); +DECLARE_int32(num_passes); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_string(config_args); + +size_t fcLayerWidth = 1024; + +struct ComData { + vector outArgs; + vector parameters; +}; + +int randint(int* data, size_t int_max, size_t size) { + srand((size_t)(time(NULL))); + if (int_max < size) { + return -1; + } + size_t count = 0; + std::map tmp; + int this_int = 0; + + while (count < size) { + this_int = std::rand() % int_max; // NOLINT + if (tmp.find(this_int) == tmp.end()) { + tmp[this_int] = 0; + count += 1; + } + } + + if (tmp.size() != size) { + return -1; + } + count = 0; + for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) { + data[count] = itr->first; + count += 1; + } + return 0; +} + +void calcOutput(ComData& comData, + const string configFile, + const string configArgs, + bool useGpu) { + FLAGS_config = configFile; + FLAGS_config_args = configArgs; + FLAGS_use_gpu = useGpu; + FLAGS_init_model_path = "legacy/gserver/tests/SelectiveFcTest/model"; + *ThreadLocalRand::getSeed() = 0; + srand(0); + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlags(), false); + + comData.parameters = trainer.getGradientMachine()->getParameters(); + + auto dataProvider = trainer.getDataProvider(); + int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + DataBatch dataBatch; + dataProvider->setSkipShuffle(); + dataProvider->reset(); + dataProvider->getNextBatch(batchSize, &dataBatch); + CHECK(dataBatch.getSize()) << "No data from data provider"; + + vector& inArgs = dataBatch.getStreams(); + trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); + trainer.getGradientMachine()->forwardBackward( + inArgs, &comData.outArgs, PASS_TRAIN); + trainer.getGradientMachine()->finish(); +} + +void checkMatrix(real* A, real* B, size_t matSize) { 
+#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + int diffNum = 0; + for (size_t i = 0; i < matSize; ++i) { + if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) || + std::isnan(B[i])) { + } else if (fabs(A[i] - B[i]) > err) { + diffNum++; + } + } + EXPECT_EQ(0, diffNum); +} + +void checkTranspose(real* matrix, + real* transpose, + size_t width, + size_t matSize) { +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + size_t height = matSize / width; + int diffNum = 0; + size_t rowId = 0; + size_t colId = 0; + for (size_t i = 0; i < matSize; ++i) { + if (i % width == 0 && i) { + rowId++; + } + colId = i % width; + if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) { + diffNum++; + LOG(INFO) << i << " diff : " << matrix[i] << "\t" + << transpose[colId * height + rowId]; + } + } + EXPECT_EQ(0, diffNum); +} + +void compareOutput(ComData& fcData, ComData& selFcData) { + vector outArgsFc = fcData.outArgs; + vector outArgsSelfc = selFcData.outArgs; + + // check cost + LOG(INFO) << "Check cost"; + CpuMatrix fcCost(outArgsFc[0].value->getHeight(), + outArgsFc[0].value->getWidth()); + CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(), + outArgsSelfc[0].value->getWidth()); + fcCost.copyFrom(*outArgsFc[0].value); + selfcCost.copyFrom(*outArgsSelfc[0].value); + checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt()); + + // check selective fc output and fc output + LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " + << "with FullyConectedLayer"; + CpuMatrix fcOut(outArgsFc[1].value->getHeight(), + outArgsFc[1].value->getWidth()); + CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(), + outArgsSelfc[1].value->getWidth()); + + fcOut.copyFrom(*outArgsFc[1].value); + selfcOut.copyFrom(*outArgsSelfc[1].value); + checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt()); + + // check gradient math + vector& fcParam = fcData.parameters; + vector& selfcParam = selFcData.parameters; + for (size_t i = 0; i < fcParam.size(); ++i) { + ParameterPtr p1, p2; + p1 = fcParam[i]; + p2 = selfcParam[i]; + + string paramName = p1->getName(); + LOG(INFO) << "check parameter : " << paramName; + + // check parameter value + CpuVector paraValue1(p1->getSize()); + CpuVector paraValue2(p2->getSize()); + paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE)); + paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE)); + + // check gradient + CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT)); + CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT)); + if (paramName == "rand_fc_param.bias") { + checkMatrix( + paraValue1.getData(), paraValue2.getData(), paraValue1.getSize()); + checkMatrix( + paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize()); + } else { + checkTranspose(paraValue1.getData(), + paraValue2.getData(), + fcLayerWidth, + paraValue1.getSize()); + checkTranspose(paraGrad1.getData(), + paraGrad2.getData(), + fcLayerWidth, + paraGrad1.getSize()); + } + } +} + +void compareSparseMulOutput( + real* fcOutput, + real* selOutput, + size_t nnz, + const std::shared_ptr>>& selCols) { +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + size_t nnzCount = + std::accumulate(selCols->begin(), + selCols->end(), + 0UL, + [](size_t a, const std::pair& arr) { + return a + arr.second; + }); + EXPECT_EQ(nnz, nnzCount); + + size_t sampleNum = selCols->size(); + int diffNum = 0; + size_t count = 0; + for (size_t i = 0; i < sampleNum; ++i) { + for (size_t j = 0; j < 
(*selCols)[i].second; ++j) { + size_t selIdx = (*selCols)[i].first[j]; + if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) { + diffNum++; + LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx] + << "\t" << selOutput[count]; + } + count++; + } + } + EXPECT_EQ(0, diffNum); +} + +LayerPtr creatDataLayer(string name, + size_t batchSize, + size_t layerSize, + std::vector& values, + bool useGpu) { + LayerConfig dataConfig; + dataConfig.set_name(name); + dataConfig.set_type("data"); + dataConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(dataConfig)); + + Argument data; + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->copyFrom(values.data(), batchSize * layerSize); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_TEST); + return layer; +} + +ParameterPtr creatParameter( + string name, int pid, size_t paraSize, string paramFile, bool useGpu) { + ParameterConfig paraConfig; + paraConfig.set_name(name); + paraConfig.set_size(paraSize); + + ParameterPtr parameter = + std::make_shared(paraConfig, useGpu, /*initialize */ false); + parameter->enableType(PARAMETER_VALUE); + parameter->randomize(); + parameter->setID(pid); + parameter->load(paramFile); + return parameter; +} + +LayerPtr initFcLayer(LayerPtr dataLayer, + LayerConfig layerConfig, + int dataLayerSize, + int fcLayerSize, + string paraName, + string paraFile, + bool useGpu) { + LayerMap layerMap; + ParameterMap parameterMap; + + layerMap[dataLayer->getName()] = dataLayer; + ParameterPtr para = creatParameter( + paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu); + parameterMap[para->getName()] = para; + + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name(dataLayer->getName()); + input.set_input_parameter_name(paraName); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->setNeedGradient(false); + testLayer->init(layerMap, parameterMap); + return testLayer; +} + +#ifndef PADDLE_TYPE_DOUBLE +// The parameter file used in fc.conf and selective_fc.conf is float +TEST(Layer, SelectiveFcLayer_train_dense_mul) { + const string& fcConfig = "legacy/gserver/tests/SelectiveFcTest/conf/fc.conf"; + const string& fcConfigArgs = + "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list"; + const string& selFcConfig = + "legacy/gserver/tests/SelectiveFcTest/conf/selective_fc.conf"; + const string& selConfigArgs = + "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list"; + + for (auto useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) { + break; + } +#endif + LOG(INFO) << "FullyConnectedLayer forwardBackward()"; + ComData fcData; + calcOutput(fcData, fcConfig, fcConfigArgs, useGpu); + + LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()"; + ComData selFcData; + calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu); + compareOutput(fcData, selFcData); + } +} +#endif // PADDLE_TYPE_DOUBLE + +void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, + bool useGpu) { + FLAGS_use_gpu = useGpu; + size_t batchSize = 100; + size_t dataLayerSize = 512; + std::vector values(batchSize * dataLayerSize); + for (size_t j = 0; j < batchSize * dataLayerSize; ++j) { + values[j] = std::rand() / real(RAND_MAX); + } + LayerPtr dataLayer = + creatDataLayer("data", batchSize, dataLayerSize, values, useGpu); + + const string& 
selfcParaFile = + "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose"; + const string& selfcParaName = "rand_fc_param.w.transpose"; + + std::shared_ptr selfcLayer = + std::dynamic_pointer_cast( + initFcLayer(dataLayer, + config, + dataLayerSize, + fcLayerWidth, + selfcParaName, + selfcParaFile, + useGpu)); + + // create selected columns + std::shared_ptr>> selCols( + new std::vector>(batchSize)); + size_t maxNNZ = 30; + srand((size_t)(time(NULL))); + int total = 0; + while (total == 0) { + for (size_t i = 0; i < batchSize; ++i) { + size_t num = std::rand() % maxNNZ; + int* data = new int[num]; + randint(data, fcLayerWidth, num); + (*selCols)[i] = std::make_pair(data, num); + total += num; + } + } + selfcLayer->fillSelectiveData(selCols); + selfcLayer->forward(PASS_TEST); + + MatrixPtr outMatSelfc = selfcLayer->getOutputValue(); + CpuSparseMatrixPtr cpuOutMatSelfc( + new CpuSparseMatrix(outMatSelfc->getHeight(), + outMatSelfc->getWidth(), + outMatSelfc->getElementCnt())); + cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT); +#ifdef PADDLE_WITH_CUDA + if (useGpu) { + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + } +#endif + real* outValueSelfc = cpuOutMatSelfc->getValue(); + size_t nnz = cpuOutMatSelfc->getElementCnt(); + + const string& fcParaFile = + "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w"; + const string& fcParaName = "rand_fc_param.w"; + LayerConfig fcLayerConfig; + fcLayerConfig.set_name("fc_layer"); + fcLayerConfig.set_type("fc"); + fcLayerConfig.set_active_type("linear"); + fcLayerConfig.set_size(fcLayerWidth); + + LayerPtr fcLayer = initFcLayer(dataLayer, + fcLayerConfig, + dataLayerSize, + fcLayerWidth, + fcParaName, + fcParaFile, + useGpu); + fcLayer->forward(PASS_TEST); + + MatrixPtr outMatFc = fcLayer->getOutputValue(); + MatrixPtr cpuOutMatFc( + new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth())); + cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT); +#ifdef PADDLE_WITH_CUDA + if (useGpu) { + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + } +#endif + real* outValueFc = cpuOutMatFc->getData(); + + compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols); + for (size_t i = 0; i < batchSize; ++i) { + delete[](*selCols)[i].first; + } +} + +#ifndef PADDLE_TYPE_DOUBLE +// The parameter file used in testSelectiveFcLayerTrainSparseMul is float +TEST(Layer, SelectiveFcLayer_train_sparse_mul) { + LayerConfig selLayerConfig; + selLayerConfig.set_name("sel_fc"); + selLayerConfig.set_type("selective_fc"); + selLayerConfig.set_active_type("linear"); + selLayerConfig.set_has_selected_colums(false); + selLayerConfig.set_selective_fc_pass_generation(true); + selLayerConfig.set_size(fcLayerWidth); + + testSelectiveFcLayerTrainSparseMul(selLayerConfig, false); +#ifdef PADDLE_WITH_CUDA + testSelectiveFcLayerTrainSparseMul(selLayerConfig, true); +#endif +} +#endif // PADDLE_TYPE_DOUBLE + +// TODO(dangqingqing) test multi threads after support in matrix +// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) { +// LayerConfig selLayerConfig; +// selLayerConfig.set_name("sel_fc"); +// selLayerConfig.set_type("selective_fc"); +// selLayerConfig.set_active_type("linear"); +// selLayerConfig.set_has_selected_colums(false); +// selLayerConfig.set_selective_fc_pass_generation(true); +// selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10); +// selLayerConfig.set_selective_fc_full_mul_ratio(1000); +// selLayerConfig.set_size(fcLayerWidth); +// SelectiveFcLayer_test(selLayerConfig, false); +// } + +int main(int argc, char** 
argv) {
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  initPython(argc, argv);
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}
diff --git a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05acd714219fa5964b5b3595543682825ea67d84
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+
+const size_t SEED = (size_t)(time(NULL));
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // layer size is not crucial for this layer,
+  // so here use a small layer size in the unittest.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return
RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/gserver/tests/test_Upsample.cpp b/paddle/legacy/gserver/tests/test_Upsample.cpp new file mode 100644 index 0000000000000000000000000000000000000000..940d46baf73f2d600cff6edc37c29a3a36bf5d90 --- /dev/null +++ b/paddle/legacy/gserver/tests/test_Upsample.cpp @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "LayerGradUtil.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/testing/TestUtil.h" + +void setPoolConfig(paddle::TestConfig* config, + paddle::PoolConfig* pool, + const string& poolType) { + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool"); + (*config).layerConfig.set_num_filters(1); + + int kw = 2, kh = 2; + int pw = 0, ph = 0; + int sw = 2, sh = 2; + pool->set_pool_type(poolType); + pool->set_channels(2); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_start(0); + pool->set_padding(pw); + pool->set_padding_y(ph); + pool->set_stride(sw); + pool->set_stride_y(sh); + + int ow = + paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = + paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); +} + +paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat, + const string& poolType, + bool use_gpu, + real* tempGradData) { + /* prepare maxPoolWithMaskLayer */ + paddle::TestConfig config; + config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0}); + paddle::LayerInputConfig* input = config.layerConfig.add_inputs(); + paddle::PoolConfig* pool = input->mutable_pool_conf(); + + pool->set_img_size(8); + pool->set_img_size_y(8); + setPoolConfig(&config, pool, "max-pool-with-mask"); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + config.layerConfig.set_name("MaxPoolWithMask"); + + std::vector dataLayers; + paddle::LayerMap layerMap; + vector datas; + + initDataLayer(config, + &dataLayers, + &datas, + &layerMap, + "MaxPoolWithMask", + 1, + false, + use_gpu); + + dataLayers[0]->getOutputValue()->copyFrom(*inputMat); + + FLAGS_use_gpu = use_gpu; + std::vector parameters; + paddle::LayerPtr maxPoolingWithMaskOutputLayer; + initTestLayer(config, &layerMap, ¶meters, &maxPoolingWithMaskOutputLayer); + maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC); + + /* prepare the upsample layer */ + paddle::LayerConfig upsampleLayerConfig; + upsampleLayerConfig.set_type("upsample"); + paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs(); + upsampleLayerConfig.add_inputs(); + + paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf(); + upsampleConfig->set_scale(2); + paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf(); + imageConfig->set_channels(2); + imageConfig->set_img_size(4); + imageConfig->set_img_size_y(4); + upsampleLayerConfig.set_size(2 * 8 * 8); + upsampleLayerConfig.set_name("upsample"); + + 
for (size_t i = 0; i < 2; i++) { + paddle::LayerInputConfig& inputTemp = + *(upsampleLayerConfig.mutable_inputs(i)); + inputTemp.set_input_layer_name("MaxPoolWithMask"); + } + + paddle::LayerPtr upsampleLayer; + paddle::ParameterMap parameterMap; + upsampleLayer = paddle::Layer::create(upsampleLayerConfig); + layerMap[upsampleLayerConfig.name()] = upsampleLayer; + upsampleLayer->init(layerMap, parameterMap); + upsampleLayer->setNeedGradient(true); + upsampleLayer->forward(paddle::PASS_GC); + upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); + upsampleLayer->backward(); + + return upsampleLayer; +} + +TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { + bool useGpu = false; + paddle::MatrixPtr inputMat; + paddle::MatrixPtr inputGPUMat; + paddle::MatrixPtr tempGradMat; + + inputMat = paddle::Matrix::create(1, 128, false, useGpu); + inputMat->randomizeUniform(); + + tempGradMat = paddle::Matrix::create(1, 128, false, useGpu); + tempGradMat->randomizeUniform(); + real* tempGradData = tempGradMat->getData(); + + paddle::LayerPtr upsampleLayerCPU = + doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); + +#ifdef PADDLE_WITH_CUDA + useGpu = true; + real* data = inputMat->getData(); + inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu); + inputGPUMat->copyFrom(data, 128); + paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest( + inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); + paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value, + upsampleLayerGPU->getOutput("").value); + + paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), + upsampleLayerGPU->getPrev(0)->getOutputGrad()); +#endif +} diff --git a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b1697e1616484ec5389cdb5b59ba413a9615cf2e --- /dev/null +++ b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "ModelConfig.pb.h" +#include "paddle/legacy/gserver/layers/CTCLayer.h" +#include "paddle/legacy/gserver/layers/DataLayer.h" +#include "paddle/legacy/gserver/layers/Layer.h" +#include "paddle/legacy/gserver/layers/WarpCTCLayer.h" + +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_bool(use_gpu); + +const real* getData(const Matrix& matrix) { + if (matrix.useGpu()) { + MatrixPtr cpuMatrix = Matrix::create( + matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); + cpuMatrix->copyFrom(matrix); + return cpuMatrix->getData(); + } else { + return matrix.getData(); + } +} + +int checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); + CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); + CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + + const real* data1 = getData(matrix1); + const real* data2 = getData(matrix2); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; + return count; +} + +void initArgument(size_t batchSize, + int layerSize, + bool useGpu, + Argument& data) { + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.grad = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); +} + +LayerPtr createDataLayer( + string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createLabelLayer(string name, + size_t batchSize, + size_t numClasses, + bool useGpu) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(1); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + Argument data; + data.ids = IVector::create(batchSize, useGpu); + data.ids->rand(numClasses - 1); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); + labelLayer->setData(data); + labelLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + 
input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); + + softmaxActivation->forward(dataLayer->getOutput()).check(); + layer->forward(PASS_GC); + + layer->backward(); + softmaxActivation->backward(dataLayer->getOutput()).check(); + + return layer; +} + +LayerPtr createWarpCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("warp_ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_blank(numClasses - 1); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + layer->forward(PASS_GC); + layer->backward(); + + return layer; +} + +TEST(Layer, WarpCTCLayer) { + for (auto layerSize : {10, 64}) { + for (auto batchSize : {1, 10, 32}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { +#ifndef PADDLE_WITH_CUDA + if (useGpu) continue; +#endif + LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; + + FLAGS_use_gpu = useGpu; + + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); + + Argument data1; + data1.resizeAndCopyFrom(data0); + + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); + + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); + + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); + + /// Check cost + LOG(INFO) << "Check cost: " + << checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)) + << " different elements."; + + /// Check gradients + LOG(INFO) << "Check gradients: " + << checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)) + << " different elements"; + } + } + } + } +} diff --git a/paddle/legacy/math/Allocator.h b/paddle/legacy/math/Allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..ffb5ec1cad4113c2035daad8c385bbe57a161079 --- /dev/null +++ b/paddle/legacy/math/Allocator.h @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "hl_gpu.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +/** + * @brief Allocator base class. + * + * This is the base class of all Allocator class. + */ +class Allocator { + public: + virtual ~Allocator() {} + virtual void* alloc(size_t size) = 0; + virtual void free(void* ptr) = 0; + virtual std::string getName() = 0; +}; + +/** + * @brief CPU allocator implementation. + */ +class CpuAllocator : public Allocator { + public: + ~CpuAllocator() {} + + /** + * @brief Aligned allocation on CPU. + * @param size Size to be allocated. + * @return Pointer to the allocated memory + */ + virtual void* alloc(size_t size) { + void* ptr; +#ifdef PADDLE_WITH_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); +#else + CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); +#endif + CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; + return ptr; + } + + /** + * @brief Free the memory space. + * @param ptr Pointer to be free. + */ + virtual void free(void* ptr) { + if (ptr) { + ::free(ptr); + } + } + + virtual std::string getName() { return "cpu_alloc"; } +}; + +/** + * @brief GPU allocator implementation. + */ +class GpuAllocator : public Allocator { + public: + ~GpuAllocator() {} + + /** + * @brief Allocate GPU memory. + * @param size Size to be allocated. + * @return Pointer to the allocated memory + */ + virtual void* alloc(size_t size) { + void* ptr = hl_malloc_device(size); + CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes"; + return ptr; + } + + /** + * @brief Free the GPU memory. + * @param ptr Pointer to be free. + */ + virtual void free(void* ptr) { + if (ptr) { + hl_free_mem_device(ptr); + } + } + + virtual std::string getName() { return "gpu_alloc"; } +}; + +/** + * @brief CPU pinned memory allocator implementation. + */ +class CudaHostAllocator : public Allocator { + public: + ~CudaHostAllocator() {} + + /** + * @brief Allocate pinned memory. + * @param size Size to be allocated. + * @return Pointer to the allocated memory + */ + virtual void* alloc(size_t size) { + void* ptr = hl_malloc_host(size); + CHECK(ptr) << "Fail to allocate pinned memory " << size << " bytes"; + return ptr; + } + + /** + * @brief Free the pinned memory. + * @param ptr Pointer to be free. + */ + virtual void free(void* ptr) { + if (ptr) { + hl_free_mem_host(ptr); + } + } + + virtual std::string getName() { return "cuda_host_alloc"; } +}; + +} // namespace paddle diff --git a/paddle/legacy/math/BaseMatrix.cu b/paddle/legacy/math/BaseMatrix.cu new file mode 100644 index 0000000000000000000000000000000000000000..7e7cdc57a9887152ecd9e0bbd9fe14fcba56799d --- /dev/null +++ b/paddle/legacy/math/BaseMatrix.cu @@ -0,0 +1,1953 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "BaseMatrix.h" +#include "MathFunctions.h" +#include "NEONFunctions.h" +#include "SIMDFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_base.cuh" +#include "hl_matrix_ops.cuh" + +namespace paddle { + +const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; + +template +template +int BaseMatrixT::applyUnary(Op op) { + MatrixOffset offset(0, 0); + applyUnary(op, height_, width_, offset); + return 0; +} + +template +template +int BaseMatrixT::applyUnary(Op op, + int numRows, + int numCols, + MatrixOffset& offset) { + CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; + int dimM = numRows; + int dimN = numCols; + int lda = stride_; + + T* A = data_; + CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); + + CHECK_LE(dimM + offset.aRow_, this->height_); + CHECK_LE(dimN + offset.aCol_, this->width_); + if (true == useGpu_) { + hl_gpu_apply_unary_op(op, A, dimM, dimN, lda); + } else { + hl_cpu_apply_unary_op(op, A, dimM, dimN, lda); + } + return 0; +} + +template +template +int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { + CHECK(height_ == b.height_ && width_ == b.width_) + << "Matrix dimensions are not equal"; + + MatrixOffset offset(0, 0, 0, 0); + applyBinary(op, b, height_, width_, offset); + return 0; +} + +template +template +int BaseMatrixT::applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { + applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); + return 0; +} + +template +template +int BaseMatrixT::applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector) { + CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; + + int dimM = numRows; + int dimN = numCols; + int lda = stride_; + int ldb = b.stride_; + + T* A = data_; + T* B = b.data_; + CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CHECK_LE(dimM + offset.aRow_, this->height_); + CHECK_LE(dimN + offset.aCol_, this->width_); + if (!bAsRowVector::value && !bAsColVector::value) { + CHECK_LE(dimM + offset.bRow_, b.height_); + CHECK_LE(dimN + offset.bCol_, b.width_); + } else if (bAsRowVector::value && !bAsColVector::value) { + CHECK_LE(dimN + offset.bCol_, b.width_); + } else if (!bAsRowVector::value && bAsColVector::value) { + CHECK_LE(dimM + offset.bRow_, b.height_); + } else { + } + if (true == useGpu_) { + hl_gpu_apply_binary_op( + op, A, B, dimM, dimN, lda, ldb); + } else { + hl_cpu_apply_binary_op( + op, A, B, dimM, dimN, lda, ldb); + } + + return 0; +} + +template +template +int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { + CHECK_EQ(height_, b.height_); + CHECK_EQ(width_, b.width_); + CHECK_EQ(height_, c.height_); + CHECK_EQ(width_, c.width_); + + MatrixOffset offset(0, 0, 0, 0, 0, 0); + applyTernary(op, b, c, height_, width_, offset); + + return 0; +} + +template 
+template +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset) { + applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); + + return 0; +} + +template +template +int BaseMatrixT::applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector) { + CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK_EQ(useGpu_, b.useGpu_); + CHECK_EQ(useGpu_, c.useGpu_); + + int dimM = numRows; + int dimN = numCols; + int lda = stride_; + int ldb = b.stride_; + int ldc = c.stride_; + + T* A = data_; + T* B = b.data_; + T* C = c.data_; + CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + + CHECK_LE(dimM + offset.aRow_, this->height_); + CHECK_LE(dimN + offset.aCol_, this->width_); + CHECK_LE(dimM + offset.bRow_, b.height_); + CHECK_LE(dimN + offset.bCol_, b.width_); + if (!cAsRowVector::value && !cAsColVector::value) { + CHECK_LE(dimM + offset.cRow_, c.height_); + CHECK_LE(dimN + offset.cCol_, c.width_); + } else if (cAsRowVector::value && !cAsColVector::value) { + CHECK_LE(dimN + offset.cCol_, c.width_); + } else if (!cAsRowVector::value && cAsColVector::value) { + CHECK_LE(dimM + offset.cRow_, c.height_); + } else { + } + + if (true == useGpu_) { + hl_gpu_apply_ternary_op( + op, A, B, C, dimM, dimN, lda, ldb, ldc); + } else { + hl_cpu_apply_ternary_op( + op, A, B, C, dimM, dimN, lda, ldb, ldc); + } + + return 0; +} + +template +template +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d) { + CHECK_EQ(height_, b.height_); + CHECK_EQ(width_, b.width_); + CHECK_EQ(height_, c.height_); + CHECK_EQ(width_, c.width_); + CHECK_EQ(height_, d.height_); + CHECK_EQ(width_, d.width_); + + MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0); + applyQuaternary(op, b, c, d, height_, width_, offset); + + return 0; +} + +template +template +int BaseMatrixT::applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, + MatrixOffset& offset) { + CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR; + CHECK_EQ(useGpu_, b.useGpu_); + CHECK_EQ(useGpu_, c.useGpu_); + CHECK_EQ(useGpu_, d.useGpu_); + + int dimM = numRows; + int dimN = numCols; + int lda = stride_; + int ldb = b.stride_; + int ldc = c.stride_; + int ldd = d.stride_; + + T* A = data_; + T* B = b.data_; + T* C = c.data_; + T* D = d.data_; + CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + CAL_MATRIX_START_ADDRESS( + D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); + + CHECK_LE(dimM + offset.aRow_, this->height_); + CHECK_LE(dimN + offset.aCol_, this->width_); + CHECK_LE(dimM + offset.bRow_, b.height_); + CHECK_LE(dimN + offset.bCol_, b.width_); + CHECK_LE(dimM + offset.cRow_, c.height_); + CHECK_LE(dimN + offset.cCol_, c.width_); + CHECK_LE(dimM + offset.dRow_, 
d.height_); + CHECK_LE(dimN + offset.dCol_, d.width_); + if (true == useGpu_) { + hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); + } else { + hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); + } + + return 0; +} + +template +template +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { + CHECK_EQ(useGpu_, b.useGpu_); + + int ld = stride_; + int ldb = b.stride_; + + T* dst = data_; + T* B = b.data_; + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + + if (aAsRowVector::value && !aAsColVector::value) { + if (useGpu_) { + hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb); + } else { + hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb); + } + } else if (!aAsRowVector::value && aAsColVector::value) { + if (useGpu_) { + hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb); + } else { + hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb); + } + } else { + LOG(FATAL) << "not supported"; + } + + return 0; +} + +template +template +int BaseMatrixT::aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector) { + CHECK_EQ(useGpu_, b.useGpu_); + CHECK_EQ(useGpu_, c.useGpu_); + + int ld = stride_; + int ldb = b.stride_; + int ldc = c.stride_; + + T* dst = data_; + T* B = b.data_; + T* C = c.data_; + CAL_MATRIX_START_ADDRESS( + dst, height_, width_, ld, offset.aCol_, offset.aRow_); + CAL_MATRIX_START_ADDRESS( + B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); + CAL_MATRIX_START_ADDRESS( + C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); + + if (aAsRowVector::value && !aAsColVector::value) { + if (useGpu_) { + hl_gpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); + } else { + hl_cpu_matrix_column_op( + agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); + } + } else if (!aAsRowVector::value && aAsColVector::value) { + if (useGpu_) { + hl_gpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); + } else { + hl_cpu_matrix_row_op( + agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); + } + } else { + LOG(FATAL) << "not supported"; + } + + return 0; +} + +/** + * @brief unary operator. + * + */ + +DEFINE_MATRIX_UNARY_OP(Neg, a = -a); +template +void BaseMatrixT::neg() { + applyUnary(unary::Neg()); +} + +DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); +template <> +void BaseMatrixT::exp2() { + applyUnary(unary::Exp()); +} + +DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); +template <> +void BaseMatrixT::log2() { + if (useGpu_) { + applyUnary(unary::Log()); + } else { + vLog(height_ * width_, data_, data_); + } +} + +DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); +template <> +void BaseMatrixT::sqrt2() { + applyUnary(unary::Sqrt()); +} + +DEFINE_MATRIX_UNARY_OP(Square, a = a * a); +template +void BaseMatrixT::square2() { + applyUnary(unary::Square()); +} + +DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); +template +void BaseMatrixT::reciprocal2() { + applyUnary(unary::Reciprocal()); +} + +DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? 
a : -a); +template +void BaseMatrixT::abs2() { + applyUnary(unary::Abs()); +} + +DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); +template +void BaseMatrixT::sign2() { + applyUnary(unary::Sign()); +} + +DEFINE_MATRIX_UNARY_OP(Zero, a = 0); +template +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} + +template +void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { + int numRows = height_; + int numCols = numColumns; + MatrixOffset offset(columnOffset, 0); + applyUnary(unary::Zero(), numRows, numCols, offset); +} + +DEFINE_MATRIX_UNARY_OP(One, a = 1); +template +void BaseMatrixT::one() { + applyUnary(unary::One()); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); +template <> +void BaseMatrixT::pow2(real p) { + if (useGpu_) { + applyUnary(unary::Pow(p)); + } else { + vPow(height_ * width_, data_, p, data_); + } +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); +template +void BaseMatrixT::subScalar(T p) { + applyUnary(unary::SubScalar(p)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); +template +void BaseMatrixT::mulScalar(T p) { + applyUnary(unary::MulScalar(p)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); +template +void BaseMatrixT::divScalar(T p) { + applyUnary(unary::DivScalar(p)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); +template +void BaseMatrixT::assign(T p) { + applyUnary(unary::Assign(p)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); +template +void BaseMatrixT::add(T p) { + applyUnary(unary::Add(p)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); +template +void BaseMatrixT::add(T p1, T p2) { + applyUnary(unary::Add2(p1, p2)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, + TWO_PARAMETER, + a = a < p1 ? p1 : (a > p2 ? p2 : a)); +template +void BaseMatrixT::clip(T p1, T p2) { + applyUnary(unary::Clip(p1, p2)); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, + TWO_PARAMETER, + a = b < p1 ? 0 : (b > p2 ? 0 : 1)); +template +void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::ClipDerivative(p1, p2), b); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, + ONE_PARAMETER, + a = a > p ? 1.0f : 0.0f); +template +void BaseMatrixT::biggerThanScalar(T p) { + applyUnary(unary::BiggerThanScalar(p)); +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); +template +void BaseMatrixT::downClip(T p) { + applyUnary(unary::DownClip(p)); +} + +/** + * @brief binary operator. 
+ * + */ + +DEFINE_MATRIX_BINARY_OP(Add, a += b); +template +void BaseMatrixT::add(BaseMatrixT& b) { + applyBinary(binary::Add(), b); +} + +template <> +void BaseMatrixT::add(BaseMatrixT& b) { + if (useGpu_) { + applyBinary(binary::Add(), b); + } else { // cpu branch + CHECK_EQ(height_, b.height_); + CHECK_EQ(width_, b.width_); + vAdd(height_ * width_, data_, b.data_, data_); + } +} + +template +void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { + if (columnOffset + b.width_ <= width_) { + int numRows = height_; + int numCols = b.width_; + MatrixOffset offset(columnOffset, 0, 0, 0); + applyBinary(binary::Add(), b, numRows, numCols, offset); + } else if (columnOffset + width_ <= b.width_) { + int numRows = height_; + int numCols = width_; + MatrixOffset offset(0, 0, columnOffset, 0); + applyBinary(binary::Add(), b, numRows, numCols, offset); + } else { + LOG(FATAL) << "Wrong argument " + << " a.width=" << width_ << " b.width=" << b.width_ + << " columnOffset=" << columnOffset; + } +} + +template +void BaseMatrixT::addP2P(BaseMatrixT& b) { + T* A = data_; + T* B = b.data_; + int dimM = height_; + int dimN = width_; + + hl_gpu_apply_binary_op, 0, 0>( + binary::Add(), A, B, dimM, dimN, dimN, dimN); +} + +template +void BaseMatrixT::addColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); +} + +template +void BaseMatrixT::addRowVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::Add(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); +template +void BaseMatrixT::add(BaseMatrixT& b, T p) { + applyBinary(binary::Add1(p), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); +template <> +void BaseMatrixT::pow2(BaseMatrixT& b, real p) { + if (useGpu_) { + applyBinary(binary::Pow(p), b); + } else { + vPow(height_ * width_, b.data_, p, data_); + } +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); +template +void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::Add2(p1, p2), b); +} + +template +void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::Add1(scale), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); +} + +DEFINE_MATRIX_BINARY_OP(Sub, a -= b); +template +void BaseMatrixT::sub(BaseMatrixT& b) { + applyBinary(binary::Sub(), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); +template +void BaseMatrixT::sub(BaseMatrixT& b, T p) { + applyBinary(binary::Sub1(p), b); +} + +DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); +template +void BaseMatrixT::relu(BaseMatrixT& b) { + applyBinary(binary::Relu(), b); +} + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +template <> +void BaseMatrixT::relu(BaseMatrixT& b) { + neon::relu(data_, b.data_, height_ * width_); +} +#endif + +DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); +template +void BaseMatrixT::reluDerivative(BaseMatrixT& b) { + applyBinary(binary::ReluDerivative(), b); +} + +DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; + b = log(1.0 + exp((a > THRESHOLD) + ? 
THRESHOLD + : ((a < -THRESHOLD) ? (-THRESHOLD) + : a)))); +template <> +void BaseMatrixT::softrelu(BaseMatrixT& b) { + applyBinary(binary::Softrelu(), b); +} + +DEFINE_MATRIX_BINARY_OP( + SoftreluDerivative, const T THRESHOLD = 40.0; + a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) + ? THRESHOLD + : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); +template <> +void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { + applyBinary(binary::SoftreluDerivative(), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; + b = b < p2 ? b : p2); +template +void BaseMatrixT::brelu(BaseMatrixT& b) { + int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. + applyBinary(binary::Brelu(p1, p2), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, + TWO_PARAMETER, + a *= (b > p1 && b < p2) ? 1.0 : 0.0); +template +void BaseMatrixT::breluDerivative(BaseMatrixT& b) { + int p1 = 0, p2 = 24; + applyBinary(binary::BreluDerivative(p1, p2), b); +} + +DEFINE_MATRIX_BINARY_OP(Square, b = a * a); +template +void BaseMatrixT::square2(BaseMatrixT& b) { + applyBinary(binary::Square(), b); +} + +DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); +template +void BaseMatrixT::squareDerivative(BaseMatrixT& b) { + applyBinary(binary::SquareDerivative(), b); +} + +DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template <> +void BaseMatrixT::tanh(BaseMatrixT& b) { + applyBinary(binary::Tanh(), b); +} + +DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); +template +void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { + applyBinary(binary::TanhDerivative(), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP( + ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); +template <> +void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { + applyBinary(binary::ScaledTanh(p1, p2), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, + TWO_PARAMETER, + a *= p2 * (p1 - b * b)); +template +void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); +} + +DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); +template +void BaseMatrixT::reciprocal2(BaseMatrixT& b) { + applyBinary(binary::Reciprocal(), b); +} + +DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); +template +void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { + applyBinary(binary::ReciprocalDerivative(), b); +} + +DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); +template +void BaseMatrixT::abs2(BaseMatrixT& b) { + applyBinary(binary::Abs(), b); +} + +DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); +template +void BaseMatrixT::absDerivative(BaseMatrixT& b) { + applyBinary(binary::AbsDerivative(), b); +} + +DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? 
THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))); +template <> +void BaseMatrixT::sigmoid(BaseMatrixT& b) { + if (useGpu_) { + applyBinary(binary::Sigmoid(), b); + } else { // cpu versioni + size_t numSamples = this->height_; + size_t dim = this->width_; + CHECK_EQ(b.height_, numSamples); + CHECK_EQ(b.width_, dim); + const real* in = this->data_; + real* out = b.data_; + + // out = - in + const float THRESHOLD_MIN = -40.0; // make sure sigmoid(x) > 0 + const float THRESHOLD_MAX = 13.0; // make sure sigmoid(x) < 1 + for (size_t i = 0; i < numSamples * dim; ++i) { + real tmp = in[i]; + tmp = (tmp < THRESHOLD_MIN) + ? THRESHOLD_MIN + : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp); + out[i] = -tmp; + } + + // out = exp(out) + vExp(numSamples * dim, out, out); + + // out = 1 / (1 + out) + for (size_t i = 0; i < numSamples * dim; ++i) { + out[i] = 1 / (1 + out[i]); + } + } +} + +DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); +template +void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { + applyBinary(binary::SigmoidDerivative(), b); +} + +DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); +template +void BaseMatrixT::expDerivative(BaseMatrixT& b) { + applyBinary(binary::ExpDerivative(), b); +} + +DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); +template +void BaseMatrixT::sign2(BaseMatrixT& b) { + applyBinary(binary::Sign(), b); +} + +DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); +template <> +void BaseMatrixT::exp2(BaseMatrixT& b) { + applyBinary(binary::Exp(), b); +} + +DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); +template <> +void BaseMatrixT::log2(BaseMatrixT& b) { + if (useGpu_) { + applyBinary(binary::Log(), b); + } else { + vLog(height_ * width_, b.data_, data_); + } +} + +DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); +template <> +void BaseMatrixT::sqrt2(BaseMatrixT& b) { + applyBinary(binary::Sqrt(), b); +} + +DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); +template <> +void BaseMatrixT::invSqrt(BaseMatrixT& b) { + if (useGpu_) { + applyBinary(binary::InvSqrt(), b); + } else { // cpu branch + CHECK_EQ(height_, b.height_); + CHECK_EQ(width_, b.width_); + vInvSqrt(height_ * width_, b.data_, data_); + } +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); +template +void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { + applyBinary(binary::IsEqual(value), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); +template +void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { + applyBinary(binary::AddScalar(p), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); +template +void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { + applyBinary(binary::SubScalar(p), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); +template +void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { + applyBinary(binary::MulScalar(p), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); +template +void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { + applyBinary(binary::DivScalar(p), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); +template +void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { + applyBinary(binary::ScalarDiv(p), b); +} + +/** + * @brief ternary operator. 
+ * + */ + +DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, + a = -c * log(b) - (1 - c) * log(1 - b)); +template <> +void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::SoftCrossEntropy(), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); +template +void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::SoftCrossEntropyBp(), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, + a = c > 0.5 ? -log(b) : -log(1.0 - b)); +template <> +void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, + BaseMatrixT& c) { + if (useGpu_) { + applyTernary(ternary::BinaryCrossEntropy(), b, c); + } else { + CHECK_EQ(height_, b.height_); + CHECK_EQ(height_, c.height_); + CHECK_EQ(width_, b.width_); + CHECK_EQ(width_, c.width_); + + size_t size = height_ * width_; + real* out = b.data_; + real* label = c.data_; + real* cost = data_; + + for (size_t i = 0; i < size; ++i) { + cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i]; + } + vLog(size, cost, cost); + for (size_t i = 0; i < size; ++i) { + cost[i] *= -1.0; + } + } +} + +DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, + a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)); +template +void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::BinaryCrossEntropyBp(), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); +template +void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::Add(), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); +template +void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { + applyTernary(ternary::Add1(p1, p2), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); +template +void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::Sub(), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); +template +void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { + applyTernary(ternary::Sub1(p1, p2), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); +template +void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::Add2(), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, + a = p1 * a + p2 * b + p3 * c); +template +void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { + applyTernary(ternary::Add3(p1, p2, p3), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, + c = p2 * c - p1 * (b + p3 * a); + a = a + c); +template +void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad + BaseMatrixT& c, // mom + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate + applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); +} + +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, + THREE_PARAMETER, + c = p2 * c - p1 * d * (b + p3 * a); + a += c); +template +void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, + BaseMatrixT& c, // mom, + BaseMatrixT& d, // lr, + T p1, // learningRate, + T p2, // momentum, + T p3) { // decayRate + applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; + a = (a > lambda) + ? (a - lambda) + : (a < -lambda) ? 
(a + lambda) : 0); +template +void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { + applyBinary(binary::ApplyL1(learningRate * decayRate), lr); +} + +template <> +void BaseMatrixT::applyL1(BaseMatrixT& lr, + real learningRate, + real decayRate) { + if (useGpu_) { + applyBinary(binary::ApplyL1(learningRate * decayRate), lr); + } else { + simd::decayL1(this->data_, + this->data_, + lr.data_, + learningRate * decayRate, + height_ * width_); + } +} + +DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; + a = (a > lambda) + ? (a - lambda) + : (a < -lambda) ? (a + lambda) : 0); +template +void BaseMatrixT::applyL1(T learningRate, T decayRate) { + applyUnary(unary::ApplyL1(learningRate * decayRate)); +} + +template <> +void BaseMatrixT::applyL1(real learningRate, real decayRate) { + if (useGpu_) { + applyUnary(unary::ApplyL1(learningRate * decayRate)); + } else { + simd::decayL1( + this->data_, this->data_, learningRate * decayRate, height_ * width_); + } +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, + ONE_PARAMETER, + a *= (1.0f / (1.0f + p * b))); +template +void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { + if (useGpu_) { + applyBinary(binary::ApplyL2(learningRate * decayRate), lr); + } else { + size_t size = this->height_ * this->width_; + T decay = learningRate * decayRate; + for (size_t j = 0; j < size; ++j) { + this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]); + } + } +} + +template +void BaseMatrixT::applyL2(T learningRate, T decayRate) { + BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); +} + +DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); +template +void BaseMatrixT::dotMul(BaseMatrixT& b) { + applyBinary(binary::DotMul(), b); +} + +DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); +template +void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::DotMul(), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); +template +void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::DotDiv(), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, + TWO_PARAMETER, + a = (b + p1) / (c + p2)); +template +void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { + applyTernary(ternary::DotDiv2P(p1, p2), b, c); +} + +DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; + a = (a > THRESHOLD) + ? THRESHOLD + : ((a < -THRESHOLD) ? (-THRESHOLD) : a); + a = log(1 + exp(a)) - a * d); +template <> +void BaseMatrixT::rankLoss(BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d) { + applyQuaternary(quaternary::RankLoss(), b, c, d); +} + +DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; + a = (a > THRESHOLD) + ? THRESHOLD + : ((a < -THRESHOLD) ? (-THRESHOLD) : a); + a = exp(a); + a = (a / (1 + a) - d)); +template <> +void BaseMatrixT::rankLossBp(BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d) { + applyQuaternary(quaternary::RankLossBp(), b, c, d); +} + +/* this = log(1 + exp(b)) - c * b */ +DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; + T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) + ? -THRESHOLD + : b; + a = log(1 + exp(x)) - c * x); +template <> +void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::LogisticRegressionLoss(), b, c); +} + +/* this = exp(b)/(1+exp(b)) - c */ +DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; + T x = (b > THRESHOLD) ? 
THRESHOLD : (b < -THRESHOLD) + ? -THRESHOLD + : b; + x = exp(x); + a = x / (1 + x) - c); +template <> +void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, + BaseMatrixT& c) { + applyTernary(ternary::LogisticRegressionLossBp(), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); +template +void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::BiggerThan(), b, c); +} + +DEFINE_MATRIX_QUATERNARY_OP( + BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); +template +void BaseMatrixT::biggerThan(BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d) { + applyQuaternary(quaternary::BiggerThan(), b, c, d); +} + +DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); +template +void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::Max(), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, + ONE_PARAMETER, + c += ((a > p) == (b > p)) ? 0.0f : 1.0f); +template +void BaseMatrixT::binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p) { + CHECK(!useGpu_) << "do not support gpu"; + MatrixOffset offset(0, 0, 0, 0, destCol, 0); + int numRows = b.height_; + int numCols = b.width_; + b.applyTernary(ternary::BinaryClassificationError(p), + c, + *this, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); +} + +template <> +void BaseMatrixT::binaryClassificationError(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + real p) { + MatrixOffset offset(destCol, 0, 0, 0, 0, 0); + int numRows = b.height_; + int numCols = b.width_; + aggregate(aggregate::sum(), + base::binary::classificationError(p), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); +} + +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, + THREE_PARAMETER, + a = p1 * b + p2 * c + p3 * d); +template +void BaseMatrixT::add3( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { + applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); +} + +DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); +template +void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::DotMulSquare(), b, c); +} + +DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); +template +void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { + applyTernary(ternary::DotSquareSquare(), b, c); +} + +DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); +template +void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { + applyBinary(binary::DotMulSquare(), b); +} + +DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); +template +void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { + applyBinary(binary::DotSquareMul(), b); +} + +DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, + THREE_PARAMETER, + T tmp = p1 * b + p2 * c + p3 * d; + a += tmp * tmp); +template +void BaseMatrixT::addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { + applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); +template +void BaseMatrixT::addSquare(BaseMatrixT& b, T p) { + applyBinary(binary::AddSquare(p), b); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, + TWO_PARAMETER, + a = p1 * a + p2 * b * b); +template +void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::DecayAddSquare(p1, p2), b); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, 
+ TWO_PARAMETER, + a = p1 * a + p2 * b * b * c * c); +template +void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, + BaseMatrixT& c, + T p1, + T p2) { + applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, + THREE_PARAMETER, + a = 1 / (p1 * b + p2 * c + p3)); +template +void BaseMatrixT::reciprocalSum( + BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { + applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, + TWO_PARAMETER, + a = 1 / (p1 * b + p2)); +template +void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { + applyBinary(binary::Reciprocal2(p1, p2), b); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, + TWO_PARAMETER, + T tmp = p1 * b + p2 * c; + a *= tmp * tmp); +template +void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, + BaseMatrixT& c, + T p1, + T p2) { + applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, + TWO_PARAMETER, + T tmp = p1 * b + p2 * c; + a = tmp * tmp); +template +void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { + applyTernary(ternary::DotSquareSum(p1, p2), b, c); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, + TWO_PARAMETER, + a *= p1 * b + p2 * c); +template +void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { + applyTernary(ternary::DotMulSum(p1, p2), b, c); +} + +DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); +template +void BaseMatrixT::copyAndClear(BaseMatrixT& b) { + applyBinary(binary::CopyAndClear(), b); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, + TWO_PARAMETER, + a = p1 * a + p2 * b * c); +template +void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { + applyTernary(ternary::AddDotMul(p1, p2), b, c); +} + +DEFINE_MATRIX_BINARY_OP(Assign, a = b;); +template +void BaseMatrixT::assign(BaseMatrixT& b) { + if (useGpu_) { + applyBinary(binary::Assign(), b); + } else { // cpu version + CHECK_EQ(this->height_, b.height_); + CHECK_EQ(this->width_, b.width_); + memcpy(data_, b.data_, sizeof(T) * height_ * width_); + } +} + +template +void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { + if (columnOffset + b.width_ <= width_) { + int numRows = height_; + int numCols = b.width_; + MatrixOffset offset(columnOffset, 0, 0, 0); + applyBinary(binary::Assign(), b, numRows, numCols, offset); + } else if (columnOffset + width_ <= b.width_) { + int numRows = height_; + int numCols = width_; + MatrixOffset offset(0, 0, columnOffset, 0); + applyBinary(binary::Assign(), b, numRows, numCols, offset); + } else { + LOG(FATAL) << "Wrong argument " + << " a.width=" << width_ << " b.width=" << b.width_ + << " columnOffset=" << columnOffset; + } +} + +DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); +template +void BaseMatrixT::deepSwap(BaseMatrixT& b) { + applyBinary(binary::DeepSwap(), b); +} + +template <> +void BaseMatrixT::rowDotMul(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c) { + int numRows = b.height_; + int numCols = b.width_; + MatrixOffset offset(destCol, 0, 0, 0, 0, 0); + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); +} + +template +void BaseMatrixT::rowDotMul2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c) { + CHECK(!useGpu_) << "do not support gpu"; + + size_t height = this->height_; + CHECK_LT(destCol, this->width_); + 
CHECK_EQ(height, b.height_); + CHECK_EQ(height, c.height_); + CHECK_EQ(b.width_, c.width_); + size_t width = b.width_; + T* A = this->data_; + const T* B = b.data_; + const T* C = c.data_; + for (size_t i = 0; i < height; + ++i, A += this->width_, B += width, C += width) { + for (size_t j = 0; j < width; ++j) { + A[destCol] += B[j] * C[j]; + } + } +} + +template <> +void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + int numRows = b.height_; + int numCols = b.width_; + aggregate(aggregate::sum(), + base::binary::mul(), + base::binary::add(), + b, + c, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); +} + +template +void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { + CHECK(!useGpu_) << "do not support gpu"; + + CHECK_EQ(height_, 1LU); + CHECK_EQ(b.height_, c.height_); + CHECK_EQ(width_, b.width_); + CHECK_EQ(width_, c.width_); + size_t height = b.height_; + size_t width = b.width_; + T* A = this->data_; + const T* B = b.data_; + const T* C = c.data_; + for (size_t i = 0; i < height; ++i, B += width, C += width) { + for (size_t j = 0; j < width; ++j) { + A[j] += B[j] * C[j]; + } + } +} + +DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); +template +void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /*cAsRowVector*/, + false_type()); +} + +template +void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { + CHECK(!useGpu_) << "do not support gpu"; + + CHECK_EQ(c.height_, 1LU); + CHECK_EQ(height_, b.height_); + CHECK_EQ(width_, b.width_); + CHECK_EQ(width_, c.width_); + size_t height = height_; + size_t width = width_; + T* A = this->data_; + const T* B = b.data_; + const T* C = c.data_; + for (size_t i = 0; i < height; ++i, A += width, B += width) { + for (size_t j = 0; j < width; ++j) { + A[j] += B[j] * C[j]; + } + } +} + +template +void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, cCol, 0); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); +} + +template +void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { + CHECK(!useGpu_) << "do not support gpu"; + + size_t height = this->height_; + size_t width = this->width_; + CHECK_EQ(height, b.height_); + CHECK_EQ(width, b.width_); + CHECK_LT(cCol, c.width_); + CHECK_EQ(height, c.height_); + T* A = this->data_; + const T* B = b.data_; + const T* C = c.data_; + for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) { + for (size_t j = 0; j < width; ++j) { + A[j] = B[j] * C[cCol]; + } + } +} + +template +void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, cRow); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::DotMul(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() /* cAsColVector */); +} + +template +void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, cRow); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + true_type() /* cAsRowVector */, + false_type() 
/* cAsColVector */); +} + +template +void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, cCol, 0); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::addDotMulMMV(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); +} + +DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); +template +void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { + MatrixOffset offset(0, 0, 0, 0, cCol, 0); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::RowAdd(p), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); +} + +DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); +template <> +void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { + if (useGpu_) { + MatrixOffset offset(0, 0, 0, 0, cCol, 0); + int numRows = height_; + int numCols = width_; + applyTernary(ternary::RowPow(), + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*cAsColVector*/); + } else { + size_t height = this->height_; + size_t width = this->width_; + CHECK_EQ(height, b.height_); + CHECK_EQ(width, b.width_); + CHECK_LT(cCol, c.width_); + CHECK_EQ(height, c.height_); + real* A = this->data_; + const real* B = b.data_; + const real* C = c.data_; + for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) { + vPow(width, B, C[cCol], A); + } + } +} + +template +void BaseMatrixT::mulRowVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); +} + +DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); +template +void BaseMatrixT::divRowVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + true_type() /* bAsRowVector */, + false_type()); +} + +template +void BaseMatrixT::mulColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotMul(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); +} + +template +void BaseMatrixT::divColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotDiv(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /* bAsColVector */); +} + +template <> +template +int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); + + return 0; +} + +template <> +template +int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); + + return 0; +} + +template <> +template +int BaseMatrixT::applyRow(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { + if (scaleDest != 0) 
{ + applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); + } else { + applyRow(agg, base::binary::second(), b); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template <> +template +int BaseMatrixT::applyRow( + Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); + CHECK_EQ(c.height_, numRows); + CHECK_EQ(c.width_, numCols); + aggregate(agg, + op, + sv, + b, + c, + numRows, + numCols, + offset, + false_type(), + true_type() /*aAsColVector*/); + return 0; +} + +template <> +template +int BaseMatrixT::applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c) { + if (scaleDest != 0) { + applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); + } else { + applyRow(agg, op, base::binary::second(), b, c); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template <> +template +int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(width_, numCols); + CHECK_EQ(height_, 1UL); + aggregate(agg, + base::unary::identity(), + base::binary::second(), + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); + + return 0; +} + +template <> +template +int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(width_, numCols); + CHECK_EQ(height_, 1UL); + aggregate(agg, + base::unary::identity(), + sv, + b, + numRows, + numCols, + offset, + true_type() /*aAsRowVector*/, + false_type()); + + return 0; +} + +template <> +template +int BaseMatrixT::applyCol(Agg agg, + real scaleDest, + real scaleAgg, + BaseMatrixT& b) { + if (scaleDest != 0) { + applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); + } else { + applyCol(agg, base::binary::second(), b); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template <> +void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), scaleDest, scaleSum, b); +} + +template <> +void BaseMatrixT::maxRows(BaseMatrixT& b) { + applyRow(aggregate::max(), b); +} + +template <> +void BaseMatrixT::minRows(BaseMatrixT& b) { + applyRow(aggregate::min(), b); +} + +template <> +void BaseMatrixT::maxCols(BaseMatrixT& b) { + applyCol(aggregate::max(), b); +} + +template <> +void BaseMatrixT::minCols(BaseMatrixT& b) { + applyCol(aggregate::min(), b); +} + +template <> +void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { + applyCol(aggregate::sum(), scaleDest, scaleSum, b); +} + +template <> +void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow( + aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); +} + +template <> +void BaseMatrixT::sumOfProducts(BaseMatrixT& b, + BaseMatrixT& c, + real scaleSum, + real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); +} + +template class BaseMatrixT; + +#ifndef PADDLE_MOBILE_INFERENCE + +template class BaseMatrixT; + +#else + +template <> +void BaseMatrixT::zero() { + applyUnary(unary::Zero()); +} + +template <> +void BaseMatrixT::assign(int p) { + applyUnary(unary::Assign(p)); +} + +template <> +void 
BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif
+}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.h b/paddle/legacy/math/BaseMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..4627f847d356f07600edae8cadcb02302e19381c
--- /dev/null
+++ b/paddle/legacy/math/BaseMatrix.h
@@ -0,0 +1,1095 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdint.h>
+#include <string.h>
+#include "TensorExpression.h"
+#include "paddle/legacy/utils/Common.h"
+
+namespace paddle {
+
+/*
+ * nvcc currently does not support C++11,
+ * so I realized false_type and true_type.
+ */
+template <class T, T v>
+struct bool_constant {
+  static const T value = v;
+};
+typedef bool_constant<bool, false> false_type;
+typedef bool_constant<bool, true> true_type;
+
+/**
+ * @brief Calculate matrix element address.
+ *
+ * For instance, address of A[i][j] = i * ld + j.
+ *
+ */
+#define CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row) \
+  CHECK_LE(col, width);                                                \
+  CHECK_LE(row, height);                                               \
+  address += row * ld + col;
+
+class MatrixOffset {
+ public:
+  size_t aCol_;
+  size_t aRow_;
+  size_t bCol_;
+  size_t bRow_;
+  size_t cCol_;
+  size_t cRow_;
+  size_t dCol_;
+  size_t dRow_;
+  MatrixOffset(size_t aCol = 0,
+               size_t aRow = 0,
+               size_t bCol = 0,
+               size_t bRow = 0,
+               size_t cCol = 0,
+               size_t cRow = 0,
+               size_t dCol = 0,
+               size_t dRow = 0)
+      : aCol_(aCol),
+        aRow_(aRow),
+        bCol_(bCol),
+        bRow_(bRow),
+        cCol_(cCol),
+        cRow_(cRow),
+        dCol_(dCol),
+        dRow_(dRow) {}
+};
+
+template <class T>
+class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
+ public:
+  size_t height_, width_;
+  size_t stride_;
+  T* data_;
+  bool trans_;
+  bool useGpu_;
+
+ public:
+  virtual ~BaseMatrixT() {}
+  BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
+      : height_(height),
+        width_(width),
+        stride_(width),
+        data_(data),
+        trans_(trans),
+        useGpu_(useGpu) {}
+
+  /**
+   * @note This constructor is for temporarily making a matrix with different
+   *       useGpu flag as the original matrix so that mixed gpu/cpu operations
+   *       can be performed successfully.
+ */ + BaseMatrixT(BaseMatrixT& mat, bool useGpu) + : height_(mat.height_), + width_(mat.width_), + stride_(mat.stride_), + data_(mat.data_), + trans_(mat.trans_), + useGpu_(useGpu) {} + + BaseMatrixT(size_t height, + size_t width, + size_t stride, + T* data, + bool trans, + bool use_gpu) + : height_(height), + width_(width), + stride_(stride), + data_(data), + trans_(trans), + useGpu_(use_gpu) { + /* CHECK_LE(width_, stride_); */ + } + + /// caller should make sure that the size of data is at least height*width + void setData(T* data) { data_ = data; } + + /** + * unary operator: element wise op(a). + * + * @code + * for 0 <= i < this->height_ & for 0 <= j < this->width_. + * @endcode + */ + template + int applyUnary(Op op); + + /** + * unary operator: element wise op(a). + * + * @code + * for 0 <= i < numRows & for 0 <= j < numCols. + * While matrix start address is: + * A = this->data_ + offset.aRow_*ld + offset.aCol_; + * @endcode + */ + template + int applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset); + + /** + * binary operator: element wise op(a, b). + * + * @code + * for 0 <= i < this->height_ & for 0 <= j < this->width_. + * While this->height_ == b.height_ && this->width_ == b.width_. + * @endcode + */ + template + int applyBinary(Op op, BaseMatrixT& b); + + /** + * binary operator: element wise op(a, b) + * + * @code + * for 0 <= i < numRows & for 0 <= j < numCols. + * While matrix start address is: + * A = this->data_ + offset.aRow_*lda + offset.aCol_; + * B = b->data_ + offset.bRow_*ldb + offset.bCol_; + * + * if (bAsRowVector == false_type && bAsColVector == false_type) + * op(A[i * lda + j], B[i * ldb + j]) + * + * if (bAsRowVector == true_type && bAsColVector == false_type) + * op(A[i * lda + j], B[j]) + * + * if (bAsRowVector == false_type && bAsColVector == true_type) + * op(A[i * lda + j], B[i * ldb]) + * + * if (bAsRowVector == true_type && bAsColVector == true_type) + * op(A[i * lda + j], B[0]) + * @endcode + */ + template + int applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector); + + template + int applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset); + + /** + * ternary operator: element wise op(a, b, c). + * + * @code + * for 0 <= i < this->height_ & for 0 <= j < this->width_. + * + * While this->height_ == b.height_ && this->width_ == b.width_ + * && this->height_ == c.height_ && this->width_ == c.width_ + * @endcode + */ + template + int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c); + + /** + * ternary operator: element wise op(a, b, c). + * + * @code + * for 0 <= i < numRows & for 0 <= j < numCols. 
+ * While matrix start address is: + * + * A = this->data_ + offset.aRow_*lda + offset.aCol_; + * B = b->data_ + offset.bRow_*ldb + offset.bCol_; + * C = c->data_ + offset.cRow_*ldc + offset.cCol_; + * + * if (cAsRowVector == false_type && cAsColVector == false_type) + * op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j]) + * + * if (cAsRowVector == true_type && cAsColVector == false_type) + * op(A[i*lda + j], B[i*ldb + j], C[j]) + * + * if (cAsRowVector == false_type && cAsColVector == true_type) + * op(A[i*lda + j], B[i*ldb + j], C[i*ldc]) + * + * if (cAsRowVector == 1 && cAsColVector == 1) + * op(A[i*lda + j], B[i*ldb + j], C[0]) + * @endcode + */ + template + int applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, + cAsColVector); + + template + int applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset); + + /** + * quaternary operator: element wise op(a, b, c, d). + * + * @code + * for 0 <= i < this->height_ & for 0 <= j < this->width_. + * + * While this->height_ == b.height_ && this->width_ == b.width_ + * && this->height_ == c.height_ && this->width_ == c.width_ + * && this->height_ == d.height_ && this->width_ == d.width_ + * @endcode + */ + template + int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d); + + /** + * quaternary operator: element wise op(a, b, c, d). + * + * @code + * for 0 <= i < numRows & for 0 <= j < numCols. + * While matrix start address is: + * A = this->data_ + offset.aRow_*lda + offset.aCol_; + * B = b->data_ + offset.bRow_*ldb + offset.bCol_; + * C = c->data_ + offset.cRow_*ldc + offset.cCol_; + * D = d->data_ + offset.dRow_*ldd + offset.dCol_; + * @endcode + */ + template + int applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, + MatrixOffset& offset); + + /** + * a aggregate expression that apply each row(or column) of matrix b. + * op and sv is element wise operator. + * + * @code + * if (aAsRowVector == true_type && aAsColVector == false_type) + * for each column j & 0 <= i < numRows, do: + * dst = agg(op(b[i*ldb + j])) + * a[j] = sv(a[j], dst) + * + * if (aAsRowVector == false_type && aAsColVector == true_type) + * for each row i & 0 <= j < numCols, do: + * dst = agg(op(b[i*ldb + j])) + * a[i] = sv(a[i], dst) + * @endcode + */ + template + int aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector); + + /** + * a aggregate expression that apply each row(or column) of matrix b and c. + * + * op and sv is element wise operator. + * + * @code + * if (aAsRowVector == true_type && aAsColVector == false_type) + * for each column j & 0 <= i < numRows, do: + * dst = agg(op(b[i*ldb + j], c[i*ldc + j])) + * a[j] = sv(a[j], dst) + * + * if (aAsRowVector == false_type && aAsColVector == true_type) + * for each row i & 0 <= j < numCols, do: + * dst = agg(op(b[i*ldb + j], c[i*ldc + j])) + * a[i] = sv(a[i], dst) + * @endcode + */ + template + int aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector); + + /** + * a aggregate expression that apply each row of matrix b. 
+ * + * @code + * for each row i & 0 <= j < b.width_, do: + * this[i] = agg(b[i*ldb + j]) + * @endcode + */ + template + int applyRow(Agg agg, BaseMatrixT& b); + + /** + * a aggregate expression that apply each row of matrix b. + * + * @code + * for each row i & 0 <= j < b.width_, do: + * dst = agg(op(b[i*ldb + j], c[i*ldc + j]) + * this[i] = sv(this[i], dst) + * @endcode + */ + template + int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c); + + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c); + + /** + * a aggregate expression that apply each row of matrix b. + * + * @code + * for each row i & 0 <= j < b.width_, do: + * dst = agg(b[i*ldb + j]) + * this[i] = sv(this[i], dst) + * @endcode + */ + template + int applyRow(Agg agg, Saver sv, BaseMatrixT& b); + + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); + + /** + * a aggregate expression that apply each column of matrix b. + * + * @code + * for each column j & 0 <= i < b.height_, do: + * this[j] = agg(b[i*ldb + j]) + * @endcode + */ + template + int applyCol(Agg agg, BaseMatrixT& b); + + /** + * a aggregate expression that apply each column of matrix b. + * + * @code + * for each column j & 0 <= i < b.height_, do: + * dst = agg(b[i*ldb + j]) + * this[j] = sv(this[j], dst) + * @endcode + */ + template + int applyCol(Agg agg, Saver sv, BaseMatrixT& b); + + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); + + bool useGpu() const { return useGpu_; } + + const T* rowBuf(size_t row) const { return data_ + width_ * row; } + + T* rowBuf(size_t row) { return data_ + width_ * row; } + + /** + * @brief unary operator. + * + */ + void neg(); + void exp2(); + void pow2(T p); + void log2(); + void sqrt2(); + void square2(); + void reciprocal2(); + void abs2(); + void sign2(); + void zero(); + + /** + * @code + * this(row, col + columnOffset) = 0 for 0 <= col < numColumns + * @endcode + */ + void zeroAtOffset(int64_t columnOffset, int64_t numColumns); + void one(); + void subScalar(T p); + void mulScalar(T p); + void divScalar(T p); + + /** + * @code + * this = p + * @endcode + */ + void assign(T p); + + /** + * @code + * swap(this, b) + * example: swap two Matrices + * MatrixPtr cpuA = std::make_shared(height, width); + * MatrixPtr cpuB = std::make_shared(height, width); + * cpuA->deepSwap(*cpuB); + * @endcode + */ + void deepSwap(BaseMatrixT& b); + + /** + * @code + * this = this + p + * @endcode + */ + void add(T p); + + /** + * @code + * this = this*p1 + p2 + * @endcode + */ + void add(T p1, T p2); + + /** + * this = this < low ? low : this + * + * this = this > high ? high : this + */ + void clip(T p1, T p2); + + /** + * this = b < low ? 0 : 1 + * + * this = b > high ? 0 : 1 + */ + void clipDerivative(BaseMatrixT& b, T p1, T p2); + + /** + * @code + * a = a > p ? 1.0f : 0.0f + * @endcode + */ + void biggerThanScalar(T p); + + /** + * @code + * a = a > p ? 
a : p + * @endcode + */ + void downClip(T p); + + /** + * @code + * this = b + * @endcode + */ + void assign(BaseMatrixT& b); + + /** + * @code + * If b.width + columOffset <= this.width + * this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width + * + * If this.width + columnOffset <= b.width + * this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width + * + * Otherwise, FATAL + * @endcode + */ + void assignAtOffset(BaseMatrixT& b, int64_t columnOffset); + + /// this = this + b + void add(BaseMatrixT& b); + + /** + * @code + * If b.width + columOffset <= this.width + * this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width + * + * If this.width + columnOffset <= b.width + * this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width + * + * Otherwise, FATAL + * @endcode + */ + void addAtOffset(BaseMatrixT& b, int64_t columnOffset); + + void addColVector(BaseMatrixT& b); + void addRowVector(BaseMatrixT& b); + void addBias(BaseMatrixT& b, T scale); + + void mulRowVector(BaseMatrixT& b); + void divRowVector(BaseMatrixT& b); + + void mulColVector(BaseMatrixT& b); + void divColVector(BaseMatrixT& b); + + void addP2P(BaseMatrixT& b); + + /** + * @code + * this = this + b*p + * @endcode + */ + void add(BaseMatrixT& b, T p); + + /** + * @code + * this = p1*this + p2*b + * @endcode + */ + void add(BaseMatrixT& b, T p1, T p2); + + /** + * @code + * this = this - b + * @endcode + */ + void sub(BaseMatrixT& b); + + /** + * @code + * this = this - b*p + * @endcode + */ + void sub(BaseMatrixT& b, T p); + + /** + * @code + * b = max(0, this) + * @endcode + */ + void relu(BaseMatrixT& b); + void reluDerivative(BaseMatrixT& b); + + /** + * @code + * b = log(1.0 + exp(this)) + * @endcode + */ + void softrelu(BaseMatrixT& b); + void softreluDerivative(BaseMatrixT& b); + + /** + * @code + * b = min(max(this, p1), p2) + * @endcode + */ + void brelu(BaseMatrixT& b); + void breluDerivative(BaseMatrixT& b); + + /** + * @code + * b = this * this + * @endcode + */ + void square2(BaseMatrixT& b); + void squareDerivative(BaseMatrixT& b); + + /** + * @code + * b = tanh(this) + * @endcode + */ + void tanh(BaseMatrixT& b); + void tanhDerivative(BaseMatrixT& b); + + /** + * @code + * b = p1 * tanh(p2 * this) + * @endcode + */ + void scaledTanh(BaseMatrixT& b, T p1, T p2); + void scaledTanhDerivative(BaseMatrixT& b, T p1, T p2); + + /** + * @code + * b = 1.0f / this + * @endcode + */ + void reciprocal2(BaseMatrixT& b); + void reciprocalDerivative(BaseMatrixT& b); + + /** + * @code + * b = this > 0.0f ? this : -this + * @endcode + */ + void abs2(BaseMatrixT& b); + void absDerivative(BaseMatrixT& b); + + /** + * @code + * b = 1.0f / (1.0f + exp(-this)) + * @endcode + */ + void sigmoid(BaseMatrixT& b); + void sigmoidDerivative(BaseMatrixT& b); + + /** + * @code + * b = a + * @endcode + */ + void expDerivative(BaseMatrixT& b); + + void sign2(BaseMatrixT& b); + + void exp2(BaseMatrixT& b); + void pow2(BaseMatrixT& b, T p); + void log2(BaseMatrixT& b); + void sqrt2(BaseMatrixT& b); + void addScalar(BaseMatrixT& b, T p); + void subScalar(BaseMatrixT& b, T p); + void mulScalar(BaseMatrixT& b, T p); + void divScalar(BaseMatrixT& b, T p); + void scalarDiv(BaseMatrixT& b, T p); + + /** + * @code + * this = 1.0f / sqrt(b) + * @endcode + */ + void invSqrt(BaseMatrixT& b); + + /// this = (b == value) + void isEqualTo(BaseMatrixT& b, T value); + + /** + * @brief ternary operator. 
+ */ + void softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c); + void softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c); + void binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c); + void binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = b + c + * @endcode + */ + void add(BaseMatrixT& b, BaseMatrixT& c); + /** + * @code + * this = b*p1 + c*p2 + * @endcode + */ + void add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2); + /** + * @code + * this = b - c + * @endcode + */ + void sub(BaseMatrixT& b, BaseMatrixT& c); + /** + * @code + * this = b*p1 - c*p2 + * @endcode + */ + void sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2); + + /** + * @code + * this = this + b + c + * @endcode + */ + void add2(BaseMatrixT& b, BaseMatrixT& c); + /** + * @code + * this = this*p1 + b*p2 + c*p3 + * @endcode + */ + void add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3); + + /** + * @code + * this = a*p1 + b*p2 + c*p3 + * @endcode + */ + void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3); + + /** + * @code + * c = p2 * c - p1 * (b + p3 * this) + * this += mom + * @endcode + */ + void sgdUpdate(BaseMatrixT& b, // grad + BaseMatrixT& c, // mom + T p1, // learningRate, + T p2, // momentum, + T p3); // decayRate + + /** + * @code + * c = p2 * c - p1 * d * (b + p3 * this) + * this += mom + * @endcode + */ + void sgdUpdate(BaseMatrixT& b, // grad, + BaseMatrixT& c, // mom, + BaseMatrixT& d, // lr, + T p1, // learningRate, + T p2, // momentum, + T p3); // decayRate + + /// apply L1/L2 to *this* + virtual void applyL1(T learningRate, T decayRate); + void applyL1(BaseMatrixT& lr, T learningRate, T decayRate); + void applyL2(T learningRate, T decayRate); + void applyL2(BaseMatrixT& lr, T learningRate, T decayRate); + + /** + * @code + * this *= b + * @endcode + */ + void dotMul(BaseMatrixT& b); + + /** + * @code + * this = b * c + * @endcode + */ + void dotMul(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = b / c + * @endcode + */ + void dotDiv(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = (b + p1) / (c + p2) + * @endcode + */ + void dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); + + /** + * @code + * this = log(1 + exp(b - c)) - d * (b - c) + * @endcode + */ + void rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d); + void rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d); + + /** + * @code + * this = log(1 + exp(b)) - c * b + * @endcode + */ + void logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this += exp(b)/(1+exp(b)) - c + * @endcode + */ + void logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = b > c ? 1.0 : 0.0 + * @endcode + */ + void biggerThan(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = ((b>c && d>0.5) || (bc ? b : c + * @endcode + */ + void max2(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this[destCol] += (b>p1 == c>p1) ? 
0 : 1) + * @endcode + */ + void binaryClassificationError(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p); + void binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p); + + /** + * @code + * this = this * b * b + * @endcode + */ + void dotMulSquare(BaseMatrixT& b); + + /** + * @code + * this = this * this * b + * @endcode + */ + void dotSquareMul(BaseMatrixT& b); + + /** + * @code + * this = b * c * c + * @endcode + */ + void dotMulSquare(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = b * b * c * c + * @endcode + */ + void dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = this * (p1*b + p2*c)^2 + * @endcode + */ + void dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); + + /** + * @code + * this = (p1*b + p2*c)^2 + * @endcode + */ + void dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); + + /** + * @code + * this= this * (p1*b + p2*c) + * @endcode + */ + void dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); + + /** + * @code + * this += sqr(p1*b + p2*c + p3*d) + * @endcode + */ + void addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3); + + /** + * @code + * this += p * sqr(b) + * @endcode + */ + void addSquare(BaseMatrixT& b, T p); + + /** + * @code + * this = p1 * this + p2 * sqr(b) + * @endcode + */ + void decayAddSquare(BaseMatrixT& b, T p1, T p2); + + /** + * @code + * this = p1 * this + p2 * sqr(b * c) + * @endcode + */ + void decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); + + /** + * @code + * this = 1 / (p1 * b + p2) + * @endcode + */ + void reciprocal2(BaseMatrixT& b, T p1, T p2); + + /** + * @code + * this = 1 / (p1 * b + p2 * c + p3) + * @endcode + */ + void reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3); + + /** + * @code + * b = this; this = 0 + * @endcode + */ + void copyAndClear(BaseMatrixT& b); + + /** + * @code + * this_row[destCol] += dotprod(b_row, c_row) + * @endcode + */ + void rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c); + void rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c); + + /** + * this is vector (one row matrix) + * + * @code + * for each row i, do: + * this_row += dotmul(b_row_i, c_row_i) + * @endcode + */ + void addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c); + void addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c); + + /** + * c is vector (one row matrix) + * + * @code + * for each row i, do: + * this_row_i += dotmul(b_row_i, c_row) + * @endcode + */ + void addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c); + void addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this = p1 * this + p2 * b * c + * @endcode + */ + void addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); + + /** + * @code + * this_row = b_row * c_row[cCol] + * @endcode + */ + void rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); + void rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this_col = b_col * c_col[cRow] + * @endcode + */ + void colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this_col += b_col * c_col[cRow] + * @endcode + */ + void addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c); + + /** + * @code + * this_row += b_row * c_row[cCol] + * @endcode + */ + void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); + + /// calculate the sum of each row of the matrix b. 
+ /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} + void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest); + + /// calculate the maximum value of each row of the matrix b. + void maxRows(BaseMatrixT& b); + /// calculate the minimum value of each row of the matrix b. + void minRows(BaseMatrixT& b); + + /// calculate the maximum value of each column of the matrix b. + void maxCols(BaseMatrixT& b); + /// calculate the minimum value of each column of the matrix b. + void minCols(BaseMatrixT& b); + + /// calculate the sum of each column of the matrix b. + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji} + void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest); + + /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2 + void sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + T scaleSum, + T scaleDest); + + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij} + void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest); + + /** + * @code + * this_row = b_row + p * ones * c_row[cCol] + * @endcode + */ + void rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p); + /** + * @code + * this_row = pow(b_row, c_row[cCol]) + * @endcode + */ + void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); + + virtual bool isSparse() const { return false; } + + template + void operator=(const ExpressionType& expr) { + if (useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + + template + void operator+=(const ExpressionType& expr) { + (*this) = (*this) + expr; + } + template + void operator-=(const ExpressionType& expr) { + (*this) = (*this) - expr; + } + template + void operator*=(const ExpressionType& expr) { + (*this) = (*this) * expr; + } + template + void operator/=(const ExpressionType& expr) { + (*this) = (*this) / expr; + } +}; + +typedef BaseMatrixT BaseMatrix; +typedef BaseMatrixT IBaseMatrix; + +} // namespace paddle diff --git a/paddle/legacy/math/CMakeLists.txt b/paddle/legacy/math/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9992ec71f45b592e0a73e1cc9c655e773fa18e86 --- /dev/null +++ b/paddle/legacy/math/CMakeLists.txt @@ -0,0 +1,57 @@ +# common package contains: +# * the utilities: +# * Thread Libs +# * Memory Manage libs +# * CommandLine Parser +# * Logging +# * Timer/Stats +# * the math libraries: +# * Matrix/Vector +# * the parameter optimizers. +# * the parameter updater functions. +# +# TODO(yuyang18): separate libs. +# +file(GLOB MATH_HEADERS . *.h) +file(GLOB MATH_SOURCES . 
*.cpp) + +if(NOT WITH_MKLDNN) + set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h") + set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp") + list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}") + list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}") + message(STATUS "Skip compiling with MKLDNNMatrix") +else() + message(STATUS "Compile with MKLDNNMatrix") +endif() + +if(MOBILE_INFERENCE) + # Remove sparse + list(REMOVE_ITEM MATH_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h + ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h + ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h) + list(REMOVE_ITEM MATH_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp) +endif() +set(MATH_SOURCES + "${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu" + "${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu" + ${MATH_SOURCES}) +if(NOT WITH_GPU) + # then compile BaseMatrix.cu as c++ file + compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu") + compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu") + add_library(paddle_math STATIC + ${MATH_SOURCES}) +else() + cuda_add_library(paddle_math ${MATH_SOURCES}) +endif() + + +add_dependencies(paddle_math paddle_proto ${external_project_dependencies}) # depends +if(WITH_TESTING) + add_subdirectory(tests) +endif() diff --git a/paddle/legacy/math/CpuSparseMatrix.cpp b/paddle/legacy/math/CpuSparseMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20c65a3a1d7099a73d8b3c490cd42e721e60823b --- /dev/null +++ b/paddle/legacy/math/CpuSparseMatrix.cpp @@ -0,0 +1,787 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "CpuSparseMatrix.h" +#include "SparseMatrix.h" +#include "float.h" +#include "hl_gpu.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH; + +CpuSparseMatrix::CpuSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(NULL, height, width, trans, false) { + resize(height, width, nnz, valueType, format); +} + +CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(dataHandle, height, width, trans, false) { + resize(height, width, nnz, valueType, format); +} + +CpuSparseMatrix::CpuSparseMatrix(real* data, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(NULL, height, width, trans, false) { + cols_ = cols; + rows_ = rows; + value_ = data; + height_ = height; + width_ = width; + elementCnt_ = nnz; + valueType_ = valueType; + format_ = format; +} + +void CpuSparseMatrix::resize(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType, + SparseFormat format) { + CHECK_LE(newNnz, newHeight * newWidth); + size_t newSize = 0; + if (format == SPARSE_CSR) { + newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); + } else { + newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); + } + + if (NO_VALUE != valueType) { + newSize += newNnz * sizeof(real); + } + + if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { + memoryHandle_ = std::make_shared(newSize); + } + + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newNnz; + valueType_ = valueType; + format_ = format; + sparseResize(); +} +void CpuSparseMatrix::sparseResize() { + if (format_ == SPARSE_CSR) { + rows_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf())); + cols_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + } else { + cols_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf())); + rows_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(memoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + } +} + +void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { + resize(newHeight, + newWidth, + newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth), + valueType_, + format_); +} + +MatrixPtr CpuSparseMatrix::getTranspose() { + if (!memoryHandle_ && !value_) { + MatrixPtr dest(new CpuSparseMatrix( + height_, width_, elementCnt_, valueType_, format_, true)); + return dest; + } else if (memoryHandle_) { + MatrixPtr dest(new CpuSparseMatrix( + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + elementCnt_, + valueType_, + format_, + true)); + return dest; + } else if (value_) { + MatrixPtr dest(new CpuSparseMatrix(value_, + rows_, + cols_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true)); + return dest; + } else { + return NULL; + } +} + +SparseValueType 
CpuSparseMatrix::getValueType() { return valueType_; } + +void CpuSparseMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + + if (a_ptr && b_ptr) { + CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT); + } else { + LOG(FATAL) << "not supported"; + } +} + +void CpuSparseMatrix::add3(CpuMatrix* b) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK(height_ == b->getHeight()); + CHECK(width_ == b->getWidth()); + real* A = getValue(); + real* B = b->getData(); + int* cols = getCols(); + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + A[j] = B[i * width_ + cols[j]]; + } + } +} + +void CpuSparseMatrix::add3(MatrixPtr b) { + if (dynamic_cast(b.get())) { + add3(dynamic_cast(b.get())); + } else { + LOG(FATAL) << "not supported"; + } +} + +void CpuSparseMatrix::addBias(Matrix& b, real scale) { + CHECK_EQ(b.getHeight(), (size_t)1); + CHECK_EQ(width_, b.getWidth()); + real* A = getValue(); + real* B = b.getData(); + int* cols = getCols(); + size_t nnz = getElementCnt(); + for (size_t i = 0; i < nnz; i++) { + A[i] += scale * B[cols[i]]; + } +} + +template +void printBuf(std::ostream& os, T* a, size_t len, const char* name) { + os << "\n: " << name << " ["; + for (size_t i = 0; i < len; i++) { + os << a[i] << " "; + } + os << "]\n"; +} + +void CpuSparseMatrix::print(std::ostream& os) const { + size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1; + size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_; + printBuf(os, rows_, rowSize, "row"); + printBuf(os, cols_, colSize, "col"); + if (valueType_ == FLOAT_VALUE) { + printBuf(os, value_, elementCnt_, "value"); + } + return; +} + +void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { + CHECK_LT(idx, height_); + if (format_ == SPARSE_CSC) { + LOG(FATAL) << "SPARSE_CSC not supported"; + return; + } + + const int* col = getRowCols(idx); + size_t num = getColNum(idx); + if (num > 0) { + if (valueType_ == FLOAT_VALUE) { + const real* data = getRowValues(idx); + os << col[0] << ":" << data[0]; + for (size_t i = 1; i < num; ++i) { + os << " " << col[i] << ":" << data[i]; + } + } else { + os << col[0]; + for (size_t i = 1; i < num; ++i) { + os << " " << col[i]; + } + } + } + os << ";"; +} + +void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK_EQ(height_, b.getHeight()); + CHECK_EQ(width_, b.getWidth()); + real* A = getValue(); + real* B = b.getValue(); + if (b.getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } + } else if (b.getValueType() == NO_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = c.getElement(i, cCol); + } + } + } +} + +void CpuSparseMatrix::randomizeUniform() { + CHECK_LE(elementCnt_, height_ * width_); + if (valueType_ == FLOAT_VALUE) { + real* data = 
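[Illustrative aside, not part of the patch] add3() and rowScale() above both walk one CSR row via getRowStartIdx()/getCols()/getValue() and use the column index to address a dense operand. A minimal sketch of that traversal over raw CSR arrays (hypothetical names, not the Paddle API):

#include <cstddef>

// rows[i]..rows[i+1] indexes the nonzeros of row i; cols[j] is the column of
// the j-th nonzero and vals[j] its value. Dot the sparse row with a dense
// vector, mirroring how add3()/rowScale() index a dense matrix with cols[j].
float sparseRowDotDense(const int* rows, const int* cols, const float* vals,
                        const float* dense, std::size_t row) {
  float sum = 0.0f;
  for (int j = rows[row]; j < rows[row + 1]; ++j) {
    sum += vals[j] * dense[cols[j]];
  }
  return sum;
}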
getValue(); + for (size_t i = 0; i < elementCnt_; ++i) { + *data++ = rand() / static_cast(RAND_MAX); // NOLINT + } + } + if (format_ == SPARSE_CSR) { + sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false); + } else { + sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false); + } +} + +void CpuSparseMatrix::copyFrom(std::vector& rows, + std::vector& cols, + std::vector& values) { + size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size(); + resize(height_, width_, size, valueType_, format_); + if (valueType_ == FLOAT_VALUE) { + memcpy(&value_[0], &values[0], sizeof(real) * values.size()); + } + memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size()); + memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size()); +} + +// Copy from a CpuMatrix, only supported in sparse_float_value_t +// SparseMatrix. +void CpuSparseMatrix::copyFrom(const CpuMatrix& src) { + CHECK_EQ(getHeight(), src.getHeight()); + CHECK_EQ(getWidth(), src.getWidth()); + CHECK(!src.trans_ && !trans_); + if (format_ == SPARSE_CSR) { + std::vector rows(getHeight() + 1); + std::vector cols; + std::vector values; + rows[0] = 0; + for (size_t r = 0; r < getHeight(); ++r) { + for (size_t c = 0; c < getWidth(); ++c) { + real v = src.getElement(r, c); + if (fabs(v) > FLT_EPSILON) { + cols.push_back(c); + values.push_back(v); + } + } + rows[r + 1] = values.size(); + } + copyFrom(rows, cols, values); + } else { + std::vector cols(getWidth() + 1); + std::vector rows; + std::vector values; + cols[0] = 0; + for (size_t r = 0; r < getWidth(); ++r) { + for (size_t c = 0; c < getHeight(); ++c) { + real v = src.getElement(c, r); + if (fabs(v) > FLT_EPSILON) { + rows.push_back(c); + values.push_back(v); + } + } + cols[r + 1] = values.size(); + } + copyFrom(rows, cols, values); + } +} + +MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) { + if (height == 0 && width == 0) { + height = height_; + width = width_; + } + CHECK(width && height); + if (!useGpu) { + return std::make_shared( + height, width, 0, valueType_, format_); + } else { + return std::make_shared( + height, width, elementCnt_, valueType_, format_); + } +} + +MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) { + CHECK_LE(startRow + numRows, height_); + CHECK_EQ(format_, SPARSE_CSR); + if (valueType_ == NO_VALUE) { + return std::make_shared( + nullptr, + rows_ + startRow, + cols_, + numRows, + width_, + rows_[startRow + numRows] - rows_[startRow], + valueType_, + format_, + trans_); + } else { + return std::make_shared( + value_, + rows_ + startRow, + cols_, + numRows, + width_, + rows_[startRow + numRows] - rows_[startRow], + valueType_, + format_, + trans_); + } +} + +/* mem MUST be alloced outside (memAlloc=false) */ +void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + CHECK(!memAlloc); + CpuSparseMatrix* mat = dynamic_cast(matTrans.get()); + if (format_ == SPARSE_CSR) { + /*statistic element number in each col*/ + int* colCounters = mat->getRows() + 1; + memset(colCounters, 0, sizeof(int) * width_); + for (size_t i = 0; i < elementCnt_; ++i) { + int col = cols_[i]; + colCounters[col]++; + } + /*fill mat rows */ + mat->getRows()[0] = 0; + for (size_t i = 1; i < width_ + 1; i++) { + mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i]; + } + /*fill mat values and cols*/ + std::vector colNumVec(width_, 0); + if (valueType_ == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + for (int j = rows_[i]; j < rows_[i + 1]; j++) { + int colIdx = cols_[j]; + int index = 
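[Illustrative aside, not part of the patch] copyFrom(const CpuMatrix&) above builds the CSR arrays by scanning the dense source row by row and recording each row's end offset. A compact, std::vector-based sketch of that construction (names are hypothetical):

#include <cmath>
#include <cstddef>
#include <vector>

struct CsrSketch {
  std::vector<int> rowOffsets;  // size height + 1
  std::vector<int> cols;
  std::vector<float> vals;
};

// Build CSR from a row-major dense matrix, keeping entries with |v| > eps,
// in the same scan order used by CpuSparseMatrix::copyFrom(const CpuMatrix&).
CsrSketch denseToCsr(const float* dense, std::size_t height, std::size_t width,
                     float eps = 1e-7f) {
  CsrSketch m;
  m.rowOffsets.assign(height + 1, 0);
  for (std::size_t r = 0; r < height; ++r) {
    for (std::size_t c = 0; c < width; ++c) {
      float v = dense[r * width + c];
      if (std::fabs(v) > eps) {
        m.cols.push_back(static_cast<int>(c));
        m.vals.push_back(v);
      }
    }
    m.rowOffsets[r + 1] = static_cast<int>(m.vals.size());
  }
  return m;
}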
mat->getRows()[colIdx] + colNumVec[colIdx]; + mat->getCols()[index] = i; + mat->getValue()[index] = value_[j]; + colNumVec[colIdx]++; + } + } + } else { + for (size_t i = 0; i < height_; i++) { + for (int j = rows_[i]; j < rows_[i + 1]; j++) { + int colIdx = cols_[j]; + int index = mat->getRows()[colIdx] + colNumVec[colIdx]; + mat->getCols()[index] = i; + colNumVec[colIdx]++; + } + } + } + } else { + /*statistic element number in each row*/ + int* rowCounters = mat->getCols() + 1; + memset(rowCounters, 0, sizeof(int) * height_); + for (size_t i = 0; i < elementCnt_; ++i) { + int row = rows_[i]; + rowCounters[row]++; + } + + /*fill mat cols */ + mat->getCols()[0] = 0; + for (size_t i = 1; i < height_ + 1; i++) { + mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i]; + } + /*fill mat values and rows*/ + std::vector rowNumVec(height_, 0); + if (valueType_ == FLOAT_VALUE) { + for (size_t i = 0; i < width_; i++) { + for (int j = cols_[i]; j < cols_[i + 1]; j++) { + int rowIdx = rows_[j]; + int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; + mat->getRows()[index] = i; + mat->getValue()[index] = value_[j]; + rowNumVec[rowIdx]++; + } + } + } else { + for (size_t i = 0; i < width_; i++) { + for (int j = cols_[i]; j < cols_[i + 1]; j++) { + int rowIdx = rows_[j]; + int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; + mat->getRows()[index] = i; + rowNumVec[rowIdx]++; + } + } + } + } +} + +void CpuSparseMatrix::setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + if (format_ == SPARSE_CSR) { + CHECK_LT(row, height_); + CHECK(NULL != cols); + if (0 == row) { + rows_[row] = 0; + } + rows_[row + 1] = rows_[row] + colNum; + for (size_t i = 0; i < colNum; ++i) { + cols_[rows_[row] + i] = cols[i]; + } + if (valueType_ == NO_VALUE) { + CHECK(!values); + } else { + for (size_t i = 0; i < colNum; ++i) { + value_[rows_[row] + i] = values[i]; + } + } + } else { + LOG(FATAL) << "not supported"; + } +} + +void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const { + if (format_ == SPARSE_CSR) { + auto nnz = getElementCnt(); + IVector::resizeOrCreate(outVec, nnz, false); + auto out = outVec->getData(); + int* rows = getRows(); + for (size_t i = 0; i < height_; i++) { + for (int j = rows[i]; j < rows[i + 1]; j++) { + out[j] = i; + } + } + } else { + LOG(FATAL) << "SPARSE_CSC not supported"; + } +} + +ThreadLocal> CpuSparseMatrix::cpuLocalMats_; + +CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height, + size_t width) { + std::vector* localMats = cpuLocalMats_.get(); + auto it = localMats->begin(); + while (it != localMats->end()) { + if (it->unique()) { + (*it)->resize(height, width, elementCnt_, valueType_, format_); + return *it; + } + } + localMats->emplace_back(std::make_shared( + height, width, elementCnt_, valueType_, format_, false)); + return localMats->back(); +} + +void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc, stream); + } else if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else { + LOG(FATAL) << "not implemented"; + } +} + +void CpuSparseMatrix::copyFrom(const Matrix& src) { + if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else if (dynamic_cast(&src)) { + auto tmpSrc = dynamic_cast(&src); + copyFrom(*tmpSrc); + } else { + LOG(FATAL) << "not 
implemented"; + } +} + +void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) { + CHECK_EQ(height_, src.getHeight()); + CHECK_EQ(width_, src.getWidth()); + CHECK_EQ(size_t(elementCnt_), src.getElementCnt()); + size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_; + if (format_ == SPARSE_CSC) + hl_memcpy_from_csc_matrix(value_, + valSize, + rows_, + elementCnt_, + cols_, + width_ + 1, + src.sMatrix_.get(), + stream); + else + hl_memcpy_from_csr_matrix(value_, + valSize, + rows_, + height_ + 1, + cols_, + elementCnt_, + src.sMatrix_.get(), + stream); +} + +void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) { + CHECK_EQ(height_, src.getHeight()); + CHECK_EQ(width_, src.getWidth()); + CHECK_EQ(format_, src.getFormat()); + int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0]; + if (format_ == SPARSE_CSR) { + size_t totalColNum = 0; + for (size_t i = 0; i < height_; ++i) { + totalColNum += src.getColNum(i); + } + resize(height_, width_, totalColNum, valueType_, format_); + rows_[0] = 0; + for (size_t i = 0; i < height_; ++i) { + rows_[i + 1] = rows_[i] + src.getColNum(i); + } + memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int)); + } else { + size_t totalColNum = 0; + for (size_t i = 0; i < width_; ++i) { + totalColNum += src.getRowNum(i); + } + resize(height_, width_, totalColNum, valueType_, format_); + cols_[0] = 0; + for (size_t i = 0; i < width_; ++i) { + cols_[i + 1] = cols_[i] + src.getRowNum(i); + } + memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int)); + } + + // if have different value type, only copy rows and cols + if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { + memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real)); + } +} + +void CpuSparseMatrix::copyRow(int offsets, + size_t colNum, + const sparse_non_value_t* row) { + for (size_t j = 0; j < colNum; j++) { + cols_[offsets + j] = row[j].col; + } +} + +void CpuSparseMatrix::copyRow(int offsets, + size_t colNum, + const sparse_float_value_t* row) { + for (size_t j = 0; j < colNum; j++) { + cols_[offsets + j] = row[j].col; + value_[offsets + j] = row[j].value; + } +} + +template +void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) { + size_t totalColNum = 0; + for (size_t i = 0; i < height_; ++i) { + int64_t id = ids[i]; + totalColNum += indices[id + 1] - indices[id]; + } + valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE; + + resize(height_, width_, totalColNum, valueType_, format_); + + rows_[0] = 0; + for (size_t i = 0; i < height_; ++i) { + int64_t id = ids[i]; + T* row = data + indices[id]; + size_t colNum = indices[id + 1] - indices[id]; + rows_[i + 1] = rows_[i] + colNum; + copyRow(rows_[i], colNum, row); + } +} + +template +void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) { + CHECK(format_ == SPARSE_CSR); + size_t totalColNum = indices[height_] - indices[0]; + valueType_ = typeid(T) == typeid(sparse_non_value_t) ? 
NO_VALUE : FLOAT_VALUE; + resize(height_, width_, totalColNum, valueType_, format_); + + rows_[0] = 0; + for (size_t i = 0; i < height_; ++i) { + T* row = data + indices[i]; + size_t colNum = indices[i + 1] - indices[i]; + rows_[i + 1] = rows_[i] + colNum; + copyRow(rows_[i], colNum, row); + } +} + +void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { + CHECK_EQ(height_, src.getHeight()); + CHECK_LE(width_, src.getWidth()); + CHECK_EQ(format_, src.getFormat()); + CHECK_EQ(valueType_, src.getValueType()); + if (format_ == SPARSE_CSR) { + int* srcCols = src.getCols(); + size_t numLessWidth = + std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) { + return n < this->width_; + }); + resize(height_, width_, numLessWidth, valueType_, format_); + rows_[0] = 0; + size_t index = 0; + for (size_t r = 0; r < height_; ++r) { + for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) { + if (srcCols[i] < static_cast(width_)) { + cols_[index] = srcCols[i]; + if (valueType_ == FLOAT_VALUE) { + value_[index] = src.getValue()[i]; + } + ++index; + } + } + rows_[r + 1] = index; + } + CHECK_EQ(index, numLessWidth); + } else { + size_t numLessWidth = src.getCols()[width_] - src.getCols()[0]; + resize(height_, width_, numLessWidth, valueType_, format_); + cols_[0] = 0; + size_t index = 0; + // note: c < width_, not src.getWidth(); + for (size_t c = 0; c < width_; ++c) { + for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) { + rows_[index] = src.getRows()[i]; + if (valueType_ == FLOAT_VALUE) { + value_[index] = src.getValue()[i]; + } + ++index; + } + cols_[c + 1] = index; + } + CHECK_EQ(index, numLessWidth); + } +} + +void CpuSparseMatrix::zeroMem() { + CHECK(valueType_ == FLOAT_VALUE); + memset(value_, 0, elementCnt_ * sizeof(real)); +} + +template void CpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + sparse_non_value_t* data); + +template void CpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + sparse_float_value_t* data); + +template void CpuSparseMatrix::copyFrom(int64_t* indices, + sparse_non_value_t* data); + +template void CpuSparseMatrix::copyFrom(int64_t* indices, + sparse_float_value_t* data); + +void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + maxVal.zeroMem(); + int* outids = maxIds.getData(); + real* outvalues = maxVal.getData(); + + typedef std::pair valuepair; + std::vector vec; + for (size_t i = 0; i < numSamples; i++) { + vec.clear(); + + auto num = getColNum(i); + auto ids = getRowCols(i); + auto values = getRowValues(i); + for (size_t j = 0; j < num; j++) { + vec.push_back(std::make_pair(values[j], ids[j])); + } + + size_t outsize = std::min(num, beam); + std::partial_sort(vec.begin(), + vec.begin() + outsize, + vec.end(), + [](const valuepair& a, const valuepair& b) { + return a.first > b.first; + }); + for (size_t j = 0; j < outsize; j++) { + outids[i * beam + j] = vec[j].second; + outvalues[i * beam + j] = vec[j].first; + } + if (outsize < beam) { + // if the number of values to sort are less than the output size, + // use -1 to indicate the end of valid sorted values. 
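[Illustrative aside, not part of the patch] rowMax() here selects the `beam` largest entries of each sparse row with std::partial_sort and terminates short rows with -1. A self-contained sketch of that per-row selection step (hypothetical names):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Keep the `beam` largest (value, id) pairs, largest first, as rowMax() does
// for each row; write -1 after the last valid id when fewer than beam exist.
void topKSketch(std::vector<std::pair<float, int>>& entries, std::size_t beam,
                int* outIds, float* outValues) {
  std::size_t outsize = std::min(entries.size(), beam);
  std::partial_sort(entries.begin(), entries.begin() + outsize, entries.end(),
                    [](const std::pair<float, int>& a,
                       const std::pair<float, int>& b) { return a.first > b.first; });
  for (std::size_t j = 0; j < outsize; ++j) {
    outIds[j] = entries[j].second;
    outValues[j] = entries[j].first;
  }
  if (outsize < beam) {
    outIds[outsize] = -1;  // mark the end of valid sorted results
  }
}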
+ outids[i * beam + outsize] = -1; + } + } +} + +} // namespace paddle diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/legacy/math/CpuSparseMatrix.h similarity index 100% rename from paddle/math/CpuSparseMatrix.h rename to paddle/legacy/math/CpuSparseMatrix.h diff --git a/paddle/math/ExecViaCpu.h b/paddle/legacy/math/ExecViaCpu.h similarity index 100% rename from paddle/math/ExecViaCpu.h rename to paddle/legacy/math/ExecViaCpu.h diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/legacy/math/MKLDNNMatrix.cpp similarity index 100% rename from paddle/math/MKLDNNMatrix.cpp rename to paddle/legacy/math/MKLDNNMatrix.cpp diff --git a/paddle/legacy/math/MKLDNNMatrix.h b/paddle/legacy/math/MKLDNNMatrix.h new file mode 100644 index 0000000000000000000000000000000000000000..5a0e5f85923dfd822dad4c63679acde63719f217 --- /dev/null +++ b/paddle/legacy/math/MKLDNNMatrix.h @@ -0,0 +1,256 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Matrix.h" +#include "mkldnn.hpp" +#include "paddle/legacy/parameter/Parameter.h" + +namespace paddle { + +class MKLDNNMatrix; +typedef std::shared_ptr MKLDNNMatrixPtr; + +#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...) \ + CHECK(MAT) << " can not be empty."; \ + CHECK(MAT->getPrimitiveDesc() == PD) \ + << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \ + << "" __VA_ARGS__; + +/** + * @brief MKLDNN Matrix. + * + */ +class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { + public: + MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd) + : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false), + mkldnn::memory(pd, m->getData()), + m_(m) {} + + ~MKLDNNMatrix() {} + + /** + * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc + */ + static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, + MatrixPtr m = nullptr); + + /** + * Create MKLDNNMatrix from a MatrixPtr and memory details info + */ + static MKLDNNMatrixPtr create( + mkldnn::memory::dims dims, + mkldnn::memory::format fmt, + mkldnn::engine& eg, + MatrixPtr m = nullptr, + mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + + /** + * Create primitive descriptor. + * default with f32 dtype + */ + static mkldnn::memory::primitive_desc createPrimitiveDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt, + const mkldnn::engine& eg, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); + } + + /** + * Create Memory descriptor. + * default with any format and f32 dtype + */ + static mkldnn::memory::desc createMemoryDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt = mkldnn::memory::format::any, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::desc(dims, dtype, fmt); + } + + /** + * Create reorder primitive. 
+ * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst. + * checkData: whether to check the data handle of src and dst. + * if true, it will check the data and do not allow them equal; + * otherwise, it will not check them, then the reorder created + * may have inplace buffer. + * Do not set false, if you can not guarantee the inplace logical + * would work with your reorder. + */ + static std::shared_ptr createReorder( + const MKLDNNMatrixPtr& src, + const MKLDNNMatrixPtr& dst, + bool checkData = true); + + void copyFrom(const Matrix& src) { + // TODO(TJ): reorder data if this format is not nchw or x + m_->copyFrom(src); + } + + void copyTo(Matrix& dst) { + // TODO(TJ): reorder data if this format is not nchw or x + dst.copyFrom(*m_); + } + + public: + /** + * Reorder this MKLDNNMatrix from other format. + * Support inplace reorder. + * @note: this function would only reorder the data layout. + * will NOT change this original dim or format info + */ + void reorderDataFrom(const MKLDNNMatrixPtr& m, + memory::format srcFmt, + memory::dims targetDim); + + /** + * Reorder this MKLDNNMatrix to other format. + * Support inplace reorder. + * @note: this function would only reorder the data layout. + * will NOT change the dst dim or format info + */ + void reorderDataTo(const MKLDNNMatrixPtr& m, + memory::format dstFmt, + memory::dims targetDim); + + /** + * Dimensionality reduction. + * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1 + */ + void downSpatial(); + + /** + * set the memory data handle. + * Caution: This will not check the buffer size of the data, + * it should be coverd by user. + */ + void setData(real* data) { + set_data_handle(data); + CpuMatrix::setData(data); + m_.reset(); + } + + /** + * override the CpuMatrix::resize + */ + void resize(size_t newHeight, size_t newWidth) override { + m_->resize(newHeight, newWidth); + if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) { + return; + } + CpuMatrix::setData(data_); + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newHeight * newWidth; + stride_ = width_; + auto pd = mkldnn::memory::primitive_desc( + mkldnn::memory::desc({(int)newHeight, (int)newWidth}, + getDtype(), + mkldnn::memory::format::nc), + getEngine()); + resetMKLDNNMemory(pd, data_); + } + + /** + * override Matrix::getData + * check data before return + */ + real* getData() override { + CHECK_EQ((void*)data_, get_data_handle()); + return data_; + } + + const real* getData() const override { + CHECK_EQ((void*)data_, get_data_handle()); + return data_; + } + + /** + * Get primitive descriptor. + */ + mkldnn::memory::primitive_desc getPrimitiveDesc() { + return this->get_primitive_desc(); + } + + /** + * Get memory descriptor. + */ + mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); } + + /** + * Get dimensions. + */ + mkldnn::memory::dims getDims() { + mkldnn::memory::desc md = getMemoryDesc(); + const int* src = md.data.dims; + int ndims = md.data.ndims; + mkldnn::memory::dims dst; + dst.resize(ndims); + for (int i = 0; i < ndims; ++i) { + dst[i] = src[i]; + } + return dst; + } + + /** + * Get format. + */ + mkldnn::memory::format getFormat() { + return (mkldnn::memory::format)(getMemoryDesc().data.format); + } + + /** + * Get memory data type. + */ + mkldnn::memory::data_type getDtype() { + return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type); + } + + /** + * Get engine. 
+ */ + mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); } + + protected: + /** + * Do reorder once. + * Can support inplace. + */ + void reorderOnce(void* srcData, + void* dstData, + memory::format srcFmt, + memory::format dstFmt, + memory::dims dm); + /** + * reset this MKLDNN Memory from primitve desc + */ + void resetMKLDNNMemory(memory::primitive_desc pd, real* data) { + mkldnn_primitive_t result; + mkldnn::error::wrap_c_api( + mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr), + "could not create a memory primitive"); + reset(result); + set_data_handle(data); + } + + private: + // save the CpuMatrixPtr in case the buffer released outside + CpuMatrixPtr m_; +}; + +} // namespace paddle diff --git a/paddle/legacy/math/MathFunctions.cpp b/paddle/legacy/math/MathFunctions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bbf34a32f36fa7988058f8d3bb7f91eaf2bc1ba0 --- /dev/null +++ b/paddle/legacy/math/MathFunctions.cpp @@ -0,0 +1,348 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/math/MathFunctions.h" +#include "hl_matrix_apply.cuh" +#include "hl_matrix_ops.cuh" +#include "paddle/legacy/utils/DynamicLoader.h" + +namespace dynload { + +std::once_flag lapack_dso_flag; +void* lapack_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load lapack routine + * via operator overloading. + * + * note: default dynamic linked libs + */ + +// The argument for stringizing operator is not macro-expanded first. +// We have to use two levels of macro to do the expansion. +// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html +#define STR(x) #x + +// clang-format off +#ifndef LAPACK_FOUND +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using lapack_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \ + void* p_##__name = dlsym(lapack_dso_handle, STR(__name)); \ + CHECK(p_##__name) << "Cannot find symbol " << STR(__name) \ + << " in liblapack.so"; \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +#define PADDLE_SGETRF LAPACKE_sgetrf +#define PADDLE_DGETRF LAPACKE_dgetrf +#define PADDLE_SGETRI LAPACKE_sgetri +#define PADDLE_DGETRI LAPACKE_dgetri + +#define LAPACK_ROUTINE_EACH(__macro) \ + __macro(PADDLE_SGETRF) \ + __macro(PADDLE_DGETRF) \ + __macro(PADDLE_SGETRI) \ + __macro(PADDLE_DGETRI) +// clang-format on + +LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) + +} // namespace dynload + +namespace paddle { + +#ifndef PADDLE_USE_EIGEN_FOR_BLAS +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const int lda, + const float* B, + const int ldb, + const float beta, + float* C, + const int ldc) { + cblas_sgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const double alpha, + const double* A, + const int lda, + const double* B, + const int ldb, + const double beta, + double* C, + const int ldc) { + cblas_dgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} +#endif + +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + float* A, + const int lda, + int* ipiv) { + return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv); +} + +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + double* A, + const int lda, + int* ipiv) { + return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv); +} + +template <> +int getri(const CBLAS_ORDER order, + const int N, + float* A, + const int lda, + const int* ipiv) { + return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv); +} + +template <> +int getri(const CBLAS_ORDER order, + const int N, + double* A, + const int lda, + const int* ipiv) { + return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); +} + +#ifndef PADDLE_USE_EIGEN_FOR_BLAS +template <> +void axpy(const int n, const float alpha, const float* x, float* y) { + cblas_saxpy(n, alpha, x, 1, y, 1); +} + +template <> +void axpy(const int n, const double alpha, const double* x, double* y) { + cblas_daxpy(n, alpha, x, 1, y, 1); +} + +template <> +float dotProduct(const int n, const float* x, const float* y) { + return cblas_sdot(n, x, 1, y, 1); +} + +template <> +double dotProduct(const int n, const double* x, const double* y) { + return cblas_ddot(n, x, 1, y, 1); +} +#endif + +#if defined(PADDLE_WITH_MKLML) + +template <> +void vExp(const int n, const float* a, float* r) { + vsExp(n, a, r); +} + +template <> +void vExp(const int n, const double* a, double* r) { + vdExp(n, a, r); +} + +template <> +void vPow(const int n, const float* a, const float b, float* r) { + vsPowx(n, a, b, r); +} + +template <> +void vPow(const int n, const double* a, const double b, double* r) { + vdPowx(n, a, b, r); +} + +template <> +void vLog(const int n, const float* a, float* r) { + vsLn(n, a, r); +} + +template <> +void vLog(const int n, const double* a, double* r) { + vdLn(n, a, r); +} + +template <> +void vAdd(const int n, const float* a, const float* b, float* r) { + vsAdd(n, a, b, r); +} + +template <> +void vAdd(const int n, const double* a, const double* b, double* r) { + vdAdd(n, a, b, r); +} + +template <> +void vTanh(const int n, const float* a, float* r) { + vsTanh(n, a, 
r); +} + +template <> +void vTanh(const int n, const double* a, double* r) { + vdTanh(n, a, r); +} + +template <> +void vInvSqrt(const int n, const float* a, float* r) { + vsInvSqrt(n, a, r); +} + +template <> +void vInvSqrt(const int n, const double* a, double* r) { + vdInvSqrt(n, a, r); +} + +template <> +void vLog1p(const int n, const float* a, float* r) { + vsLog1p(n, a, r); +} + +template <> +void vLog1p(const int n, const double* a, double* r) { + vdLog1p(n, a, r); +} +#else + +DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); +template +void vExp(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vExp(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); +template +void vLog(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vLog(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); +template +void vPow(const int n, const T* a, const T b, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vPow(b), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); +template +void vAdd(const int n, const T* a, const T* b, T* r) { + hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), + const_cast(a), + const_cast(b), + r, + 1, + n, + n, + n, + n); +} + +DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); +template +void vInvSqrt(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a)); +template +void vLog1p(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vLog1p(), const_cast(a), r, 1, n, n, n); +} + +DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template +void vTanh(const int n, const T* a, T* r) { + hl_cpu_apply_binary_op, 0, 0>( + binary::vTanh(), const_cast(a), r, 1, n, n, n); +} + +template void vExp(const int n, const float* a, float* r); +template void vExp(const int n, const double* a, double* r); +template void vLog(const int n, const float* a, float* r); +template void vLog(const int n, const double* a, double* r); +template void vPow(const int n, const float* a, const float b, float* r); +template void vPow(const int n, const double* a, const double b, double* r); +template void vAdd(const int n, const float* a, const float* b, float* r); +template void vAdd(const int n, const double* a, const double* b, double* r); +template void vInvSqrt(const int n, const double* a, double* r); +template void vInvSqrt(const int n, const float* a, float* r); +template void vLog1p(const int n, const float* a, float* r); +template void vLog1p(const int n, const double* a, double* r); +template void vTanh(const int n, const float* a, float* r); +template void vTanh(const int n, const double* a, double* r); +#endif +} // namespace paddle diff --git a/paddle/math/MathFunctions.h b/paddle/legacy/math/MathFunctions.h similarity index 100% rename from paddle/math/MathFunctions.h rename to paddle/legacy/math/MathFunctions.h diff --git a/paddle/legacy/math/MathUtils.cpp b/paddle/legacy/math/MathUtils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47ac9c187ca731c98c755501ff3633eabf095186 --- /dev/null +++ b/paddle/legacy/math/MathUtils.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
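[Illustrative aside, not part of the patch] The non-MKLML vTanh fallback above computes tanh(a) as 2/(1 + exp(-2a)) - 1 and clips the exponent argument at EXP_MAX_INPUT so exp() cannot overflow for very negative inputs. A scalar sketch of that formula (the clip threshold below is an assumed stand-in for EXP_MAX_INPUT):

#include <cmath>

// Scalar version of the fallback vTanh: tanh(a) = 2 / (1 + exp(-2a)) - 1,
// with the exponent clipped from above to keep exp() finite.
inline float tanhSketch(float a, float expMaxInput = 40.0f) {
  float tmp = -2.0f * a;
  if (tmp > expMaxInput) tmp = expMaxInput;
  return 2.0f / (1.0f + std::exp(tmp)) - 1.0f;
}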
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MathUtils.h" +#include +#include "Vector.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +/*if csc, major is cols and minor is rows, else + * major is rows and minor is cols, according to + * major value to initialize minor value" + */ +void sparseRand( + int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { + CHECK(size_t(nnz) >= size_t(1)); + int* cpuMajor; + int* cpuMinor; + CpuIVector cpuMinorVec(nnz); + CpuIVector cpuMajorVec(majorLen); + if (useGpu) { + cpuMajor = cpuMajorVec.getData(); + cpuMinor = cpuMinorVec.getData(); + } else { + cpuMajor = major; + cpuMinor = minor; + } + + /*major value init*/ + for (int i = 0; i < majorLen - 1; i++) { + cpuMajor[i] = 1.0 * i * nnz / (majorLen - 1); + } + cpuMajor[majorLen - 1] = nnz; + + /*minor value init according to major value*/ + std::vector used(minorMax, 0); + for (int i = 0; i < majorLen - 1; i++) { + CHECK_LE(cpuMajor[i + 1] - cpuMajor[i], minorMax); + used.assign(minorMax, 0); + for (int j = cpuMajor[i]; j < cpuMajor[i + 1]; j++) { + int idx = ::rand() % minorMax; + while (used[idx]) { + idx = ::rand() % minorMax; + } + cpuMinor[j] = idx; + used[idx] = 1; + } + std::sort(cpuMinor + cpuMajor[i], + cpuMinor + cpuMajor[i + 1], + [](int a, int b) { return a < b; }); + } + /*memcpy result to gpu*/ + if (useGpu) { + hl_memcpy_host2device(major, cpuMajor, sizeof(int) * majorLen); + hl_memcpy_host2device(minor, cpuMinor, sizeof(int) * nnz); + } +} + +int outputSize( + int imageSize, int filterSize, int padding, int stride, bool caffeMode) { + int outputSize; + if (!caffeMode) { + outputSize = + (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1; + } else { + outputSize = (imageSize - filterSize + 2 * padding) / stride + 1; + } + CHECK_GE(outputSize, 1); + return outputSize; +} + +int imageSize( + int outputSize, int filterSize, int padding, int stride, bool caffeMode) { + int imageSize; + if (!caffeMode) { + imageSize = + (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1; + } else { + imageSize = (outputSize - 1) * stride + filterSize - 2 * padding; + } + CHECK_GE(imageSize, 1); + return imageSize; +} + +} // namespace paddle diff --git a/paddle/math/MathUtils.h b/paddle/legacy/math/MathUtils.h similarity index 100% rename from paddle/math/MathUtils.h rename to paddle/legacy/math/MathUtils.h diff --git a/paddle/legacy/math/Matrix.cpp b/paddle/legacy/math/Matrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e53f95006c36bfce5df8e57e9efc249f56098b70 --- /dev/null +++ b/paddle/legacy/math/Matrix.cpp @@ -0,0 +1,4787 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
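[Illustrative aside, not part of the patch] outputSize() in MathUtils.cpp above rounds the convolution arithmetic two ways: ceiling-style division when caffeMode is off, floor-style when it is on. A small worked example using the same expressions (function and variable names are hypothetical):

#include <cassert>

// Same arithmetic as outputSize() above.
int convOutputSize(int imageSize, int filterSize, int padding, int stride,
                   bool caffeMode) {
  if (!caffeMode) {
    return (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
  }
  return (imageSize - filterSize + 2 * padding) / stride + 1;
}

int main() {
  // imageSize=10, filter=3, padding=1, stride=2:
  // caffeMode:  (10 - 3 + 2) / 2 + 1     = 5
  // !caffeMode: (10 - 3 + 2 + 1) / 2 + 1 = 6
  assert(convOutputSize(10, 3, 1, 2, true) == 5);
  assert(convOutputSize(10, 3, 1, 2, false) == 6);
  return 0;
}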
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Matrix.h" +#include "MathFunctions.h" +#include "SparseMatrix.h" +#include "SparseRowMatrix.h" + +#include +#include +#include + +#include +#include "hl_cnn.h" +#include "hl_gpu.h" +#include "hl_table_apply.h" +#include "hl_top_k.h" +#include "paddle/legacy/utils/Logging.h" + +#include "NEONFunctions.h" +#include "paddle/legacy/function/GemmFunctor.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +#include "SIMDFunctions.h" + +namespace paddle { + +inline real _pow(real a, real beta) { return std::pow(a, beta); } + +inline real _square(real a) { return a * a; } + +inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; } + +Matrix::Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu) + : BaseMatrix( + height, + width, + memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, + trans, + use_gpu) { + elementCnt_ = width * height; + memoryHandle_ = memHandle; +} + +Matrix::Matrix( + real* data, size_t height, size_t width, bool trans, bool use_gpu) + : BaseMatrix(height, width, data, trans, use_gpu) { + elementCnt_ = width * height; +} + +Matrix::Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu) + : BaseMatrix(height, width, stride, data, trans, use_gpu) { + elementCnt_ = width * height; +} + +MatrixPtr Matrix::createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); + } else { + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); + } +} + +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared( + height, width, nnz, valueType, format, trans); + } else { + return std::make_shared( + height, width, nnz, valueType, format, trans); + } +} + +MatrixPtr Matrix::create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans) { + if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { + return std::make_shared(gpuHandle, height, width, trans); + } else if (auto cpuHandle = + std::dynamic_pointer_cast(memHandle)) { + return std::make_shared(cpuHandle, height, width, trans); + } else { + LOG(FATAL) << "Wrong"; + return nullptr; + } +} + +MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) { + if (useGpu) { + return std::make_shared(height, width, trans); + } else { + return std::make_shared(height, width, trans); + } +} + +MatrixPtr Matrix::create( + real* data, size_t height, size_t width, bool trans, bool useGpu) { + if (useGpu) { + return std::make_shared(data, height, width, trans); + } else { + return std::make_shared(data, height, width, trans); + } +} + +MatrixPtr Matrix::create(real* data, + size_t height, + size_t 
width, + size_t stride, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared(data, height, width, stride, trans); + } else { + return std::make_shared(data, height, width, stride, trans); + } +} + +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + bool trans, + bool useGpu) { + if (useGpu) { + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); + } else { + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); + } +} + +void Matrix::resizeOrCreate( + MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) { + if (!matrix) { + matrix = Matrix::create(height, width, trans, useGpu); + } else { + CHECK_EQ(matrix->useGpu(), useGpu); + matrix->resize(height, width); + } +} + +void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + bool useGpu) { + if (!matrix) { + matrix = Matrix::createSparseMatrix( + height, width, nnz, valueType, format, trans, useGpu); + } else { + CHECK(dynamic_cast(matrix.get()) || + dynamic_cast(matrix.get())); + CHECK_EQ(matrix->useGpu(), useGpu); + matrix->resize(height, width, nnz, valueType, format); + } +} + +void Matrix::reshape(size_t height, size_t width) { + CHECK(isContiguous()); + CHECK(height_ * width_ == height * width); + height_ = height; + width_ = width; + stride_ = width_; +} + +MatrixPtr Matrix::subMatrix(size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol) { + CHECK_LE(startRow, endRow); + CHECK_LE(endRow, getHeight()); + CHECK_LE(startCol, endCol); + CHECK_LE(endCol, getWidth()); + + return Matrix::create(getData() + startRow * getStride() + startCol, + endRow - startRow, + endCol - startCol, + getStride(), + trans_, + useGpu_); +} + +void Matrix::setDiag(real value) { + CHECK(data_ != NULL); + CHECK_EQ(height_, width_); + + zeroMem(); + BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_); + diag.assign(value); +} + +GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans) + : Matrix(std::make_shared(height * width * sizeof(real)), + height, + width, + trans, + true) {} + +GpuMatrix::~GpuMatrix() {} + +void GpuMatrix::zeroMem() { + CHECK(data_ != NULL); + zero(); +} + +void GpuMatrix::resetOne() { + CHECK(data_ != NULL); + one(); +} + +void GpuMatrix::resize(size_t newHeight, size_t newWidth) { + size_t newSize = newHeight * newWidth; + if (NULL == memoryHandle_.get() || + newSize * sizeof(real) > memoryHandle_->getAllocSize()) { + memoryHandle_ = std::make_shared(newSize * sizeof(real)); + data_ = reinterpret_cast(memoryHandle_->getBuf()); + } + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newSize; + stride_ = width_; +} + +real GpuMatrix::getElement(size_t x, size_t y) const { + real elem = 0; + hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real)); + return elem; +} + +real GpuMatrix::getSum() { + CHECK(isContiguous()); + real sum = 0.0f; + hl_vector_sum(data_, &sum, height_ * width_); + return sum; +} + +real GpuMatrix::getMin() { + CHECK(isContiguous()); + auto vec = GpuVector(height_ * width_, data_); + return vec.getMin(); +} + +real GpuMatrix::getMax() { + CHECK(isContiguous()); + auto vec = GpuVector(height_ * width_, data_); + return vec.getMax(); +} + +void GpuMatrix::accumulateColSum(Matrix& src) { + CHECK_EQ(getWidth(), src.getWidth()); + CHECK_EQ(getHeight(), (size_t)1); + sumCols(src, 1.0, 1.0); +} + +real 
GpuMatrix::getAbsSum() { + CHECK(isContiguous()); + real sum = 0.0f; + hl_vector_abs_sum(data_, &sum, height_ * width_); + return sum; +} + +void GpuMatrix::copyFrom(const Matrix& src) { + CHECK(isContiguous()); + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + + if (typeid(src) == typeid(CpuMatrix)) { + hl_memcpy_host2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); + } else if (typeid(src) == typeid(GpuMatrix)) { + hl_memcpy_device2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + CHECK(isContiguous()); + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); +} + +void GpuMatrix::copyFrom(const real* hostSrc, size_t size) { + CHECK(isContiguous()); + CHECK(size <= elementCnt_); + hl_memcpy_host2device(data_, const_cast(hostSrc), sizeof(real) * size); +} + +void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) { + LOG(FATAL) << "not implemented"; +} + +void GpuMatrix::copyFrom(const IVector& src) { + CHECK(isContiguous()); + CpuMatrix matrix(src.getSize(), 1, false); + matrix.copyFrom(src); + copyFrom(matrix); +} + +void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(b.getWidth(), width); + real* dst = getData(); + real* src = b.getData(); + const int* index = rowIndex.getData(); + hl_sequence2batch_copy(dst, src, index, width, height, true); +} + +MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) { + CHECK(isContiguous()); + + if (height == 0 && width == 0) { + height = height_; + width = width_; + } + + CHECK(width && height); + + if (useGpu) { + return std::make_shared(height, width); + } else { + return std::make_shared(height, width); + } +} + +MatrixPtr GpuMatrix::getTranspose() { + if (memoryHandle_.get() != NULL) { + MatrixPtr copy_T( + new GpuMatrix(std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + true)); + return copy_T; + } else { + MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true)); + return copy_T; + } +} + +void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + if (memAlloc) { + matTrans = std::make_shared(width_, height_); + } else { + CHECK(matTrans != NULL); + CHECK_EQ(matTrans->getHeight(), width_); + CHECK_EQ(matTrans->getWidth(), height_); + } + real* dataTrans = matTrans->getData(); + real* data = getData(); + int lda = getStride(); + int ldc = matTrans->getStride(); + + hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc); +} + +void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { + if (memAlloc) { + matRot = std::make_shared(width_, height_); + } else { + CHECK(matRot != NULL); + CHECK_EQ(matRot->getHeight(), width_); + CHECK_EQ(matRot->getWidth(), height_); + } + + real* dataRot = matRot->getData(); + real* data = getData(); + hl_matrix_rotate(data, dataRot, height_, width_, clockWise); +} + +MatrixPtr GpuMatrix::getInverse() { + MatrixPtr matInv; + inverse(matInv, true); + return matInv; +} + +void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { + CHECK_EQ(height_, width_); + + if (memAlloc) { + matInv = std::make_shared(height_, width_); + } else { + CHECK(matInv != NULL); + } + + real* data = getData(); + real* dataInv = matInv->getData(); + int lda = getStride(); 
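[Illustrative aside, not part of the patch] Matrix::subMatrix() and the transpose/inverse calls above all address elements through a row stride (the lda/ldc leading dimensions): a sub-matrix view offsets the base pointer and keeps the parent's stride. A tiny CPU-side sketch of that addressing (hypothetical names):

#include <cstddef>

// Element (r, c) of a row-major matrix whose consecutive rows are `stride`
// floats apart.
inline float elemAt(const float* base, std::size_t stride, std::size_t r,
                    std::size_t c) {
  return base[r * stride + c];
}

// View starting at (startRow, startCol): shift the base pointer and reuse the
// parent's stride, as Matrix::subMatrix() does.
inline const float* subMatrixView(const float* base, std::size_t stride,
                                  std::size_t startRow, std::size_t startCol) {
  return base + startRow * stride + startCol;
}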
+ int ldc = matInv->getStride(); + + hl_matrix_inverse(data, dataInv, height_, lda, ldc); +} + +void GpuMatrix::addBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + BaseMatrix::addBias(b, scale); +} + +void GpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + CHECK_LE(b.getWidth(), getWidth()); + CHECK_EQ(getWidth() % b.getWidth(), 0UL); + hl_matrix_add_shared_bias( + getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale); +} + +void GpuMatrix::collectBias(Matrix& a, real scale) { +#ifdef PADDLE_WITH_CUDA + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(width_, a.getWidth()); + GpuSparseMatrix* sMatPtr = dynamic_cast(&a); + if (!sMatPtr) { + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); + } else { + real* data = getData(); + hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); + hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); + } +#endif +} + +void GpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(a.getWidth() % getWidth(), 0UL); + hl_matrix_collect_shared_bias( + getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale); +} + +void GpuMatrix::sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + + hl_sequence_avg_forward(dst, src, starts, height, width, mode); +} + +void GpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + + hl_sequence_avg_backward(dst, src, starts, height, width, mode); +} + +/* this = scaleAB*(a*b) + scaleT*this */ +void GpuMatrix::mul(const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + + if (!a.isTransposed() && !b.isTransposed()) { + CHECK_EQ(width_, b.width_); + CHECK_EQ(height_, a.height_); + CHECK_EQ(a.width_, b.height_); + } else if (a.isTransposed() && !b.isTransposed()) { + CHECK_EQ(width_, b.width_); + CHECK_EQ(height_, a.width_); + CHECK_EQ(a.height_, b.height_); + } else if (!a.isTransposed() && b.isTransposed()) { + CHECK_EQ(width_, b.height_); + CHECK_EQ(height_, a.height_); + CHECK_EQ(a.width_, b.width_); + } else { + LOG(FATAL) << "Is not supported"; + } + + real* A_d = a.data_; + real* B_d = b.data_; + real* C_d = data_; + int dimM = getHeight(); + int dimN = getWidth(); + int dimK = !a.isTransposed() ? a.width_ : a.height_; + int lda = a.getStride(); + int ldb = b.getStride(); + int ldc = getStride(); + hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; + hl_trans_op_t transb = !b.isTransposed() ? 
HPPL_OP_N : HPPL_OP_T; + + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + scaleAB, + scaleT, + lda, + ldb, + ldc); +} + +void GpuMatrix::mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT) { +#ifdef PADDLE_WITH_CUDA + CHECK(isContiguous()); + CHECK(b.isContiguous()); + CHECK(b.useGpu_ == true) << "Matrix type are not equal"; + CHECK(!trans_ && !b.trans_) << "not supported"; + + if (!a.trans_) { + CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) + << "Matrix dimensions are not equal"; + } else { + CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_) + << "Matrix dimensions are not equal"; + } + hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N; + hl_sparse_matrix_s A_d = a.sMatrix_.get(); + real* B_d = b.data_; + real* C_d = data_; + hl_matrix_csr_mul_dense(A_d, + transA, + B_d, + HPPL_OP_N, + C_d, + height_, + width_, + b.height_, + scaleAB, + scaleT); +#endif +} + +void GpuMatrix::mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT) { +#ifdef PADDLE_WITH_CUDA + CHECK(isContiguous()); + CHECK(a.isContiguous()); + CHECK(a.useGpu_ == true) << "Matrix type are not equal"; + + hl_sparse_matrix_s B_d = b.sMatrix_.get(); + real* A_d = a.data_; + real* C_d = data_; + hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N; + if (!b.trans_) { + CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) + << "Matrix dimensions are not equal"; + } else { + CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_) + << "Matrix dimensions are not equal"; + } + if (b.format_ == SPARSE_CSC) { + hl_matrix_dense_mul_csc(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); + } else { + hl_matrix_dense_mul_csr(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); + } +#endif +} + +/* this = a*b */ +void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); } + +void GpuMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + const auto a_ptr_s = dynamic_cast(&a); + const auto b_ptr_s = dynamic_cast(&b); + + if (a_ptr && b_ptr) { + mul(*a_ptr, *b_ptr, scaleAB, scaleT); + } else if (a_ptr_s && b_ptr) { + mul(*a_ptr_s, *b_ptr, scaleAB, scaleT); + } else if (a_ptr && b_ptr_s) { + mul(*a_ptr, *b_ptr_s, scaleAB, scaleT); + } else { + LOG(FATAL) << "Not supported"; + } +} + +/* this = this* b */ +void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); } + +/* this = scaleAB*(this*b) + scaleT*this */ +void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { + CHECK(dynamic_cast(&b)); + CHECK(!isTransposed()) << "Not supported"; + CHECK(!b.isTransposed()) << "Not supported"; + mul(*this, *dynamic_cast(&b), scaleAB, scaleT); +} + +/* this = a*this */ +void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); } + +/* this = scaleAB*(a*this) + scaleT*this */ +void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { + CHECK(dynamic_cast(&a)); + CHECK(!isTransposed()) << "Not supported"; + CHECK(!a.isTransposed()) << "Not supported"; + mul(*dynamic_cast(&a), *this, scaleAB, scaleT); +} + +void GpuMatrix::selectRows(Matrix& table, IVector& ids) { +#ifdef PADDLE_WITH_CUDA + CHECK(dynamic_cast(&table)); + CHECK(table.useGpu()); + CHECK(ids.useGpu()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), 
table.getWidth()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + real* a = getData(); + size_t tableSize = table.getHeight(); + int* index = ids.getData(); + + hl_matrix_select_rows(a, + stride_, + table.getData(), + table.stride_, + index, + numSamples, + tableSize, + dim); +#endif +} + +void GpuMatrix::addToRows(Matrix& table, IVector& ids) { +#ifdef PADDLE_WITH_CUDA + CHECK(dynamic_cast(&table)); + CHECK(table.useGpu()); + CHECK(ids.useGpu()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), table.getWidth()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + real* a = getData(); + size_t tableSize = table.getHeight(); + int* index = ids.getData(); + + hl_matrix_add_to_rows(table.getData(), + table.stride_, + a, + stride_, + index, + numSamples, + tableSize, + dim); +#endif +} + +void GpuMatrix::colMerge(Matrix& src) { + CHECK(src.height_ == height_); + if (!trans_ && !src.trans_) { + sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0); + } else { + LOG(FATAL) << "Is not supported"; + } +} + +void GpuMatrix::rowSum(Matrix& sum) { + CHECK_EQ(sum.getHeight(), getHeight()); + CHECK_EQ(sum.getWidth(), (size_t)1); + + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); +} + +void GpuMatrix::rowMax(Matrix& max) { + CHECK_EQ(max.getHeight(), getHeight()); + CHECK_EQ(max.getWidth(), (size_t)1); + + max.maxRows(*this); +} + +void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { +#ifdef PADDLE_WITH_CUDA + CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(maxVal.getWidth(), beam); + + hl_matrix_top_k(maxVal.getData(), + maxVal.getStride(), + maxIds.getData(), + this->getData(), + this->getStride(), + this->getWidth(), + beam, + numSamples); +#endif +} + +void GpuMatrix::colMax(Matrix& max) { + CHECK_EQ(max.getWidth(), getWidth()); + CHECK_EQ(max.getHeight(), (size_t)1); + + max.maxCols(*this); +} + +void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "Is not supported"; +} + +void GpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + const real* input = a.getData(); + real* output = getData(); + int* idForGpu = id.getData(); + + hl_maxout_forward( + input, output, idForGpu, batchSize, size, size / channels, groups); +} + +void GpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + real* input = getData(); + const real* output = a.getData(); + const int* idForGpu = id.getData(); + + hl_maxout_backward( + input, output, idForGpu, batchSize, size, size / channels, groups); +} + +/*calulate the error of classification */ +void GpuMatrix::classificationError(Matrix& output, + IVector& label, + size_t topkSize) { + auto gpuOutput = dynamic_cast(&output); + auto gpuLabel = dynamic_cast(&label); + size_t numSamples = this->getHeight(); + GpuMatrixPtr gpuTopVal = std::make_shared(numSamples, topkSize); + GpuIVectorPtr gpuTopIds = std::make_shared(numSamples * topkSize); + + CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer"; + 
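+  // Top-k classification error: for each of the numSamples rows of `output`,
+  // hl_matrix_classification_error picks the topkSize largest activations
+  // (using the gpuTopVal / gpuTopIds scratch buffers allocated above) and
+  // writes a per-sample error indicator into this Nx1 matrix, marking
+  // whether the ground-truth label fell outside that top-k set.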
CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed"; + CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal"; + CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1) + << "Matrix dimensions are not equal"; + + size_t dim = gpuOutput->getWidth(); + hl_matrix_classification_error(gpuTopVal->getData(), + gpuTopVal->getStride(), + gpuTopIds->getData(), + gpuOutput->getData(), + gpuOutput->getStride(), + dim, + topkSize, + numSamples, + gpuLabel->getData(), + this->getData()); +} + +/* copy -log(output[i * width + label]) to this->data[i] */ +void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { + GpuMatrix* output_ptr = dynamic_cast(&output); + GpuIVector* label_ptr = dynamic_cast(&label); + + CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; + + CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_) + << "Matrix dimensions are not equal"; + + real* A_d = output_ptr->data_; + real* C_d = data_; + int* label_d = label_ptr->getData(); + + hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_); +} + +/* calculate the error of outputV according to label */ +void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { + GpuMatrix* output_ptr = dynamic_cast(&outputV); + GpuIVector* label_ptr = dynamic_cast(&label); + + CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; + + CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_) + << "Matrix dimensions are not equal"; + + real* output_d = output_ptr->data_; + real* grad_d = data_; + int* label_d = label_ptr->getData(); + + hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_); +} + +void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; +} + +void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; +} + +void GpuMatrix::softmax(Matrix& output) { + CHECK(output.useGpu()) << "Matrix type are not equal"; + + size_t height = getHeight(); + size_t width = getWidth(); + CHECK(height == output.getHeight() && width == output.getWidth()) + << "Matrix dimensions are not equal"; + + real* inputData = getData(); + real* outputData = output.getData(); + hl_matrix_softmax(inputData, outputData, height, width); +} + +void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { + CHECK_EQ(getWidth(), 1UL); + CHECK_EQ(output.getWidth(), 1UL); + CHECK(isContiguous()); + + real* inputData = getData(); + real* outputData = output.getData(); + auto starts = index.getData(); + int numSequences = index.getSize() - 1; + hl_sequence_softmax_forward(inputData, outputData, starts, numSequences); +} + +void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true) + << "Matrix type are not equal"; + + CHECK(height_ == output.height_ && width_ == output.width_ && + height_ == sftmaxSum.height_) + << "Matrix dimensions are not equal"; + + real* output_d = output.data_; + real* sftmaxSum_d = sftmaxSum.data_; + real* grad_d = data_; + hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_); +} + +void GpuMatrix::softmaxBackward(Matrix& outputV) { + CHECK(outputV.useGpu()) << "Matrix type are not equal"; + + size_t height = getHeight(); + size_t width = getWidth(); + CHECK(height == outputV.getHeight() && width == outputV.getWidth()) + << "Matrix dimensions are not equal"; + + 
real* output_grad = getData(); + real* output_value = outputV.getData(); + hl_softmax_backward(output_value, output_grad, height, width); +} + +void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { + CHECK_EQ(label.getHeight(), height_); + CHECK_EQ(output.getHeight(), height_); + CHECK_EQ(label.getWidth(), output.getWidth()); + CHECK_EQ((size_t)1, width_); + + auto labelptr = dynamic_cast(&label); + if (labelptr) { + LOG(FATAL) << "not supported: GpuSparseMatrix as label"; + } + + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); +} + +void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { + add2(outputV, label, 1, 2, -2); +} + +void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); } + +void GpuMatrix::tanhDerivative(Matrix& output) { + BaseMatrix::tanhDerivative(output); +} + +void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); } + +void GpuMatrix::softreluDerivative(Matrix& output) { + BaseMatrix::softreluDerivative(output); +} + +void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { + BaseMatrix::scaledTanh(output, p1, p2); +} + +void GpuMatrix::randomizeUniform() { + CHECK(isContiguous()); + real* data = data_; + size_t size = height_ * width_; + + hl_rand(data, size); +} + +void GpuMatrix::print(std::ostream& os) const { + CHECK(isContiguous()); + CpuMatrix cpuMat(getHeight(), getWidth()); + cpuMat.copyFrom(*this); + cpuMat.print(os); +} + +void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const { + CHECK(isContiguous()); + CpuMatrix cpuMat(getHeight(), getWidth()); + cpuMat.copyFrom(*this); + cpuMat.print(os, height, width); +} + +void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { + CHECK(isContiguous()); + CHECK(height_ == refMat.getHeight()); + CHECK(width_ == refMat.getWidth()); + CpuMatrix cpuRef(height_, width_); + GpuMatrix gpuRef(height_, width_); + cpuRef.copyFrom(refMat); + gpuRef.copyFrom(*this); + size_t diffCnt = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + real a = gpuRef.getElement(i, j); + real b = cpuRef.getElement(i, j); + if (fabs(a - b) > 0.00001) { + ++diffCnt; + if (printDiff) { + os << "ref= " << a << " check= " << b << std::endl; + } + } + } + } + LOG(INFO) << "the diffCnt is " << diffCnt; +} + +void GpuMatrix::upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + CHECK(input.useGpu_ == true) << "Matrix type are not equal"; + CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = input.getData(); + real* maskData = mask.getData(); + real* outData = data_; + + size_t batch = input.getHeight(); + + CHECK(imgSizeH * imgSizeW * channels == input.getWidth()); + CHECK(imgSizeH * imgSizeW * channels == mask.getWidth()); + CHECK_EQ(batch, this->getHeight()); + CHECK(width_ == outputH * outputW * channels); + hl_upsample_forward(inputData, + maskData, + batch, + imgSizeH, + imgSizeW, + channels, + outputH, + outputW, + outData); +} + +void GpuMatrix::upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal"; + CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; + + real* outputGradData = outputGrad.getData(); + real* maskData = mask.getData(); + real* inputGradData = data_; + size_t batch = outputGrad.getHeight(); 
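+  // Backward of mask-driven upsampling: every element of outputGrad is routed
+  // back to the single input position that `mask` recorded during the forward
+  // pass and written into this matrix, the input gradient. The CpuMatrix
+  // version later in this file spells out the per-element indexing.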
+ + CHECK(imgSizeH * imgSizeW == this->getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth()); + hl_upsample_backward(outputGradData, + maskData, + batch, + imgSizeH, + imgSizeW, + channels, + outputH, + outputW, + inputGradData); +} + +void GpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP) { + CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + real* maskData = NULL; + size_t frameNum = inputMat.getHeight(); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputH * outputW * channels); + + if (maskMatP != NULL) { + CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal"; + CHECK(outputH * outputW * channels == maskMatP->getWidth()); + maskData = maskMatP->getData(); + } + + hl_maxpool_forward(frameNum, + inputData, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride(), + maskData); +} + +void GpuMatrix::maxPoolBackward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && + outV.useGpu_ == true) + << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + real* outData = outV.getData(); + real* outDiff = outGrad.getData(); + size_t frameNum = inputMat.getHeight(); + size_t channels = outV.getWidth() / outputH / outputW; + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(outGrad.getHeight() == outV.getHeight() && + outGrad.getWidth() == outV.getWidth()); + + hl_maxpool_backward(frameNum, + inputData, + outData, + outDiff, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride()); +} + +void GpuMatrix::avgPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + size_t frameNum = inputMat.getHeight(); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputH * outputW * channels); + + hl_avgpool_forward(frameNum, + inputData, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride(), + excludeMode); +} + +void GpuMatrix::avgPoolBackward(Matrix& outGrad, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + CHECK(outGrad.useGpu_ == true) << "Matrix type are 
not equal"; + + real* outDiff = outGrad.getData(); + size_t frameNum = outGrad.getHeight(); + size_t channels = outGrad.getWidth() / outputH / outputW; + CHECK(imgSizeH * imgSizeW * channels == width_); + CHECK(height_ == outGrad.getHeight()); + CHECK(outGrad.getWidth() == outputH * outputW * channels); + + hl_avgpool_backward(frameNum, + outDiff, + channels, + imgSizeH, + imgSizeW, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride(), + excludeMode); +} + +void GpuMatrix::maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_) << "Matrix type are not correct"; + + real* inputData = inputMat.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t num = inputMat.getHeight(); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputD * outputH * outputW * channels); + + hl_maxpool3D_forward(num, + inputData, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData(), + maxPoolIdxData, + getStride()); +} + +void GpuMatrix::maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal"; + + real* outDiff = outGrad.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t frameNum = getHeight(); + size_t channels = outGrad.getWidth() / outputD / outputH / outputW; + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); + CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() && + outGrad.getWidth() == maxPoolIdx.getWidth()); + + hl_maxpool3D_backward(frameNum, + outDiff, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + getData(), + maxPoolIdxData, + outGrad.getStride()); +} + +void GpuMatrix::avgPool3DForward(Matrix& inputMat, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + size_t frameNum = inputMat.getHeight(); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputD * outputH * outputW * channels); + + hl_avgpool3D_forward(frameNum, + inputData, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + 
paddingD, + paddingH, + paddingW, + getData(), + getStride()); +} + +void GpuMatrix::avgPool3DBackward(Matrix& outGrad, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + CHECK(outGrad.useGpu_) << "Matrix type are not equal"; + + real* outDiff = outGrad.getData(); + size_t frameNum = outGrad.getHeight(); + size_t channels = outGrad.getWidth() / outputD / outputH / outputW; + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_); + CHECK(height_ == outGrad.getHeight()); + CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels); + + hl_avgpool3D_backward(frameNum, + outDiff, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + getData(), + outGrad.getStride()); +} + +void GpuMatrix::maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast(&input)); + CHECK(dynamic_cast(&sequence)); + CHECK(dynamic_cast(&index)); + + real* outData = getData(); + real* inputData = input.getData(); + const int* starts = sequence.getData(); + int* maxIndex = index.getData(); + size_t numSequences = getHeight(); + size_t dim = getWidth(); + + CHECK_EQ(dim, input.getWidth()); + CHECK_EQ(numSequences, sequence.getSize() - 1); + CHECK_EQ(numSequences * dim, index.getSize()); + + hl_max_sequence_forward( + inputData, starts, outData, maxIndex, numSequences, dim); +} + +void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast(&outputGrad)); + CHECK(dynamic_cast(&sequence)); + CHECK(dynamic_cast(&index)); + + real* inputGrad = getData(); + real* outGrad = outputGrad.getData(); + int* maxIndex = index.getData(); + size_t dim = getWidth(); + size_t numSequences = sequence.getSize() - 1; + + CHECK_EQ(dim, outputGrad.getWidth()); + CHECK_EQ(numSequences, outputGrad.getHeight()); + CHECK_EQ(numSequences * dim, index.getSize()); + + hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); +} + +void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { + CHECK(data.useGpu_ == true && W.useGpu_ == true) + << "Matrix type are not equal"; + real* input = data.getData(); + real* w = W.getData(); + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + real* output = getData(); + hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); +} + +void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { + CHECK(oGrad.useGpu_ == true && data.useGpu_ == true) + << "Matrix type are not equal"; + real* ograd = oGrad.getData(); + real* input = data.getData(); + real* wgrad = data_; + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = this->getHeight() * this->getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + hl_param_relu_backward_w( + wgrad, ograd, input, numElements, numSamples, partial_sum); +} + +void 
GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + real* diff = data_; + real* input = data.getData(); + real* ograd = oGrad.getData(); + real* w = W.getData(); + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + hl_param_relu_backward_diff( + ograd, input, w, diff, numElements, numSamples, partial_sum); +} + +void GpuMatrix::addColumnVector(const Matrix& b) { + BaseMatrix::addColVector(const_cast(b)); +} + +void GpuMatrix::bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&in)); + + const size_t outputW = getWidth(); + const size_t outputH = getHeight(); + const size_t inputW = in.getWidth(); + const size_t inputH = in.getHeight(); + + real* outData = getData(); + const real* inData = in.getData(); + + if (inImgH == outImgW && inImgW == outImgW) { + this->copyFrom(in); + } else { + hl_bilinear_forward(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); + } +} + +void GpuMatrix::bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&out)); + + const size_t inputW = getWidth(); + const size_t inputH = getHeight(); + const size_t outputW = out.getWidth(); + const size_t outputH = out.getHeight(); + + real* inGrad = getData(); + const real* outGrad = out.getData(); + + if (outImgH == inImgH && outImgW == inImgW) { + this->add(const_cast(out)); + } else { + hl_bilinear_backward(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); + } +} + +void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { +#ifdef PADDLE_WITH_CUDA + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == 1 && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; + + real* output_d = outputPtr->data_; + real* entropy_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy( + output_d, entropy_d, mat_d, height_, outputPtr->width_); +#endif +} + +void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { +#ifdef PADDLE_WITH_CUDA + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; + + real* output_d = outputPtr->data_; + real* grad_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + 
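+  // Backward of the multi-binary-label cross entropy above: the kernel call
+  // below fills this matrix with dLoss/dOutput, treating each output element
+  // as an independent binary probability and the sparse CSR row of `label`
+  // as the positive-class mask (conceptually -1/o for positive classes and
+  // 1/(1 - o) for the rest).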
hl_matrix_multi_binary_cross_entropy_bp( + output_d, grad_d, mat_d, height_, width_); +#endif +} + +void GpuMatrix::vol2Col(real* dataSrc, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW) { + hl_matrix_vol2Col(dataSrc, + channels, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData()); +} + +void GpuMatrix::col2Vol(real* dataDst, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta) { + hl_matrix_col2Vol(dataDst, + channels, + depth, + height, + width, + filterD, + filterH, + filterW, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + getData(), + alpha, + beta); +} + +/** + * CpuMatrix + */ + +CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans) + : Matrix(std::make_shared(height * width * sizeof(real)), + height, + width, + trans, + false) {} + +CpuMatrix::~CpuMatrix() {} + +void CpuMatrix::zeroMem() { + CHECK(data_ != NULL); + if (isContiguous()) { + memset(data_, 0, height_ * width_ * sizeof(real)); + } else { + BaseMatrix::zero(); + } +} +void CpuMatrix::resetOne() { + CHECK(data_ != NULL); + BaseMatrix::one(); +} + +void CpuMatrix::copyFrom(const Matrix& src) { + CHECK(isContiguous()); + if (typeid(src) == typeid(GpuMatrix)) { + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + hl_memcpy_device2host( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); + } else if (typeid(src) == typeid(CpuMatrix) || + typeid(src) == typeid(SharedCpuMatrix)) { + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + memcpy(data_, src.getData(), sizeof(real) * elementCnt_); + } else if (typeid(src) == typeid(CpuSparseMatrix)) { + CHECK_GE(elementCnt_, src.getElementCnt()); + copyFrom(dynamic_cast(const_cast(src))); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void CpuMatrix::copyFrom(CpuSparseMatrix& src) { + CHECK(isContiguous()); + CHECK(height_ == src.getHeight()); + CHECK(width_ == src.getWidth()); + memset(data_, 0, sizeof(real) * height_ * width_); + if (src.getValueType() == FLOAT_VALUE) { + if (src.getFormat() == SPARSE_CSC) { + int* rows = src.getRows(); + real* vals = src.getValue(); + for (size_t i = 0; i < width_; i++) { + for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); + j++) { + data_[rows[j] * width_ + i] = vals[j]; + } + } + } else { + int* cols = src.getCols(); + real* vals = src.getValue(); + for (size_t i = 0; i < height_; i++) { + for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); + j++) { + data_[i * width_ + cols[j]] = vals[j]; + } + } + } + } else { + if (src.getFormat() == SPARSE_CSC) { + int* rows = src.getRows(); + for (size_t i = 0; i < width_; i++) { + for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); + j++) { + data_[rows[j] * width_ + i] = 1.0; + } + } + } else { + int* cols = src.getCols(); + for (size_t i = 0; i < height_; i++) { + for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); + j++) { + data_[i * width_ + cols[j]] = 1.0; + } + } + } + } +} + +void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + CHECK(isContiguous()); + CHECK(src.isContiguous()); + CHECK(elementCnt_ == src.getElementCnt()); + if 
(typeid(src) == typeid(GpuMatrix)) { + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); + // There is a need to add synchronization to ensure that the data is copied. + hl_stream_synchronize(stream); + } else if (typeid(src) == typeid(CpuMatrix)) { + memcpy(data_, src.getData(), sizeof(real) * elementCnt_); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) { + CHECK(isContiguous()); + CHECK(size <= elementCnt_); + memcpy(data_, cpuSrc, sizeof(real) * size); +} + +void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) { + CHECK(isContiguous()); + for (size_t i = 0; i < height_; i++) { + memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_); + } +} + +void CpuMatrix::copyFrom(const IVector& src) { + CHECK(isContiguous()); + CHECK(elementCnt_ == src.getSize()) + << "the src and dst should have same size."; + const int* cpuSrc = NULL; + IVectorPtr tmp; + if (src.useGpu()) { + CpuIVector tmp(src.getSize()); + tmp.copyFrom(src); + cpuSrc = tmp.getData(); + } else { + cpuSrc = src.getData(); + } + for (size_t i = 0; i < elementCnt_; ++i) { + data_[i] = cpuSrc[i]; + } +} + +void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(b.getWidth(), width); + const int* index = rowIndex.getData(); + for (size_t i = 0; i < height; i++) { + CHECK_LT(static_cast(index[i]), b.getHeight()); + real* src = b.getData() + index[i] * width; + real* dst = getData() + i * width; + memcpy(dst, src, sizeof(real) * width); + } +} + +MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) { + CHECK(isContiguous()); + + if (height == 0 && width == 0) { + height = height_; + width = width_; + } + + CHECK(width && height); + + if (useGpu) { + return std::make_shared(height, width); + } else { + return std::make_shared(height, width); + } +} + +void CpuMatrix::resize(size_t newHeight, size_t newWidth) { + size_t newSize = newHeight * newWidth; + if (NULL == memoryHandle_.get() || + newSize * sizeof(real) > memoryHandle_->getAllocSize()) { + memoryHandle_ = std::make_shared(newSize * sizeof(real)); + data_ = reinterpret_cast(memoryHandle_->getBuf()); + } + + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newSize; + stride_ = width_; +} + +real CpuMatrix::getElement(size_t x, size_t y) const { + return data_[x * stride_ + y]; +} + +real CpuMatrix::getSum() { + CHECK(isContiguous()); + double sum = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + sum += data_[i * width_ + j]; + } + } + return sum; +} + +void CpuMatrix::accumulateColSum(Matrix& src) { + CHECK_EQ(getWidth(), src.getWidth()); + CHECK_EQ(getHeight(), (size_t)1); + + sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1); +} + +real CpuMatrix::getAbsSum() { + CHECK(isContiguous()); + double sum = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + sum += fabs(data_[i * width_ + j]); + } + } + return sum; +} + +MatrixPtr CpuMatrix::getTranspose() { + if (memoryHandle_.get() != NULL) { + return std::make_shared( + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + true); + } else { + MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true)); + return copy_T; + } +} + +void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + if (memAlloc) { + matTrans = std::make_shared(width_, height_); + } else { + CHECK(matTrans != NULL); + 
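+  // When memAlloc is false the caller supplies a pre-sized destination and
+  // the checks below only validate its shape. The transpose itself is the
+  // plain O(height * width) index-swapping loop that follows, reading with
+  // stride lda and writing with stride ldc, so non-contiguous views work too.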
CHECK_EQ(matTrans->getHeight(), width_); + CHECK_EQ(matTrans->getWidth(), height_); + } + real* dataTrans = matTrans->getData(); + real* data = getData(); + int lda = getStride(); + int ldc = matTrans->getStride(); + + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + dataTrans[j * ldc + i] = data[i * lda + j]; + } + } +} + +void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { + if (memAlloc) { + matRot = std::make_shared(width_, height_); + } else { + CHECK(matRot != NULL); + CHECK_EQ(matRot->getHeight(), width_); + CHECK_EQ(matRot->getWidth(), height_); + } + real* dataRot = matRot->getData(); + real* data = getData(); + + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + if (clockWise) { + dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j]; + } else { + dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)]; + } + } + } +} + +MatrixPtr CpuMatrix::getInverse() { + MatrixPtr matInv; + inverse(matInv, true); + return matInv; +} + +void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { + CHECK_EQ(height_, width_); + + if (memAlloc) { + matInv = std::make_shared(height_, width_); + } else { + CHECK(matInv != NULL); + } + + CHECK_EQ(height_, matInv->getHeight()); + CHECK_EQ(width_, matInv->getWidth()); + matInv->copyFrom(*this); + + real* data = getData(); + real* dataInv = matInv->getData(); + int ldc = matInv->getStride(); + + if (height_ == 1) { + CHECK_NE(*data, 0); + *dataInv = 1.0 / (*data); + return; + } + + /* Compute the LU decomposition of the matrix */ + std::vector ipiv(height_); + CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor); + int info = getrf(order, height_, height_, dataInv, ldc, ipiv.data()); + CHECK_EQ(info, 0); + + /* Compute the inverse of the matrix given its LU decompsotion */ + info = getri(order, height_, dataInv, ldc, ipiv.data()); + CHECK_EQ(info, 0); +} + +void CpuMatrix::upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + real* inputData = input.getData(); + real* maskData = mask.getData(); + real* outData = data_; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t batch = input.getHeight(); + CHECK(inLength == input.getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + + for (size_t k = 0; k < batch; k++) { + for (size_t c = 0; c < channels; c++) { + for (size_t i = 0; i < inLength; i++) { + size_t out_index = static_cast(maskData[i]); + if (out_index >= outLength) { + LOG(FATAL) << "upsample index " << out_index << " out of range."; + } + outData[out_index] = inputData[i]; + } + inputData += inLength; + maskData += inLength; + outData += outLength; + } + } +} + +void CpuMatrix::upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + real* outputGradData = outputGrad.getData(); + real* maskData = mask.getData(); + real* inputGradData = data_; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t batch = outputGrad.getHeight(); + CHECK(inLength == this->getWidth() / channels); + CHECK_EQ(batch, this->getHeight()); + CHECK_EQ(channels * outLength, outputGrad.getWidth()); + + for (size_t k = 0; k < batch; k++) { + for (size_t c = 0; c < channels; c++) { + for (size_t i = 0; i < inLength; i++) { + size_t 
out_index = static_cast(maskData[i]); + if (out_index >= outLength) { + LOG(FATAL) << "upsample index " << out_index << " out of range."; + } + inputGradData[i] = outputGradData[out_index]; + } + inputGradData += inLength; + maskData += inLength; + outputGradData += outLength; + } + } +} + +void CpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP) { + real* inputData = inputMat.getData(); + real* outData = data_; + real* maskData = NULL; + size_t num = inputMat.getHeight(); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength == inputMat.getWidth() / channels); + CHECK_EQ(num, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + size_t outStride = getStride(); + + if (maskMatP != NULL) { + maskData = maskMatP->getData(); + CHECK_EQ(channels * outLength, maskMatP->getWidth()); + } + + /* pool max one by one */ + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = data_ + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { // channel by channel + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = hstart + sizeY; + hstart = hstart < 0 ? 0 : hstart; + hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = wstart + sizeX; + wstart = wstart < 0 ? 0 : wstart; + wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; + + real maxval = -(real)FLT_MAX; + int max_index = -1; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (maxval < inputData[h * imgSizeW + w]) { + maxval = inputData[h * imgSizeW + w]; + max_index = h * imgSizeW + w; + } + } + } + + outData[ph * outputW + pw] = maxval; + if (maskData != NULL) maskData[ph * outputW + pw] = max_index; + } + } + // compute offset + inputData += inLength; + outData += outLength; + + if (maskData != NULL) maskData += outLength; + } + } +} + +void CpuMatrix::maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + size_t num = image.getHeight(); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t channels = size_t(width_ / inLength); + CHECK(image.getWidth() == inLength * channels); + CHECK(image.getHeight() == height_ && image.getWidth() == width_); + CHECK(outV.getHeight() == outGrad.getHeight() && + outV.getWidth() == outGrad.getWidth()); + + real* tgtGrad = data_; + real* inData = image.getData(); + real* otData = outV.getData(); + real* otGrad = outGrad.getData(); + + size_t outStride = outV.getStride(); + real* origOutData = otData; + real* origOutGrad = otGrad; + + for (size_t n = 0; n < num; ++n) { + if (!outV.isContiguous()) { + otData = origOutData + n * outStride; + otGrad = origOutGrad + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; 
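+          // The inner loops below route this output cell's gradient to the
+          // input positions that attained the pooled maximum: the comparison
+          // (inData[...] == otData[...]) acts as a 0/1 argmax indicator, so
+          // every position tying with the maximum receives the contribution,
+          // blended with the existing gradient via scaleTargets/scaleOutput.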
+ int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtGrad[h * imgSizeW + w] = + scaleTargets * tgtGrad[h * imgSizeW + w] + + scaleOutput * otGrad[ph * outputW + pw] * + (inData[h * imgSizeW + w] == otData[ph * outputW + pw]); + } + } + } + } + // offset + inData += inLength; + tgtGrad += inLength; + otData += outLength; + otGrad += outLength; + } + } +} + +void CpuMatrix::avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + // The main loop + size_t num = input.getHeight(); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength * channels == input.getWidth()); + CHECK(outLength * channels * num == height_ * width_); + real* tgtData = data_; + real* inData = input.getData(); + + for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + tgtData[ph * outputW + pw] = 0; // clear + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; + } + } + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; + CHECK(poolSize); + tgtData[ph * outputW + pw] /= poolSize; + } + } + // compute offset + inData += inLength; + tgtData += outLength; + } + } +} + +void CpuMatrix::avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode) { + size_t num = input.getHeight(); + size_t channels = input.getWidth() / outputH / outputW; + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength * channels == getWidth()); + real* inData = input.getData(); + real* outData = getData(); + + for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; + CHECK(poolSize); + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize; + } + } + } + } + // offset + outData += inLength; + inData += outLength; + } + } +} + +void CpuMatrix::maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + real* inputData = inputMat.getData(); + real* outData = getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t num = inputMat.getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + CHECK(inLength == inputMat.getWidth() / channels); + CHECK_EQ(num, this->getHeight()); + CHECK_EQ(channels * outLength, this->getWidth()); + size_t outStride = getStride(); + + /* initialize the data_ */ + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + outData[(i)*outStride + j] = -(real)FLT_MAX; + maxPoolIdxData[(i)*outStride + j] = -1; + } + } + + /* pool max one by one */ + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = getData() + n * outStride; + maxPoolIdxData = maxPoolIdx.getData() + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { // channel by channel + for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + int maxIdx = -1; + real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (maxOutData < + inputData[(d * imgSizeH + h) * imgSizeW + w]) { + maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w]; + maxIdx = (d * imgSizeH + h) * imgSizeW + w; + } + } + } + } + outData[(pd * outputH + ph) * outputW + pw] = maxOutData; + maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx; + } + } + } + // compute offset + inputData += inLength; + outData += outLength; + maxPoolIdxData += outLength; + } + } +} + +void CpuMatrix::maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + size_t num = getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + size_t channels = size_t(width_ / inLength); + CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() && + maxPoolIdx.getWidth() == outGrad.getWidth()); + + real* tgtGrad = getData(); + real* otGrad = outGrad.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t outStride = outGrad.getStride(); + + 
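+  // Unlike the 2D max-pool backward above, which re-derives the argmax by
+  // comparing input and output values, the 3D path reuses the indices saved
+  // in maxPoolIdx during the forward pass: each output gradient below is
+  // scattered to the recorded input offset of the current channel and
+  // blended using scaleTargets/scaleOutput.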
for (size_t n = 0; n < num; ++n) { + if (!outGrad.isContiguous()) { + otGrad = outGrad.getData() + n * outStride; + maxPoolIdxData = maxPoolIdx.getData() + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + for (size_t ph = 0; ph < outputH; ++ph) { + for (size_t pw = 0; pw < outputW; ++pw) { + const size_t index = (pd * outputH + ph) * outputW + pw; + const size_t tgtIdx = static_cast(maxPoolIdxData[index]); + tgtGrad[tgtIdx] = + scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index]; + } + } + } + // offset + tgtGrad += inLength; + otGrad += outLength; + maxPoolIdxData += outLength; + } + } +} + +void CpuMatrix::avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + // The main loop + size_t num = input.getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + CHECK(inLength * channels == input.getWidth()); + CHECK(outLength * channels * num == height_ * width_); + real* tgtData = getData(); + real* inData = input.getData(); + + for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + + tgtData[(pd * outputH + ph) * outputW + pw] = 0; // clear + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtData[(pd * outputH + ph) * outputW + pw] += + inData[(d * imgSizeH + h) * imgSizeW + w]; + } + } + } + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + CHECK(poolSize); + tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize; + } + } + } + // compute offset + inData += inLength; + tgtData += outLength; + } + } +} + +void CpuMatrix::avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + size_t num = input.getHeight(); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + size_t channels = input.getWidth() / outLength; + CHECK(inLength * channels == getWidth()); + real* inData = input.getData(); + real* outData = getData(); + + for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); + for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * 
strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); + for (size_t pw = 0; pw < outputW; ++pw) { + int wstart = pw * strideW - paddingW; + int wend = std::min(wstart + sizeX, imgSizeW); + wstart = std::max(wstart, 0); + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + CHECK(poolSize); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + outData[(d * imgSizeH + h) * imgSizeW + w] += + inData[(pd * outputH + ph) * outputW + pw] / poolSize; + } + } + } + } + } + } + // offset + outData += inLength; + inData += outLength; + } + } +} + +/** + * Input: one or more sequences. Each sequence contains some instances. + * Output: output size is the number of input sequences (NOT input instances). + * output[i] is set to max_{for each instance in this sequence}{input[i]} + */ +void CpuMatrix::maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast(&input)); + CHECK(dynamic_cast(&sequence)); + CHECK(dynamic_cast(&index)); + + real* outData = getData(); + real* inputData = input.getData(); + const int* starts = sequence.getData(); + int* maxIndex = index.getData(); + size_t numSequences = getHeight(); + size_t dim = getWidth(); + + CHECK_EQ(dim, input.getWidth()); + CHECK_EQ(numSequences, sequence.getSize() - 1); + CHECK_EQ(starts[numSequences], (int)input.getHeight()); + CHECK_EQ(numSequences * dim, index.getSize()); + + for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { + // current sequence, loop for each input instance + // (1) first instance: do not need compare, copy value to outV directly + for (size_t k = 0; k < dim; ++k) { + outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k]; + maxIndex[sequenceId * dim + k] = starts[sequenceId]; + } + // (2) other instance in same sequence + for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1]; + ++insId) { + // insId is the index on all instances + for (size_t k = 0; k < dim; ++k) { + // for each dim + if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) { + // update max value and record index + outData[sequenceId * dim + k] = inputData[insId * dim + k]; + maxIndex[sequenceId * dim + k] = insId; + } + } + } + } +} + +void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + CHECK(dynamic_cast(&outputGrad)); + CHECK(dynamic_cast(&sequence)); + CHECK(dynamic_cast(&index)); + + real* inputGrad = getData(); + real* outGrad = outputGrad.getData(); + int* maxIndex = index.getData(); + size_t dim = getWidth(); + size_t numSequences = sequence.getSize() - 1; + + CHECK_EQ(dim, outputGrad.getWidth()); + CHECK_EQ(numSequences, outputGrad.getHeight()); + CHECK_EQ(numSequences * dim, index.getSize()); + + for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { + // current sequence + for (size_t j = 0; j < dim; ++j) { + // each dim + int insId = maxIndex[sequenceId * dim + j]; + inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j]; + } + } +} + +inline void vecAddTo(real* a, const real* b, size_t len) { + for (unsigned int i = 0; i < len; ++i) { + a[i] += b[i]; + } +} + +inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { + for (unsigned int i = 0; i < len; ++i) { + a[i] += scaleB * b[i]; + } +} + +inline void colVecAddTo( + real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) { + for (unsigned int i = 0; i < len; ++i) { 
+ a[i * aWidth] += b[i * bWidth]; + } +} + +inline void colVecAddTo( + real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { + for (unsigned int i = 0; i < len; ++i) { + a[i * aWidth] += b[i * bWidth] * c; + } +} + +void CpuMatrix::addBias(Matrix& b, real scale) { + CHECK(b.useGpu_ == false) << "Matrix type are not equal"; + + CHECK_EQ(b.getHeight(), (size_t)1); + CHECK_EQ(width_, b.getWidth()); + real* aData = getData(); + real* bData = b.getData(); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + + if (scale == 1 && getStride() % 32 == 0) { // use libaddto + // @TODO(yuyang18) Make input addr can be unaligned. + // So merge this if and else + CHECK_EQ((size_t)aData % 32, 0UL); + CHECK_EQ((size_t)bData % 32, 0UL); + for (size_t i = 0; i < numSamples; i++) { + simd::addTo(aData + i * getStride(), bData, dim); + } + } else { + for (size_t i = 0; i < numSamples; i++) { + for (size_t j = 0; j < dim; j++) { + aData[i * getStride() + j] += scale * bData[j]; + } + } + } +} + +void CpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK_EQ(b.getHeight(), (size_t)1); + real* aData = getData(); + real* bData = b.getData(); + size_t numSamples = getHeight(); + size_t channel = b.getWidth(); + CHECK_EQ(getWidth() % channel, 0UL); + size_t dim = getWidth() / channel; + + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + aData[i * getStride() + c * dim + j] += scale * bData[c]; + } + } + } +} + +void CpuMatrix::collectBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(width_, a.getWidth()); + CpuSparseMatrix* aptr = dynamic_cast(&a); + if (!aptr) { + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); + } else { + size_t nnz = aptr->getElementCnt(); + int* cols = aptr->getCols(); + real* A = aptr->getValue(); + real* B = getData(); + for (size_t i = 0; i < nnz; i++) { + B[cols[i]] += scale * A[i]; + } + } +} + +void CpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + real* B = getData(); + real* A = a.getData(); + size_t numSamples = a.getHeight(); + size_t channel = getWidth(); + CHECK_EQ(a.getWidth() % channel, 0UL); + size_t dim = a.getWidth() / channel; + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + B[c] += scale * A[i * channel * dim + c * dim + j]; + } + } + } +} + +void CpuMatrix::sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); + MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); + for (size_t i = 0; i < height; i++) { + int sequenceLength = starts[i + 1] - starts[i]; + if (0 == sequenceLength) { + // empty sequence + continue; + } + outMtx->setData(dst + i * width); + dataMtx->setData(src + starts[i] * width, sequenceLength, width); + if (mode == 0) { + // plain average + outMtx->sumCols(*dataMtx, + (real)1 / (real)sequenceLength, + /* scaleDest= */ 1); + } else if (mode == 1) { + // sum instead of average + outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1); + } else if (mode == 2) { + // divide by square root of sequenceLength + outMtx->sumCols(*dataMtx, + (real)1 / std::sqrt(sequenceLength), + 
/* scaleDest= */ 1); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + +void CpuMatrix::sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + size_t height = a.getHeight(); + size_t width = getWidth(); + CHECK_EQ(height, startsPos.getSize() - 1); + CHECK_EQ(width, a.getWidth()); + real* dst = getData(); + real* src = a.getData(); + const int* starts = startsPos.getData(); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); + MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); + for (size_t i = 0; i < height; ++i) { + int sequenceLength = starts[i + 1] - starts[i]; + if (0 == sequenceLength) { + // empty sequence + continue; + } + outMtx->setData(dst + starts[i] * width, sequenceLength, width); + dataMtx->setData(src + i * width); + if (mode == 0) { + // plain average + outMtx->addBias(*dataMtx, 1.0f / sequenceLength); + } else if (mode == 1) { + // sum instead of average + outMtx->addBias(*dataMtx, 1.0f); + } else if (mode == 2) { + // divide by square root of sequenceLength + outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength)); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + +/* this = scaleAB*(a*b) + scaleT*this*/ +void CpuMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + const auto a_ptr_s = dynamic_cast(&a); + const auto b_ptr_s = dynamic_cast(&b); + + if (a_ptr && b_ptr) { + mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT); + } else if (a_ptr_s && b_ptr) { + mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT); + } else if (a_ptr && b_ptr_s) { + mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT); + } else { + LOG(FATAL) << "Not supported"; + } +} + +void CpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + if (dynamic_cast(b)) { + return mul(a, dynamic_cast(b), this, scaleAB, scaleT); + } else if (dynamic_cast(b)) { + return mul(a, dynamic_cast(b), this, scaleAB, scaleT); + } else { + return mul(a, b, this, scaleAB, scaleT); + } +} + +void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + + size_t a_col, b_col, a_row, b_row; + bool a_trans, b_trans; + if (!a->isTransposed()) { + a_col = a->getWidth(); + a_row = a->getHeight(); + a_trans = false; + } else { + a_col = a->getHeight(); + a_row = a->getWidth(); + a_trans = true; + } + if (!b->isTransposed()) { + b_col = b->getWidth(); + b_row = b->getHeight(); + b_trans = false; + } else { + b_col = b->getHeight(); + b_row = b->getWidth(); + b_trans = true; + } + + CHECK_EQ(a_col, b_row); + CHECK_EQ(a_row, getHeight()); + CHECK_EQ(b_col, getWidth()); + + real* A = a->getData(); + real* B = b->getData(); + real* C = getData(); + + int M = getHeight(); + int N = getWidth(); + int K = a_col; + int lda = a->getStride(); + int ldb = b->getStride(); + int ldc = getStride(); + BlasGemm::compute( + a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); +} + +void CpuMatrix::mul( + CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) { + CHECK(!c->isTransposed()) << "Not supported"; + CHECK_EQ(c->getValueType(), FLOAT_VALUE); + + real* A = a->getData(); + real* B = b->getData(); + real* C = c->getValue(); + int* rows = c->getRows(); + int* cols = c->getCols(); + size_t height = c->getHeight(); + size_t width = c->getWidth(); + 
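+  // Note: this overload fills a sparse result. Only the entries already
+  // present in c's sparsity pattern are computed; for each stored element at
+  // (row i, col j) it evaluates
+  //     C[k] = scaleAB * dot(A.row(i), B.col(j)) + scaleT * C[k],
+  // walking the pattern column-by-column for SPARSE_CSC and row-by-row for
+  // SPARSE_CSR.  The branches below cover a*b, a^T*b and a*b^T; any other
+  // transpose combination hits LOG(FATAL).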
if (scaleT == 0) { + c->zeroMem(); + } + + if (!a->isTransposed() && !b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getWidth(), width); + if (c->getFormat() == SPARSE_CSC) { + for (size_t i = 0; i < width; i++) { + size_t start = c->getColStartIdx(i); + size_t end = c->getColStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t rowIdx = rows[j]; + for (size_t k = 0; k < m; k++) { + sum += A[rowIdx * m + k] * B[k * width + i]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } else { + for (size_t i = 0; i < height; i++) { + size_t start = c->getRowStartIdx(i); + size_t end = c->getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += A[i * m + k] * B[k * width + colIdx]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } + } else if (a->isTransposed() && !b->isTransposed()) { + size_t m = a->getHeight(); + CHECK_EQ(m, b->getHeight()); + CHECK_EQ(b->getWidth(), width); + CHECK_EQ(a->getWidth(), height); + + if (c->getFormat() == SPARSE_CSC) { + for (size_t i = 0; i < width; i++) { + size_t start = c->getColStartIdx(i); + size_t end = c->getColStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t rowIdx = rows[j]; + for (size_t k = 0; k < m; k++) { + sum += A[k * height + rowIdx] * B[k * width + i]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } else { + for (size_t i = 0; i < height; i++) { + int start = c->getRowStartIdx(i); + int end = c->getRowStartIdx(i + 1); + for (int j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += A[k * height + i] * B[k * width + colIdx]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } + } else if (!a->isTransposed() && b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getWidth(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getHeight(), width); + if (c->getFormat() == SPARSE_CSR) { + for (size_t i = 0; i < height; i++) { + size_t start = c->getRowStartIdx(i); + size_t end = c->getRowStartIdx(i + 1); + for (size_t j = start; j < end; j++) { + real sum = 0; + size_t colIdx = cols[j]; + for (size_t k = 0; k < m; k++) { + sum += A[i * m + k] * B[colIdx * m + k]; + } + C[j] = scaleAB * sum + scaleT * C[j]; + } + } + } else { + LOG(FATAL) << "Not supported csc format " + "when a is not trans and b is trans"; + } + } else { + LOG(FATAL) << "Not supported"; + } +} + +void CpuMatrix::mul(CpuMatrix* a, + CpuSparseMatrix* b, + real scaleAB, + real scaleT) { + CHECK(!trans_) << "Not supported"; + CHECK(!a->isTransposed()) << "Not supported"; + CHECK(scaleT == 0 || scaleT == 1); + + // TODO(yuyang18): Maybe bug implementation here + CHECK_EQ(scaleAB, static_cast(1.0)); + + real* A = a->getData(); + real* B = b->getValue(); + real* C = getData(); + int* rows = b->getRows(); + int* cols = b->getCols(); + + if (scaleT == 0) { + zeroMem(); + } + if (b->getFormat() == SPARSE_CSC) { + if (!b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), width_); + + if (b->getValueType() == NO_VALUE) { + for (size_t j = 0; j < b->getWidth(); ++j) { + int start = b->getColStartIdx(j); + int end = b->getColStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() 
== FLOAT_VALUE) { + for (size_t j = 0; j < b->getWidth(); ++j) { + int start = b->getColStartIdx(j); + int end = b->getColStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo( + C + j, A + rows[i], B[i], height_, width_, a->getWidth()); + } + } + } + } else /*if (b->isTransposed())*/ { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), width_); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), m); + if (b->getValueType() == NO_VALUE) { + for (size_t i = 0; i < b->getWidth(); ++i) { + int start = b->getColStartIdx(i); + int end = b->getColStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < b->getWidth(); ++i) { + int start = b->getColStartIdx(i); + int end = b->getColStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo( + C + rows[j], A + i, B[j], height_, width_, a->getWidth()); + } + } + } + } + } else { + if (!b->isTransposed()) { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), width_); + + if (b->getValueType() == NO_VALUE) { + for (size_t j = 0; j < b->getHeight(); ++j) { + int start = b->getRowStartIdx(j); + int end = b->getRowStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) { + for (size_t j = 0; j < b->getHeight(); ++j) { + int start = b->getRowStartIdx(j); + int end = b->getRowStartIdx(j + 1); + for (int i = start; i < end; ++i) { + colVecAddTo( + C + cols[i], A + j, B[i], height_, width_, a->getWidth()); + } + } + } + } else /*if (b->isTransposed())*/ { + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), width_); + CHECK_EQ(a->getHeight(), height_); + CHECK_EQ(b->getWidth(), m); + if (b->getValueType() == NO_VALUE) { + for (size_t i = 0; i < b->getHeight(); ++i) { + int start = b->getRowStartIdx(i); + int end = b->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth()); + } + } + } else if (b->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < b->getHeight(); ++i) { + int start = b->getRowStartIdx(i); + int end = b->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + colVecAddTo( + C + i, A + cols[j], B[j], height_, width_, a->getWidth()); + } + } + } + } + } +} + +void CpuMatrix::selectRows(Matrix& table, IVector& ids) { + if (dynamic_cast(&table)) { + selectRowsImp(*dynamic_cast(&table), ids); + } else if (dynamic_cast(&table)) { + selectRowsImp(*dynamic_cast(&table), ids); + } else { + CHECK(table.isContiguous()); + selectRowsImp(*dynamic_cast(&table), ids); + } +} + +void CpuMatrix::selectElements(Matrix& table, IVector& ids) { + CHECK_EQ(table.getHeight(), ids.getSize()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), 1U); + real* tableData = table.getData(); + int* idsData = ids.getData(); + for (size_t i = 0; i < table.getHeight(); i++) { + data_[i] += tableData[i * table.getWidth() + idsData[i]]; + } +} + +void CpuMatrix::addElements(Matrix& table, IVector& ids) { + CHECK_EQ(table.getHeight(), ids.getSize()); + CHECK_EQ(getHeight(), ids.getSize()); + CHECK_EQ(getWidth(), 1U); + real* tableData = table.getData(); + int* idsData = ids.getData(); + for (size_t i = 0; i < table.getHeight(); i++) { + tableData[i * table.getWidth() + idsData[i]] += data_[i]; + } +} + +// 
this.row[i] += table.row[ids[i]]
+template <typename TableMatType>
+void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) {
+  CHECK(!table.useGpu());
+  CHECK(!ids.useGpu());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), table.getWidth());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  real* a = getData();
+  size_t tableSize = table.getHeight();
+  int* index = ids.getData();
+
+  for (size_t i = 0; i < numSamples; ++i) {
+    if (index[i] == -1) continue;
+    CHECK_LT(index[i], (int)tableSize);
+    CHECK_GE(index[i], 0);
+    vecAddTo(a + i * stride_, table.getRow(index[i]), dim);
+  }
+}
+
+void CpuMatrix::addToRows(Matrix& table, IVector& ids) {
+  if (dynamic_cast(&table)) {
+    addToRowsImp(*dynamic_cast(&table), ids);
+  } else if (dynamic_cast(&table)) {
+    addToRowsImp(*dynamic_cast(&table), ids);
+  } else if (dynamic_cast(&table)) {
+    addToRowsImp(*dynamic_cast(&table), ids);
+  } else {
+    CHECK(table.isContiguous());
+    addToRowsImp(*dynamic_cast(&table), ids);
+  }
+}
+
+// table.row[ids[i]] += this.row[i]
+template <typename TableMatType>
+void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) {
+  CHECK(!table.useGpu());
+  CHECK(!ids.useGpu());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), table.getWidth());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  real* a = getData();
+  size_t tableSize = table.getHeight();
+  int* index = ids.getData();
+
+  for (size_t i = 0; i < numSamples; ++i) {
+    if (index[i] == -1) continue;
+    CHECK_LT(index[i], (int)tableSize);
+    CHECK_GE(index[i], 0);
+    vecAddTo(table.getRow(index[i]), a + i * stride_, dim);
+  }
+}
+
+static ThreadLocal<std::vector<const real*>> threadLocalColArray;
+
+template <typename MatBType, typename MatCType>
+void CpuMatrix::mul(
+    CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) {
+  CHECK(!c->isTransposed()) << "Not supported";
+  CHECK(!b->isTransposed()) << "Not supported";
+  // TODO(yuyang18): Maybe bug implementation here.
+  CHECK(scaleAB == 1) << "Not supported";
+  CHECK(scaleT == 0 || scaleT == 1) << "Not supported";
+  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported";
+
+  real* B = b->getData();
+  real* C = c->getData();
+  size_t height = c->getHeight();
+  size_t width = c->getWidth();
+  int* cols = a->getCols();
+  real* values = a->getValue();
+
+  if (scaleT == 0) {
+    c->zeroMem();
+  }
+
+  if (!a->isTransposed()) {
+    size_t m = a->getWidth();
+    CHECK_EQ(b->getHeight(), m);
+    CHECK_EQ(a->getHeight(), height);
+    CHECK_EQ(b->getWidth(), width);
+
+    if (a->getValueType() == NO_VALUE) {
+      if (width % 32 == 0) {  // use libaddto
+        // @TODO(yuyang18) Make input addr can be unaligned.
+ // So merge this if and else + CHECK_EQ((size_t)B % 32, 0UL); + CHECK_EQ((size_t)C % 32, 0UL); + auto& colArray = *threadLocalColArray; + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + size_t colNum = end - start; + colArray.resize(colNum); + for (int j = 0; j < end - start; ++j) { + colArray[j] = b->getRow(cols[j + start]); + } + simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width); + } + + } else { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(i), b->getRow(cols[j]), width); + } + } + } + } else if (a->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width); + } + } + } + } else /*if (a->isTransposed())*/ { + size_t m = a->getHeight(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getWidth(), height); + CHECK_EQ(b->getWidth(), width); + if (a->getValueType() == NO_VALUE) { + if (width % 32 == 0) { // use libaddto + // @TODO(yuyang18) Make input addr can be unaligned. + // So merge this if and else + CHECK_EQ((size_t)B % 32, 0UL); + CHECK_EQ((size_t)C % 32, 0UL); + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + simd::addTo(c->getRow(cols[j]), b->getRow(i), width); + } + } + + } else { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(cols[j]), b->getRow(i), width); + } + } + } + } else if (a->getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < a->getHeight(); ++i) { + const int start = a->getRowStartIdx(i); + const int end = a->getRowStartIdx(i + 1); + for (int j = start; j < end; ++j) { + vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width); + } + } + } + } +} + +// instantiation mul() called in SparseRowMatrix.cpp +template void CpuMatrix::mul( + CpuSparseMatrix* a, + CpuMatrix* b, + SparseRowCpuMatrix* c, + real scaleAB, + real scaleT); +template void CpuMatrix::mul( + CpuSparseMatrix* a, + CpuMatrix* b, + SparseAutoGrowRowCpuMatrix* c, + real scaleAB, + real scaleT); +template void CpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + CacheRowCpuMatrix* c, + real scaleAB, + real scaleT); + +#ifndef PADDLE_MOBILE_INFERENCE +void SharedCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CHECK(!isTransposed()) << "Not supported"; + CHECK(!b->isTransposed()) << "Not supported"; + CHECK_EQ(scaleAB, 1) << "Not supported"; + CHECK_EQ(scaleT, 1) << "Not supported"; + CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported"; + + real* B = b->getData(); + real* C = getData(); + size_t height = getHeight(); + size_t width = getWidth(); + + // get real trans + MatrixPtr aTrans; + if (a->isTransposed()) { + aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight()); + a->transpose(aTrans, false); + } + a = dynamic_cast(aTrans.get()); + + size_t m = a->getWidth(); + CHECK_EQ(b->getHeight(), m); + CHECK_EQ(a->getHeight(), height); + CHECK_EQ(b->getWidth(), width); + + size_t blockSize = (height / blockNum_) + 1; + CpuMatrixPtr localBuf = 
*localBuf_;
+  if (!localBuf) {
+    localBuf = std::make_shared<CpuMatrix>(blockSize, width);
+  } else {
+    localBuf->resize(blockSize, width);
+  }
+  localBuf->zeroMem();
+  real* localC = localBuf->getData();
+  std::vector<int>& blockSeq = *blockSeq_;
+  if (blockSeq.size() == 0) {
+    for (int k = 0; k < blockNum_; ++k) {
+      blockSeq.push_back(k);
+    }
+    std::shuffle(
+        blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get());
+  }
+  std::vector<int>& localBufRows = *localBufRows_;
+  int* cols = a->getCols();
+  real* value = a->getValue();
+
+  for (int k = 0; k < blockNum_; ++k) {
+    int blockId = blockSeq[k];
+    size_t blockBegin = blockId * blockSize;
+    size_t blockEnd = (blockId + 1) * blockSize;
+    if (blockId == blockNum_ - 1) {
+      blockEnd = height;
+    }
+    if (a->getValueType() == NO_VALUE) {
+      for (size_t i = blockBegin; i < blockEnd; ++i) {
+        int start = a->getRowStartIdx(i);
+        int end = a->getRowStartIdx(i + 1);
+        size_t colNum = a->getColNum(i);
+        if (colNum == 0) {
+          continue;
+        }  // skip empty row
+        localBufRows.push_back(i);
+        size_t bufPos = localBufRows.size() - 1;
+        for (int j = start; j < end; ++j) {
+          vecAddTo(localC + bufPos * width, B + cols[j] * width, width);
+        }
+      }
+    } else if (a->getValueType() == FLOAT_VALUE) {
+      for (size_t i = blockBegin; i < blockEnd; ++i) {
+        int start = a->getRowStartIdx(i);
+        int end = a->getRowStartIdx(i + 1);
+        size_t colNum = a->getColNum(i);
+        if (colNum == 0) {
+          continue;
+        }  // skip empty row
+        localBufRows.push_back(i);
+        size_t bufPos = localBufRows.size() - 1;
+        for (int j = start; j < end; ++j) {
+          vecAddTo(
+              localC + bufPos * width, B + cols[j] * width, value[j], width);
+        }
+      }
+    }
+
+    {
+      std::lock_guard<std::mutex> guard(*blockLocks_[blockId]);
+      for (size_t i = 0; i < localBufRows.size(); ++i) {
+        vecAddTo(C + localBufRows[i] * width, localC + i * width, width);
+      }
+    }
+    memset(localC, 0, localBufRows.size() * width * sizeof(real));
+    localBufRows.clear();
+  }
+
+  VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0]
+          << " C[1]=" << C[1];
+}
+
+void SharedCpuMatrix::add(Matrix& b, real p1, real p2) {
+  CHECK_EQ(blockNum_, 1);
+  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
+  CpuMatrix::add(b, p1, p2);
+}
+
+void SharedCpuMatrix::add(real p1, real p2) {
+  CHECK_EQ(blockNum_, 1);
+  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
+  CpuMatrix::add(p1, p2);
+}
+
+void SharedCpuMatrix::initShared(int blockNum) {
+  CHECK_GT(height_ * width_, 1UL * 1024 * 1024)
+      << "should not share small matrix";
+  initBlock(blockNum);
+}
+
+void SharedCpuMatrix::initBlock(int blockNum) {
+  CHECK_LE(blockNum, 200) << "should not use large block number";
+  blockNum_ = blockNum;
+  blockLocks_.resize(blockNum);
+  for (auto& locker : blockLocks_) {
+    locker.reset(new std::mutex);
+  }
+}
+
+#endif
+/* Add a (column) vector b to matrix a, column by column */
+void CpuMatrix::addColumnVector(const Matrix& b) {
+  BaseMatrix::addColVector(const_cast<Matrix&>(b));
+}
+
+/* this = a*b */
+void CpuMatrix::mul(const Matrix& a, const Matrix& b) {
+  return mul(a, b, 1.0, 0.0);
+}
+
+/* this = scaleAB*(this*b) + scaleT*this */
+void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
+  (void)b;
+  (void)scaleAB;
+  (void)scaleT;
+  LOG(FATAL) << "Not implemented";
+}
+
+/* this = this * b */
+void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); }
+
+/* this = scaleAB*(a*this) + scaleT*this */
+void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
+  (void)a;
+  (void)scaleAB;
+  (void)scaleT;
+  LOG(FATAL) << "Not implemented";
+}
+
+/* this = a*this */
+void CpuMatrix::leftMul(Matrix& a) 
{ return leftMul(a, 1.0, 0.0); } + +void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); } + +void CpuMatrix::rowSum(Matrix& sum) { + CHECK_EQ(sum.getHeight(), getHeight()); + CHECK_EQ(sum.getWidth(), (size_t)1); + + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); +} + +void CpuMatrix::rowMaxId(IVector& maxIds) { + CHECK(!maxIds.useGpu()) << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + size_t dim = getWidth(); + + for (size_t i = 0; i < numSamples; i++) { + real sm = a[i * dim]; + int maxId = 0; + for (size_t j = 1; j < dim; j++) { + if (a[i * dim + j] > sm) { + maxId = j; + sm = a[i * dim + j]; + } + } + s[i] = maxId; + } +} + +void CpuMatrix::rowMax(Matrix& max) { + CHECK_EQ(max.getHeight(), getHeight()); + CHECK_EQ(max.getWidth(), (size_t)1); + max.maxRows(*this); +} + +/* Get the top k elements of each row of this matrix */ +void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(maxVal.getWidth(), beam); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getWidth(); + for (size_t i = 0; i < numSamples; i++) { + std::vector> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair(a[i * dim + j], j)); + } + + std::partial_sort( + vec.begin(), + vec.begin() + beam, + vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i * beam + j] = vec[j].first; + s[i * beam + j] = vec[j].second; + } + } +} + +void CpuMatrix::colMax(Matrix& max) { + CHECK_EQ(max.getWidth(), getWidth()); + CHECK_EQ(max.getHeight(), (size_t)1); + max.maxCols(*this); +} + +void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getWidth(); + size_t beam = maxVal.getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getWidth(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getHeight(); + for (size_t i = 0; i < numSamples; i++) { + std::vector> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair(a[i + j * numSamples], j)); + } + + std::partial_sort( + vec.begin(), + vec.begin() + beam, + vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i + j * numSamples] = vec[j].first; + s[i + j * numSamples] = vec[j].second; + } + } +} + +void CpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + const real* input = a.getData(); + int* idForCpu = id.getData(); + + MatrixPtr maxInMat, maxOutMat; + Matrix::resizeOrCreate(maxInMat, groups, size, false, false); + Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + IVectorPtr tmpId = 
IVector::create(idForCpu + newIndex, size, false); + + for (size_t i = 0; i < channels; ++i) { + size_t newFeatLen = i * featLen; + for (size_t j = 0; j < groups; ++j) { + maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) + ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, + featLen); + } + } + maxInMat->colMax(*tmpId, *maxOutMat); + this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); + } +} + +void CpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + size_t newFeatLen = groups * featLen; + real* inputG = getData(); + const real* outG = a.getData(); + int* idForCpu = id.getData(); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + int* idData = idForCpu + newIndex; + + for (size_t i = 0; i < size; ++i) { + int gradIdx = + idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; + (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; + } + } +} + +void CpuMatrix::rowNormalizeL1(Matrix& out) { + CHECK(!out.useGpu()); + + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(out.getHeight(), numSamples); + CHECK_EQ(out.getWidth(), dim); + real* a = getData(); + real* b = out.getData(); + for (size_t i = 0; i < numSamples; ++i) { + real s = 0; + for (size_t j = 0; j < dim; ++j) { + s += a[i * dim + j]; + } + // Right now, we just bet that sum won't be zero. If this really happens, + // we will figure out what should be done then. + CHECK_GT(s, 0); + s = 1 / s; + for (size_t j = 0; j < dim; ++j) { + b[i * dim + j] = s * a[i * dim + j]; + } + } +} + +/* calulate classification error */ +void CpuMatrix::classificationError(Matrix& output, + IVector& label, + size_t topkSize) { + size_t numSamples = this->getHeight(); + auto cpuOutput = dynamic_cast(&output); + auto cpuLabel = dynamic_cast(&label); + IVectorPtr cpuTopIds = std::make_shared(numSamples * topkSize); + MatrixPtr cpuTopVal = std::make_shared(numSamples, topkSize); + + CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer"; + CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed"; + CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal"; + CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1) + << "Matrix dimensions are not equal"; + + // top k matrix classification + cpuOutput->rowMax(*cpuTopIds, *cpuTopVal); + + size_t dim = cpuOutput->getWidth(); + real* result = this->getData(); + int* ids = cpuTopIds->getData(); + int* lbl = cpuLabel->getData(); + for (size_t i = 0; i < numSamples; ++i) { + CHECK_GE(lbl[i], 0); + CHECK_LT((size_t)lbl[i], dim); + + for (size_t j = 0; j < topkSize; ++j) { + if (ids[j + i * topkSize] == lbl[i]) { + result[i] = 0; + break; + } + result[i] = 1.0f; + } + } +} + +/* copy -log(output[label]) to this->data[i] */ +void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { + CHECK(dynamic_cast(&output)); + CHECK(dynamic_cast(&label)); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getSize(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(getWidth(), (size_t)1); + + real* out = output.getData(); + real* cost = getData(); + int* lbl = label.getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim) { + CHECK_GE(lbl[i], 0); + 
CHECK_LT((size_t)lbl[i], dim); + cost[i] = -std::log(out[lbl[i]]); + } +} + +/* calculate the error of outputV according to label */ +void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) { + CHECK(dynamic_cast(&output)); + CHECK(dynamic_cast(&label)); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getWidth(), dim); + real* out = output.getData(); + real* grad = getData(); + int* lbl = label.getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { + grad[lbl[i]] -= 1 / out[lbl[i]]; + } +} + +/* + We implement the matrix functionality in CostLayer.cpp, + but we define the scalar function here for sanity check + deletion of the function does not affect anything neverthelss +*/ +void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + CHECK(dynamic_cast(&output)); + CHECK(dynamic_cast(&label)); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getSize(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(getWidth(), (size_t)1); + + real* out = output.getData(); + real* cost = getData(); + int* lbl = label.getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim) { + CHECK_GE(lbl[i], 0); + CHECK_LT((size_t)lbl[i], dim); + real sum = 0; + for (size_t j = 0; j < dim; ++j) { + sum += out[j]; + } + sum = _safelog(sum); + cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum); + } +} + +/* + We implement the matrix functionality in CostLayer.cpp, + but we define the scalar function here for sanity check + deletion of the function does not affect anything neverthelss +*/ +void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, + IVector& label, + real alpha) { + CHECK(dynamic_cast(&output)); + CHECK(dynamic_cast(&label)); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getWidth(), dim); + real* out = output.getData(); + real* grad = getData(); + int* lbl = label.getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { + grad[lbl[i]] -= 1 / out[lbl[i]]; + real sum = 0; + for (size_t j = 0; j < dim; ++j) { + sum += out[j]; + } + for (size_t j = 0; j < dim; ++j) { + if (j == (size_t)lbl[i]) { + grad[j] += -1 / out[j]; + } + grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum; + } + } +} + +#define FORWARD_LOOP() \ + size_t numSamples = getHeight(); \ + size_t dim = getWidth(); \ + CHECK_EQ(output.getHeight(), numSamples); \ + CHECK_EQ(output.getWidth(), dim); \ + const real* in = getData(); \ + real* out = output.getData(); \ + for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim) + +#define BACKWARD_LOOP() \ + size_t numSamples = getHeight(); \ + size_t dim = getWidth(); \ + CHECK_EQ(output.getHeight(), numSamples); \ + CHECK_EQ(output.getWidth(), dim); \ + real* grad = getData(); \ + real* out = output.getData(); \ + for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim) + +void CpuMatrix::softmax(Matrix& output) { + CHECK(!output.useGpu()); + + const float THRESHOLD = -64.0; + + FORWARD_LOOP() { + real max = -1.0e20; + for (size_t j = 0; j < dim; ++j) { + if (in[j] > max) { + max = in[j]; + } + } + for (size_t j = 0; j < dim; ++j) { + real a = in[j] - max; + if (a < THRESHOLD) { + a = THRESHOLD; + } + out[j] = a; + } + vExp(dim, out, out); + + real sum = 0; + for (size_t j = 0; j < dim; ++j) { + sum += out[j]; + } + sum = 1 / sum; + for (size_t j = 0; j < dim; ++j) { + out[j] *= sum; + } + } +} + +void CpuMatrix::sequenceSoftmax(Matrix& 
output, const IVector& index) { + CHECK_EQ(getWidth(), 1UL); + CHECK_EQ(output.getWidth(), 1UL); + CHECK(isContiguous()); + + MatrixPtr inTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); + MatrixPtr outTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); + size_t numSequences = index.getSize() - 1; + auto starts = index.getData(); + for (size_t i = 0; i < numSequences; ++i) { + size_t offset = starts[i]; + size_t size = starts[i + 1] - starts[i]; + inTmp->setData(getData() + offset, 1UL, size); + outTmp->setData(output.getData() + offset, 1UL, size); + inTmp->softmax(*outTmp); + } +} + +void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + CHECK(output.useGpu_ == false) << "Matrix type are not equal"; + CHECK_EQ(getHeight(), sftmaxSum.getHeight()); + + real* sums = sftmaxSum.getData(); + + BACKWARD_LOOP() { + real sum = sums[i]; + for (size_t j = 0; j < dim; ++j) { + grad[j] = out[j] * (grad[j] - sum); + } + } +} + +void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), (size_t)1); + real* out = output.getData(); + real* cost = getData(); + + auto labelptr = dynamic_cast(&label); + if (labelptr) { + // it is a CpuSparseMatrix + if (labelptr->getFormat() == SPARSE_CSR) { + // treat label as a SparseMatrix + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = 0; j < dim; ++j) { + cost[i] += _square(out[i * dim + j]); + } + } + if (labelptr->getValueType() == NO_VALUE) { + int* cols = labelptr->getCols(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]]; + /* + * explanation of above line: original codes are follows: + * cost[i] -= _square(out[i * dim + feature.col]); + * cost[i] += _square(1.0 - out[i * dim + feature.col]); + */ + } + } + } else if (labelptr->getValueType() == FLOAT_VALUE) { + int* cols = labelptr->getCols(); + real* values = labelptr->getValue(); + for (size_t i = 0; i < numSamples; ++i) { + real sum1 = 0; + real sum2 = 0; + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + sum1 += values[j] * values[j]; + sum2 += values[j] * out[i * dim + cols[j]]; + /* + * explanation of above line: original codes are follows: + * cost[i] -= _square(out[i * dim + feature.col]); + * cost[i] += _square(value.col - out[i * dim + feature.col]); + */ + } + cost[i] += sum1 - 2.0 * sum2; + } + } else { + LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; + return; + } + return; + } else { + LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; + return; + } + } + + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); +} + +/* calculate the error of outputV according to label */ +void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getWidth(), dim); + CHECK_EQ(label.getWidth(), dim); + + real* out = output.getData(); + real* grad = getData(); + + auto labelptr = 
dynamic_cast(&label); + if (labelptr) { + // it is a CpuSparseMatrix + if (labelptr->getFormat() == SPARSE_CSR) { + // treat label as a SparseMatrix + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = 0; j < dim; ++j) { + grad[i * dim + j] += 2.0 * out[i * dim + j]; + } + } + if (labelptr->getValueType() == NO_VALUE) { + int* cols = labelptr->getCols(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + grad[i * dim + cols[j]] -= 2.0; + /* + * explanation of above line: original codes are follows: + * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; + * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] + * - 1); + */ + } + } + } else if (labelptr->getValueType() == FLOAT_VALUE) { + int* cols = labelptr->getCols(); + real* values = labelptr->getValue(); + for (size_t i = 0; i < numSamples; ++i) { + for (size_t j = labelptr->getRowStartIdx(i); + j < labelptr->getRowStartIdx(i + 1); + ++j) { + grad[i * dim + cols[j]] -= 2.0 * values[j]; + /* + * explanation of above line: original codes are follows: + * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; + * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] + * - value.col); + */ + } + } + } else { + LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; + return; + } + return; + } else { + LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; + return; + } + } + + real* lbl = label.getData(); + size_t ld = getStride(); + size_t outLd = output.getStride(); + size_t lblLd = label.getStride(); + CHECK(lbl); + for (size_t i = 0; i < numSamples; + ++i, out += outLd, lbl += lblLd, grad += ld) { + for (size_t j = 0; j < dim; ++j) { + grad[j] += 2.0 * (out[j] - lbl[j]); // positive gradient; + } + } +} + +void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), (size_t)1); + + real* cost = getData(); + real* out = output.getData(); + real* lbl = label.getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + real absVal = std::fabs(out[j] - lbl[j]); + cost[i] *= destScale; + if (absVal < 1.0) + cost[i] += 0.5 * absVal * absVal; + else + cost[i] += absVal - 0.5; + } + } +} + +void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), dim); + + real* out = output.getData(); + real* lbl = label.getData(); + real* grad = getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + real val = out[j] - lbl[j]; + grad[j] *= destScale; + if (std::fabs(val) < 1) { + grad[j] += val; + } else { + grad[j] += (real(0) < val) - (val < real(0)); + } + } + } +} + +void CpuMatrix::tanh(Matrix& output) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + size_t numSamples = getHeight(); + size_t dim = 
getWidth(); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), dim); + vTanh(numSamples * dim, getData(), output.getData()); +} + +void CpuMatrix::tanhDerivative(Matrix& output) { + BaseMatrix::tanhDerivative(output); +} + +void CpuMatrix::softrelu(Matrix& output) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + const real THRESHOLD = 40.0; + FORWARD_LOOP() { // TODO(yuyang18): SIMD it? + for (size_t j = 0; j < dim; ++j) { + real x = in[j]; + if (x > THRESHOLD) { + x = THRESHOLD; + } else if (x < -THRESHOLD) { + x = -THRESHOLD; + } + out[j] = x; + } + } + vExp(numSamples * dim, output.getData(), output.getData()); + vLog1p(numSamples * dim, output.getData(), output.getData()); +} + +void CpuMatrix::softreluDerivative(Matrix& output) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + size_t size = numSamples * dim; + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), dim); + real* grad = getData(); + MatrixPtr tmpMat = Matrix::create(numSamples, dim); + real* tmp = tmpMat->getData(); + + vExp(size, output.getData(), tmpMat->getData()); + + for (size_t i = 0; i < size; ++i) { + grad[i] *= (1.0 - 1.0 / tmp[i]); + } +} + +void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { + CHECK(isContiguous()); + CHECK(output.isContiguous()); + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(output.getWidth(), dim); + + const real* in = getData(); + real* out = output.getData(); + + // out = p2*in + for (size_t i = 0; i < numSamples * dim; ++i) { + out[i] = p2 * in[i]; + } + + vTanh(numSamples * dim, out, out); + + // out = p1 * out + for (size_t i = 0; i < numSamples * dim; ++i) { + out[i] = p1 * out[i]; + } +} + +/* uniform randomization, minimize precision = 1e-5 */ +void CpuMatrix::randomizeUniform() { + CHECK(isContiguous()); + real* data = getData(); + unsigned int* randSeed = ThreadLocalRand::getSeed(); + real recipRandMax = 1.0f / (real)RAND_MAX; + for (size_t i = 0; i < elementCnt_; ++i) { + *data++ = rand_r(randSeed) * recipRandMax; + } +} + +void CpuMatrix::print(std::ostream& os) const { + CHECK(isContiguous()); + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + os << data_[i * width_ + j] << " "; + } + os << std::endl; + } +} + +void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { + real* input = data.getData(); + real* w = W.getData(); + real* output = data_; + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + + size_t partial_sum = numElements / paraSize; + if (paraSize == numElements) { + for (size_t n = 0; n < numSamples * numElements; ++n) { + output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; + } + return; + } + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + for (size_t n = 0; n < numSamples; ++n) { + for (size_t i = 0; i < paraSize; i++) { + neon::prelu( + input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); + } + input = input + numElements; + output = output + numElements; + } +#else + for (size_t n = 0, k = 0; n < numSamples; ++n) { + for (size_t i = 0; i < numElements; ++i, ++k) { + output[k] = input[k] > 0 ? 
input[k] : input[k] * w[i / partial_sum]; + } + } +#endif +} + +void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { + real* ograd = oGrad.getData(); + real* input = data.getData(); + real* wgrad = data_; + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = this->getHeight() * this->getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + for (size_t n = 0, k = 0; n < numSamples; ++n) { + for (size_t i = 0; i < numElements; ++i, ++k) { + wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]); + } + } +} + +void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + real* diff = data_; + real* input = data.getData(); + real* ograd = oGrad.getData(); + real* w = W.getData(); + size_t numElements = data.getWidth(); + size_t numSamples = data.getHeight(); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + for (size_t n = 0, k = 0; n < numSamples; ++n) { + for (size_t i = 0; i < numElements; ++i, ++k) { + diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]); + } + } +} + +void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const { + CHECK(isContiguous()); + size_t h = height_ < height ? height_ : height; + size_t w = width_ < width ? width_ : width; + os.setf(std::ostream::scientific); + os << "["; + for (size_t i = 0; i < h; ++i) { + for (size_t j = 0; j < w; ++j) { + os << data_[i * width_ + j] << " "; + } + if (i == h - 1) { + os << "]"; + } + os << std::endl; + } +} + +void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const { + CHECK_LT(idx, height_); + size_t offset = idx * stride_; + os << data_[offset]; + for (size_t i = 1; i < width_; ++i) { + os << " " << data_[offset + i]; + } + os << ";"; +} + +void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { + CHECK(isContiguous()); + CHECK(height_ == refMat.getHeight()); + CHECK(width_ == refMat.getWidth()); + CpuMatrix cpuRef(height_, width_); + cpuRef.copyFrom(refMat); + size_t diffCnt = 0; + for (size_t i = 0; i < height_; ++i) { + for (size_t j = 0; j < width_; ++j) { + real a = getElement(i, j); + real b = cpuRef.getElement(i, j); + if (fabs(a - b) > 0.00001) { + ++diffCnt; + if (printDiff) { + os << "ref= " << a << " check= " << b << std::endl; + } + } + } + } + LOG(INFO) << "the diffCnt is " << diffCnt; +} + +real CpuMatrix::getMin() { + size_t size = getHeight() * getWidth(); + real* data = getData(); + real res = data[0]; + for (size_t i = 1; i < size; ++i) { + if (res > data[i]) { + res = data[i]; + } + } + return res; +} + +real CpuMatrix::getMax() { + size_t size = getHeight() * getWidth(); + real* data = getData(); + real res = data[0]; + for (size_t i = 1; i < size; ++i) { + if (res < data[i]) { + res = data[i]; + } + } + return res; +} + +void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) { + size_t height = this->getHeight(); + size_t width0 = this->getWidth(); + size_t width1 = in1.getWidth(); + + CHECK_EQ(height, in0.getHeight()); + CHECK_EQ(width0, in0.getWidth()); + CHECK_EQ(height, in1.getHeight()); + + CHECK_EQ(width1 % 2, 1U); + + real* outV = this->getData(); + real* inV0 = in0.getData(); + real* inV1 = in1.getData(); + + int leftCtxLen = (width1 - 1) / 2; + for (size_t x = 0; x < height; + ++x, outV += width0, inV0 += width0, inV1 += width1) { + for (size_t i = 
0; i < width0; ++i) { // each dimension of output + for (size_t j = 0; j < width1; ++j) { + // iterate over all dimentions of inV1 + int index = i + j - leftCtxLen; + index = (index + width0) % width0; + outV[i] += inV0[index] * inV1[j]; + } + } + } +} + +void CpuMatrix::circularConvDerivative( + Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) { + size_t height = in0.getHeight(); + size_t width0 = in0.getWidth(); + size_t width1 = in1.getWidth(); + + CHECK_EQ(height, in1.getHeight()); + CHECK_EQ(height, inG0.getHeight()); + CHECK_EQ(width0, inG0.getWidth()); + CHECK_EQ(height, inG1.getHeight()); + CHECK_EQ(width1, inG1.getWidth()); + CHECK_EQ(height, outG.getHeight()); + CHECK_EQ(width0, outG.getWidth()); + + real* outGV = outG.getData(); + real* inV0 = in0.getData(); + real* inV1 = in1.getData(); + real* inGV0 = inG0.getData(); + real* inGV1 = inG1.getData(); + + int leftCtxLen = (width1 - 1) / 2; + for (size_t x = 0; x < height; ++x, + outGV += width0, + inV0 += width0, + inV1 += width1, + inGV0 += width0, + inGV1 += width1) { + for (size_t j = 0; j < width1; ++j) { // iterate over width1 + for (size_t i = 0; i < width0; ++i) { + // such over all dimensions of outG + int index = i + j - leftCtxLen; + index = (index + width0) % width0; + inGV0[index] += outGV[i] * inV1[j]; + inGV1[j] += outGV[i] * inV0[index]; + } + } + } +} + +void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { + CHECK(dynamic_cast(&output)); + auto labelPtr = dynamic_cast(&label); + CHECK(labelPtr); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(numSamples, output.getHeight()); + CHECK_EQ(numSamples, labelPtr->getHeight()); + CHECK_EQ(dim, labelPtr->getWidth()); + + real* out = output.getData(); + real* cost = getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim) { + for (size_t j = 0; j < dim; ++j) { + CHECK(out[j] > 0 && out[j] < 1.0); + cost[i] -= std::log(1 - out[j]); + } + + const int* cols = labelPtr->getRowCols(i); + for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]])); + } + } +} + +void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + CHECK(dynamic_cast(&output)); + auto labelPtr = dynamic_cast(&label); + CHECK(labelPtr); + + size_t numSamples = getHeight(); + size_t dim = getWidth(); + CHECK_EQ(numSamples, output.getHeight()); + CHECK_EQ(numSamples, labelPtr->getHeight()); + CHECK_EQ(dim, output.getWidth()); + CHECK_EQ(dim, labelPtr->getWidth()); + + real* out = output.getData(); + real* grad = getData(); + for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { + for (size_t j = 0; j < dim; ++j) { + CHECK(out[j] > 0 && out[j] < 1.0); + grad[j] += 1.0 / (1 - out[j]); + } + + const int* cols = labelPtr->getRowCols(i); + for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]])); + } + } +} + +/* calculate the classification error for multi binary label */ +void CpuMatrix::classificationErrorMulti(Matrix& output, + Matrix& label, + real threshold) { + CHECK(dynamic_cast(&output)); + auto labelPtr = dynamic_cast(&label); + CHECK(labelPtr); + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(numSamples, output.getHeight()); + CHECK_EQ(numSamples, labelPtr->getHeight()); + CHECK_EQ(dim, labelPtr->getWidth()); + + real* out = output.getData(); + real* result = getData(); + for 
(size_t i = 0; i < numSamples; ++i, out += dim) { + real sum = 0.0; + for (size_t j = 0; j < dim; ++j) { + if (out[j] >= threshold) { + sum += 1.0; + } + } + + const int* cols = labelPtr->getRowCols(i); + for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { + CHECK_LT(size_t(cols[j]), dim); + if (out[cols[j]] < threshold) { + sum += 1.0; + } else { + sum -= 1.0; + } + } + result[i] = sum / dim; + } +} + +void CpuMatrix::bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&in)); + + size_t outputW = getWidth(); + size_t batchSize = getHeight(); + size_t inputW = in.getWidth(); + size_t inputH = in.getHeight(); + size_t inPosOffset = inImgH * inImgW; + size_t outPosOffset = outImgH * outImgW; + (void)(inputH); + + real* outData = getData(); + const real* inData = in.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->copyFrom(in); + } else { + for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t i = 0; i < outImgH; ++i) { // loop for images + size_t h = ratioH * i; + size_t hid = (h < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * i - h; + real h2lambda = 1 - h1lambda; + + for (size_t j = 0; j < outImgW; ++j) { + size_t w = ratioW * j; + size_t wid = (w < inImgW - 1) ? 1 : 0; + real w1lambda = ratioW * j - w; + real w2lambda = 1 - w1lambda; + // calculate four position for bilinear interpolation + const real* inPos = &inData[k * inputW + h * inImgW + w]; + real* outPos = &outData[k * outputW + i * outImgW + j]; + for (size_t c = 0; c < numChannels; ++c) { // loop for channels + // bilinear interpolation + outPos[0] = + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + + h1lambda * (w2lambda * inPos[hid * inImgW] + + w1lambda * inPos[hid * inImgW + wid]); + inPos += inPosOffset; + outPos += outPosOffset; + } + } + } + } + } +} + +void CpuMatrix::bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&out)); + + size_t inputW = getWidth(); + size_t inputH = getHeight(); + size_t outputW = out.getWidth(); + size_t batchSize = out.getHeight(); + size_t inPosOffset = inImgH * inImgW; + size_t outPosOffset = outImgH * outImgW; + (void)(inputH); + + real* inGrad = getData(); + const real* outGrad = out.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->add(const_cast(out)); + } else { + for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t i = 0; i < outImgH; ++i) { // loop for images + size_t h = ratioH * i; + size_t hid = (h < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * i - h; + real h2lambda = 1 - h1lambda; + for (size_t j = 0; j < outImgW; ++j) { + size_t w = ratioW * j; + size_t wid = (w < inImgW - 1) ? 
1 : 0; + real w1lambda = ratioW * j - w; + real w2lambda = 1 - w1lambda; + + real* inPos = &inGrad[k * inputW + h * inImgW + w]; + const real* outPos = &outGrad[k * outputW + i * outImgW + j]; + for (size_t c = 0; c < numChannels; ++c) { // loop for channels + inPos[0] += h2lambda * w2lambda * outPos[0]; + inPos[wid] += h2lambda * w1lambda * outPos[0]; + inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0]; + inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0]; + inPos += inPosOffset; + outPos += outPosOffset; + } + } + } + } + } +} + +void CpuMatrix::vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW) { + real* outData = getData(); + int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; + int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; + int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; + + int channelsCol = channels * filterD * filterH * filterW; + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterW; + int hOffset = (c / filterW) % filterH; + int dOffset = (c / filterW / filterH) % filterD; + int cIn = c / filterW / filterH / filterD; + for (int d = 0; d < outDepth; ++d) { + for (int h = 0; h < outHeight; ++h) { + for (int w = 0; w < outWidth; ++w) { + int dPad = d * strideD - paddingD + dOffset; + int hPad = h * strideH - paddingH + hOffset; + int wPad = w * strideW - paddingW + wOffset; + + if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && + dPad >= 0 && dPad < depth) + outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = + data[((cIn * depth + dPad) * height + hPad) * width + wPad]; + else + outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0; + } + } + } + } +} + +void CpuMatrix::col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta) { + real* src = getData(); + int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; + int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; + int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; + int channelsCol = channels * filterD * filterH * filterW; + for (int c = 0; c < channelsCol; ++c) { + int wOffset = c % filterW; + int hOffset = (c / filterW) % filterH; + int dOffset = (c / filterW / filterH) % filterD; + int cIm = c / filterW / filterH / filterD; + for (int d = 0; d < outDepth; ++d) { + for (int h = 0; h < outHeight; ++h) { + for (int w = 0; w < outWidth; ++w) { + int dPad = d * strideD - paddingD + dOffset; + int hPad = h * strideH - paddingH + hOffset; + int wPad = w * strideW - paddingW + wOffset; + if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && + dPad >= 0 && dPad < depth) + trg[((cIm * depth + dPad) * height + hPad) * width + wPad] = + alpha * + src[((c * outDepth + d) * outHeight + h) * outWidth + w] + + beta * + trg[((cIm * depth + dPad) * height + hPad) * width + wPad]; + } + } + } + } +} + +//////////////////////////////////////////////////////////////// +// functions executed via cpu // +//////////////////////////////////////////////////////////////// + +void GpuMatrix::selectElements(Matrix& table, IVector& ids) { + execViaCpu2(&CpuMatrix::selectElements, *this, table, ids); +} +} // namespace paddle diff --git a/paddle/legacy/math/Matrix.h 
b/paddle/legacy/math/Matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff4f4cfc2a41add1a06308556b38aba5bbdac884
--- /dev/null
+++ b/paddle/legacy/math/Matrix.h
@@ -0,0 +1,2189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+#include
+
+#include "BaseMatrix.h"
+#include "MemoryHandle.h"
+#include "Vector.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
+enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
+
+/**
+ * @brief matrix sparse_format.
+ *
+ * nnz represents the number of nonzero elements in the sparse matrix.
+ *
+ * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
+ * represents row start index in Matrix. length of col and value are nnz.
+ *
+ * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
+ * represents col start index in Matrix. length of row and value are nnz.
+ *
+ * @code
+ * for example: [0, 1, 0, 2, 0;
+ *               1, 0, 0, 0, 0;
+ *               0, 0, 0, 2, 5];
+ * SPARSE_CSR row   [0, 2, 3, 5];
+ *            col   [1, 3, 0, 3, 4];
+ *            value [1, 2, 1, 2, 5]
+ * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
+ *            row   [1, 0, 0, 2, 2];
+ *            value [1, 1, 2, 2, 5]
+ * @endcode
+ */
+/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
+enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
+
+class Matrix;
+class GpuMatrix;
+class CpuMatrix;
+class CpuSparseMatrix;
+class GpuSparseMatrix;
+typedef std::shared_ptr<Matrix> MatrixPtr;
+typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
+typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
+typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
+typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
+
+/**
+ * Copy or assignment constructor will share the data as opposed to making a
+ * copy of the original data. To make a copy of the original data, use copyFrom()
+ * instead.
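+ *
+ * For example (an illustrative sketch using the create() overloads declared
+ * below):
+ * @code
+ * MatrixPtr a = Matrix::create(height, width);  // owns its own buffer
+ * MatrixPtr b = Matrix::create(a->getData(), height, width);  // shares a's data
+ * MatrixPtr c = Matrix::create(height, width);
+ * c->copyFrom(*a);  // independent copy of a's data
+ * @endcode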
+ */ +class Matrix : public BaseMatrix { + protected: + Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu); + + Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); + + Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu); + + static ThreadLocal tmpMat_; + + public: + size_t elementCnt_; // maximal number of elements which can be held in data_ + MemoryHandlePtr memoryHandle_; + + public: + virtual ~Matrix() {} + + static MatrixPtr create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans = false); + static MatrixPtr create(size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + bool trans = false, + bool useGpu = false); + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat foramt = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu); + + static void resizeOrCreateSparseMatrix( + MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat foramt = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static void resizeOrCreate(MatrixPtr& a, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + + /** + * @brief set the data buffer used to hold the matrix data. + * + * caller should make sure that the size of data is at least + * sizeof(real)*height*width. + */ + void setData(real* data) { + BaseMatrix::setData(data); + memoryHandle_.reset(); + } + + /// the data should be contiguous + void setData(real* data, size_t newHeight, size_t newWidth) { + setData(data); + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newHeight * newWidth; + stride_ = width_; + } + + size_t getWidth() const { return width_; } + size_t getHeight() const { return height_; } + size_t getStride() const { return stride_; } + size_t getElementCnt() const { return elementCnt_; } + virtual real* getData() { return data_; } + virtual const real* getData() const { return data_; } + bool isTransposed() const { return trans_; } + bool isContiguous() const { return stride_ == width_ || height_ == 1; } + + // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix + // befor call the following functions. + // Declare these functions in the base class just easy to call them. + // And these declarations should be moved to base class of sparse matrix + // if refactor sparse matrix + virtual int* getRows() const { + LOG(FATAL) << "Not implemented"; + return nullptr; //! suppress warning for no return value. + } + + virtual int* getCols() const { + LOG(FATAL) << "Not implemented"; + return nullptr; //! suppress warning for no return value. + } + + virtual SparseFormat getFormat() const { + LOG(FATAL) << "Not implemented"; + return SPARSE_CSR; //! 
suppress warning for no return value. + } + + virtual SparseValueType getValueType() const { + LOG(FATAL) << "Not implemented"; + return NO_VALUE; //! suppress warning for no return value. + } + + /** + * @brief matrix elment-wise add + * + * Named add3 just because add/add2 has been used in BaseMatrix.cu + * and they are not virtual function. + */ + virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } + + MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } + + virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } + + virtual void resetOne() { LOG(FATAL) << "Not implemented"; } + + void setDiag(real value); + + virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } + + virtual void trimFrom(const CpuSparseMatrix& src) { + LOG(FATAL) << "Not implemented"; + } + + // For GpuMatrix this is an asynchronous copy interface + // For CpuMatrix this is an synchronous copy interface + virtual void copyFrom(const Matrix& src, hl_stream_t stream) { + LOG(FATAL) << "Not implemented"; + } + + MatrixPtr subMatrix(size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol); + + MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { + return subMatrix(startRow, endRow, 0, getWidth()); + } + + MatrixPtr subColMatrix(size_t startCol, size_t endCol) { + return subMatrix(0, getHeight(), startCol, endCol); + } + + virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { + CHECK_LE(startRow + numRows, getHeight()); + return Matrix::create(getData() + startRow * getWidth(), + numRows, + getWidth(), + trans_, + useGpu_); + } + virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { + CHECK_LE(startRow + numRows, getHeight()); + CHECK_EQ(useGpu_, dest->useGpu_); + dest->setData(this->rowBuf(startRow), numRows, getWidth()); + return dest; + } + + /** + * If this is GpuMatrix, src is assumed to be CPU memory + * + * If this is CpuMatrix, src is assumed to be CPU memory + */ + virtual void copyFrom(const real* src, size_t size) { + LOG(FATAL) << "Not implemented"; + } + + virtual void copyFrom(const real* src, const int64_t* seq) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief convert a int vector to a real matrix. + * + * (1) source and dest are both in CPU. + * + * (2) sizes are exactly match. + */ + virtual void copyFrom(const IVector& src) { + LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; + } + + virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, + * NonValueSparseMatrix, etc.) as this. + * + * If height and width is zero, the new matrix will have the same size + * as this, otherwise the new matrix will have the specified size. + * + */ + virtual MatrixPtr clone(size_t height = 0, + size_t width = 0, + bool useGpu = false) { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + virtual real* getRowBuf(size_t row) { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + virtual real getElement(size_t x, size_t y) const { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual real getSum() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual void accumulateColSum(Matrix& src) { + LOG(FATAL) << "Not implemented"; + } + + virtual real getAbsSum() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + /** + * @note Original data may not be preserved after resize(). 
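The note above matters because resize() may reallocate the buffer; callers that want
lazy allocation usually go through the static helper declared earlier in this class.
A hypothetical call pattern (batchSize, dim and useGpu are illustrative names):

    MatrixPtr out;  // starts out null
    // creates the matrix if null, otherwise resizes it in place
    Matrix::resizeOrCreate(out, batchSize, dim, /*trans=*/false, useGpu);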
+ */ + virtual void resize(size_t newHeight, size_t newWidth) = 0; + + /** + * @note This should only be used for sparse matrix. + */ + virtual void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* total item used to allocate space */ + SparseValueType valueType, + SparseFormat format) = 0; + + /** + * @brief This should only be used for sparse matrix. + * + * Currently must be called for each row in order. + * The matrix is not valid until setRow is called for the last row. + */ + virtual void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) = 0; + + virtual MatrixPtr getTranspose() = 0; + + /** + * @brief hard transpose. + * + * allocate matTrans' memory outside, then set memAlloc as false; + * else set as true. + */ + virtual void transpose(MatrixPtr& matTrans, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief rotate 90 degrees in clock-wise if clockWise=true; + * otherwise rotate in anti clock-wise + * clock-wise: + * \f[ + * y(j,i) = x(M-i-1,j) + * \f] + * anti clock-wise: + * \f[ + * y(j,i) = x(i, N-1-j) + * \f] + * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. + * + * allocate matRot' memory outside, then set memAlloc as false; + * else set as true. + */ + virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { + LOG(FATAL) << "Not implemented"; + } + + virtual MatrixPtr getInverse() { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + /** + * @brief inverse. + * + * if allocate matInv's memory outside, then set memAlloc as false; + * else set as true. + */ + virtual void inverse(MatrixPtr& matInv, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + + public: + /// Only set all variables to 0 or NULL but not free them. + virtual void clear() { + height_ = 0; + width_ = 0; + data_ = NULL; + } + + void reshape(size_t height, size_t width); + + /// add b to each sample of this. + virtual void addBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void addSharedBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + void addBias(Matrix& b, real scale, bool sharedBias) { + if (!sharedBias) { + addBias(b, scale); + } else { + addSharedBias(b, scale); + } + } + + /// add each sample from a to this. + virtual void collectBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void collectSharedBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + void collectBias(Matrix& a, real scale, bool sharedBias) { + if (!sharedBias) { + collectBias(a, scale); + } else { + collectSharedBias(a, scale); + } + } + + virtual void sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + + virtual void sequenceAvgBackward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = scaleAB*(a*b) + scaleT*this + * @endcode + */ + virtual void mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /// Add a vector (column) b to matrix a, column by column. 
+ virtual void addColumnVector(const Matrix& b) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * this(i, j) += vec(index(i, j), 0) + * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 + * @endcode + */ + virtual void addByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& vec) { + (void)numClasses; + (void)codes; + (void)vec; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * vec(index(i, j), 0) += this(i, j) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec) { + (void)numClasses; + (void)codes; + (void)vec; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * this(i, j) += + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * mat.row(index(i, j)) += this(i, j) * input.row(i) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * input.row(i) += this(i, j) * mat.row(index(i, j)) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength + * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, j) + * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 + * @endcode + */ + virtual void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum) { + (void)numClasses; + (void)codes; + (void)sum; + (void)scaleSum; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength + * this(i, j) -= bit(i, j) + * where bit(i, j) is same as that for sumByBitCode + * @endcode + */ + virtual void subByBitCode(size_t numClasses_, IVector& codes) { + (void)numClasses_; + (void)codes; + LOG(FATAL) << "Not implemeted"; + } + + /** + * add the sum of each row of this to mat + */ + virtual void rowSum(Matrix& sum) { + (void)sum; + LOG(FATAL) << "Not implemeted"; + } + + /** + * set the max of each row of this to mat + */ + virtual void rowMax(Matrix& max) { + (void)max; + LOG(FATAL) << "Not implemeted"; + } + + /** + * set the max of each column of this to mat + */ + virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } + + /** + * @brief Get the top k elements of each column of this matrix. + * + * The row ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. 
+ */ + virtual void colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } + + /** + * @brief Get the top k elements of each row of this matrix. + * + * The column ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. + */ + virtual void rowMax(IVector& maxIds, Matrix& max) { + LOG(FATAL) << "Not implemented"; + } + + /// normalize each row so that the sum of each row is 1. + virtual void rowNormalizeL1(Matrix& out) { + (void)out; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * this = a*b + * @endcode + */ + virtual void mul(const Matrix& a, const Matrix& b) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = scaleAB*(this*b) + scaleT*this + * @endcode + */ + virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = this* b + * @endcode + */ + virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } + + /** + * @code + * this = scaleAB*(a*this) + scaleT*this + * @endcode + */ + virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = a*this) + * @endcode + */ + virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } + + /// merge the element for each col. + virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } + + /// copy -log(output[label]) to this->data[i]. + virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the error of outputV according to label. + virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /// copy -log(output[label]) to this->data[i]. + virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the error of outputV according to label. + virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; + } + + /** + * \f[ + * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} + * \f] + * + * b contains M elements, + * c contains N elements (N is odd), + * b's index arithmetic is computed modulo M, + * c's index arithmetic is computed modulo N. 
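A worked instance of the circular-convolution formula above: with M = 4 and N = 3 the
window is j in {-1, 0, 1}, and the wrapped indices give, for i = 0,

    a[0] = b[3]*c[2] + b[0]*c[0] + b[1]*c[1]

since b_{-1} wraps to b_{M-1} and c_{-1} wraps to c_{N-1}.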
+ */ + virtual void circularConv(Matrix& b, Matrix& c) { + LOG(FATAL) << "Not implemented"; + } + + virtual void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2) { + LOG(FATAL) << "Not implemented"; + } + + /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ + virtual void softmax(Matrix& output) { + (void)output; + LOG(FATAL) << "Not implemeted"; + } + virtual void sequenceSoftmax(Matrix& output, const IVector& index) { + (void)output; + LOG(FATAL) << "Not implemeted"; + } + + virtual void softmaxBackward(Matrix& outputV) { + (void)outputV; + LOG(FATAL) << "Not implemeted"; + } + + /* + sum_i = sum_j this_ij * output_ij + this_ij = output_ij* (this_ij - sum_i) + */ + virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the sum of squares diff cost. + virtual void sumOfSquares(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /// gradient of sumOfSquares. + virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + virtual void smoothL1(Matrix& output, Matrix& label, real destScale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } + + virtual void tanhDerivative(Matrix& output) { + LOG(FATAL) << "Not implemented"; + } + + virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } + + virtual void softreluDerivative(Matrix& output) { + LOG(FATAL) << "Not implemented"; + } + + virtual void scaledTanh(Matrix& output, real p1, real p2) { + LOG(FATAL) << "Not implemented"; + } + + /// print out the values of elements to os + virtual void print(std::ostream& os) const { + LOG(FATAL) << "Not implemented"; + } + + /** + * print a part of the matrix + * from the (top,left) value to the (height, width) value (not included) + */ + virtual void print(std::ostream& os, size_t height, size_t width) const { + LOG(FATAL) << "Not implemented"; + } + + /// print one row to os + virtual void printOneRow(std::ostream& os, size_t idx) const { + LOG(FATAL) << "Not implemented"; + } + + virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} + + virtual real getMin() { + LOG(FATAL) << "Not implemented"; + return 0; + } + virtual real getMax() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } + + /** + * @brief calulate the error of classification + * + * output[i] = 1 if row i is an error. + * + * output[i] = 0 if row i is correct. + * + */ + virtual void classificationError(Matrix& output, + IVector& label, + size_t topkSize = 1) { + LOG(FATAL) << "Not implemented"; + } + + virtual void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * Pooling forward operation, pick out the largest element + * in the sizeX of value, if the maskMatP is not NULL, it will + * also caculate the location indices. 
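The output geometry for these pooling routines is not restated here; assuming the same
convention as the CpuMatrix::vol2Col implementation earlier in this patch, each spatial
dimension shrinks as

    outputH = (imgSizeH + 2 * paddingH - sizeY) / strideH + 1

so, for example, imgSizeH = 5, paddingH = 1, sizeY = 3, strideH = 2 gives
outputH = (5 + 2 - 3) / 2 + 1 = 3.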
+ */ + virtual void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP = NULL) { + LOG(FATAL) << "Not implemeted"; + } + + /// Pooling backward operation. + virtual void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + /// Pooling forward operation, caculate the average of sizeX elements. + virtual void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode = true) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode = true) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * Pooling 3D forward operation, pick out the largest element + * in the sizeX of value + */ + virtual void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * Input: one or more sequences. Each sequence contains some instances. + * + * Output: output size is the number of input sequences (NOT input + * instances). + * + * output[i] is set to max_input[i]. 
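An illustrative case, assuming `sequence` holds the start offsets of each sequence:
with 5 input rows and sequence = [0, 3, 5] there are two sequences, so the output has
2 rows; output row 0 is the per-column max over input rows 0..2, output row 1 over
input rows 3..4, and `index` records which input row supplied each maximum for use in
the backward pass.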
+ */ + virtual void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * this.row[i] += table.row[ids[i]] + * if ids[i] == -1, it will be ignored + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids) { + (void)table; + (void)ids; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * table.row[ids[i]] += this.row[i] + * if ids[i] == -1, it will be ignored + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids) { + (void)table; + (void)ids; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * table[i, id[i]] += this[i] + * @endcode + */ + virtual void addElements(Matrix& table, IVector& ids) { + LOG(FATAL) << "Not implemented"; + } + /** + * @brief cross entropy for multi binary labels + * + * @code + * this[i] = -sum(label[i][j]*log(output[i][j]) + * + (1-label[i][j])*log(1-output[i][j])) + * @endcode + */ + virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief The gradient of cross entropy for multi binary labels on output + * + * @code + * this[i][j] = -label[i][j]/output[i][j] + * + (1-label[i][j])/(1-output[i][j]) + * @endcode + */ + virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief Calculate the classification error for multi binary labels + * + * @code + * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) + * || (output[i][j] < threshold && label[i][j] == 1)) + * / output->getWidth() + * @endcode + */ + virtual void classificationErrorMulti(Matrix& output, + Matrix& label, + real threshold) { + LOG(FATAL) << "Not implemented"; + } + + virtual void paramReluForward(Matrix& data, Matrix& W) { + LOG(FATAL) << "Not implemented"; + } + virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { + LOG(FATAL) << "Not implemented"; + } + virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + LOG(FATAL) << "Not implemented"; + } + + virtual void vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + virtual void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + + template + void operator=(const ExpressionType& expr) { + if 
(useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + + bool isEmpty() const { return data_ == nullptr; } + + explicit operator bool() const { return !isEmpty(); } +}; + +inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { + mat.print(os); + return os; +} + +class GpuMatrix : public Matrix { + public: + GpuMatrix(); + + GpuMatrix(size_t height, size_t width, bool trans = false); + GpuMatrix(real* data, size_t height, size_t width, bool trans = false) + : Matrix(data, height, width, trans, true) {} + GpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false) + : Matrix(data, height, width, stride, trans, true) {} + GpuMatrix(GpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : Matrix(dataHandle, height, width, trans, true) {} + ~GpuMatrix(); + + void zeroMem(); + void resetOne(); + void setDiag(real value); + + void resize(size_t newHeight, size_t newWidth); + void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* used to allocate space */ + SparseValueType valueType, + SparseFormat format) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + + /** + * Copy the data from cpu_memory buffer + */ + void copyFrom(const real* hostSrc, size_t size); + + void copyFrom(const real* hostSrc, const int64_t* seq); + + void copyFrom(const Matrix& src, hl_stream_t stream); + + void copyFrom(const Matrix& src); + + void copyFrom(const IVector& src); + + void copyByRowIndex(Matrix& b, const IVector& rowIndex); + + MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + + real getElement(size_t x, size_t y) const; + + real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } + virtual real* getRowBuf(size_t row) { return getRow(row); } + + real getSum(); + void accumulateColSum(Matrix& src); + real getAbsSum(); + + real getMin(); + real getMax(); + + MatrixPtr getTranspose(); + void transpose(MatrixPtr& matTrans, bool memAlloc); + void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); + + MatrixPtr getInverse(); + void inverse(MatrixPtr& matInv, bool memAlloc); + + /// add b to each sample of this. + void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); + + /** + * @code + * add each sample from a to this. 
+ * @endcode + */ + void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); + + void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); + + /** + * @code + * this.row[i] += table.row[ids[i]] + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids); + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids); + + /** + * @code + * table.row[ids[i]] += this.row[i] + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids); + + void addColumnVector(const Matrix& b); + + /** + * @code + * this = scaleAB*(a*b) + scaleT*this + * @endcode + */ + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); + + /** + * @code + * this = a*b + * @endcode + */ + void mul(const Matrix& a, const Matrix& b); + + void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); + + void mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT); + + void mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT); + + /** + * @code + * this = scaleAB*(this*b) + scaleT*this + * @endcode + */ + void rightMul(Matrix& b, real scaleAB, real scaleT); + + /** + * @code + * this = this* b + * @endcode + */ + void rightMul(Matrix& b); + + /** + * @code + * this = scaleAB*(a*this) + scaleT*this + * @endcode + */ + void leftMul(Matrix& a, real scaleAB, real scaleT); + + /** + * @code + * this = a*this + * @endcode + */ + void leftMul(Matrix& a); + + void colMerge(Matrix& src); + void rowSum(Matrix& sum); + void rowMax(Matrix& max); + void rowMax(IVector& maxIds, Matrix& max); + void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& max); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); + + void oneHotCrossEntropy(Matrix& output, IVector& label); + void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha); + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha); + + void softmax(Matrix& output); + void sequenceSoftmax(Matrix& output, const IVector& index); + void softmaxBackward(Matrix& outputV); + void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); + + /// calculate the sum of squares diff cost. + void sumOfSquares(Matrix& output, Matrix& label); + + /// gradient of sumOfSquares. 
+ void sumOfSquaresBp(Matrix& outputV, Matrix& label); + void tanh(Matrix& output); + void tanhDerivative(Matrix& output); + void softrelu(Matrix& output); + void softreluDerivative(Matrix& output); + void scaledTanh(Matrix& output, real p1, real p2); + + virtual void print(std::ostream& os) const; + virtual void print(std::ostream& os, size_t height, size_t width) const; + + void paramReluForward(Matrix& data, Matrix& W); + void paramReluBackwardW(Matrix& oGrad, Matrix& data); + void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); + + void check(std::ostream& os, Matrix& refMat, bool printDiff = true); + void randomizeUniform(); + + void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); + + void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput); + + void avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real 
scaleTargets, + real scaleOutput); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index); + + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW); + + void col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); + + template + void operator=(const ExpressionType& expr) { + TensorGpuApply(*this, expr); + } +}; + +class CpuMatrix : public Matrix { + private: + MatrixPtr sftmaxSum_; + MatrixPtr sftmaxDot_; + + public: + CpuMatrix(size_t height, size_t width, bool trans = false); + CpuMatrix(real* data, size_t height, size_t width, bool trans = false) + : Matrix(data, height, width, trans, false) {} + CpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false) + : Matrix(data, height, width, stride, trans, false) {} + + CpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : Matrix(dataHandle, height, width, trans, false) {} + + ~CpuMatrix(); + + void zeroMem(); + void resetOne(); + void setDiag(real value); + + void resize(size_t newHeight, size_t newWidth); + void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* used to allocate space */ + SparseValueType valueType, + SparseFormat format) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + + real getElement(size_t x, size_t y) const; + real getSum(); + void accumulateColSum(Matrix& src); + real getAbsSum(); + + MatrixPtr getTranspose(); + void transpose(MatrixPtr& matTrans, bool memAlloc); + void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); + + MatrixPtr getInverse(); + void inverse(MatrixPtr& matInv, bool memAlloc); + + void copyFrom(const Matrix& src); + + void copyFrom(const Matrix& src, hl_stream_t stream); + + void copyFrom(const real* cpuSrc, size_t size); + + void copyFrom(const real* cpuSrc, const int64_t* seq); + + void copyFrom(const IVector& src); + + void copyFrom(CpuSparseMatrix& src); + + void copyByRowIndex(Matrix& b, const IVector& rowIndex); + + MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + + void upsampleForward(Matrix& input, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void upsampleBackward(Matrix& outputGrad, + Matrix& mask, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t outputH, + size_t outputW); + + void maxPoolForward(Matrix& inputMat, + 
size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + MatrixPtr maskMatP); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW, + bool excludeMode = true); + + void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput); + + void avgPool3DForward(Matrix& input, + size_t channels, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t paddingD, + size_t paddingH, + size_t paddingW, + real scaleTargets, + real scaleOutput); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index); + + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index); + + real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } + virtual real* getRowBuf(size_t row) { return getRow(row); } + + public: + /// add b to each sample of this. + void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); + + /// add each sample of a to this. 
+ void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); + + void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); + + /** + * @code + * this.row[i] += table.row[ids[i]] + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids); + + /** + * @code + * table.row[ids[i]] += this.row[i] + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids); + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids); + + /** + * @code + * table[i, id[i]] += this[i] + * @endcode + */ + virtual void addElements(Matrix& table, IVector& ids); + + /** + * use abstract getRow() to get row from table. + * + * Define table as template instead of virtual class for performance sake. + * internal used by above two virtual funcs. + */ + template + void selectRowsImp(TableMatType& table, IVector& ids); + template + void addToRowsImp(TableMatType& table, IVector& ids); + + void addColumnVector(const Matrix& b); + + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); + void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); + + static void mul(CpuMatrix* a, + CpuMatrix* b, + CpuSparseMatrix* c, + real scaleAB, + real scaleT); + + /** + * c = a * b + * + * use abstract getRow() to get row from B,C. + * Define B,C as template instead of virtual class for performance sake. + */ + template + static void mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); + + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + void mul(const Matrix& a, const Matrix& b); + + void rightMul(Matrix& b, real scaleAB, real scaleT); + void rightMul(Matrix& b); + + void leftMul(Matrix& a, real scaleAB, real scaleT); + void leftMul(Matrix& a); + void colMerge(Matrix& src); + void rowSum(Matrix& sum); + void rowMaxId(IVector& maxIds); + void rowMax(Matrix& max); + void rowMax(IVector& maxIds, Matrix& maxVal); + void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& maxVal); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); + void rowNormalizeL1(Matrix& out); + + void oneHotCrossEntropy(Matrix& output, IVector& label); + void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha); + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha); + + void circularConv(Matrix& b, Matrix& c); + void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2); + + void softmax(Matrix& output); + void sequenceSoftmax(Matrix& output, const IVector& index); + void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); + + /// calculate the sum of squares diff cost. + void sumOfSquares(Matrix& output, Matrix& label); + + /// gradient of sumOfSquares. 
+ void sumOfSquaresBp(Matrix& outputV, Matrix& label); + + void smoothL1(Matrix& output, Matrix& label, real destScale); + void smoothL1Bp(Matrix& output, Matrix& label, real destScale); + + void tanh(Matrix& output); + void tanhDerivative(Matrix& output); + + void softrelu(Matrix& output); + void softreluDerivative(Matrix& output); + void scaledTanh(Matrix& output, real p1, real p2); + + void print(std::ostream& os) const; + void print(std::ostream& os, size_t height, size_t width) const; + void printOneRow(std::ostream& os, size_t idx) const; + + void paramReluForward(Matrix& data, Matrix& W); + void paramReluBackwardW(Matrix& oGrad, Matrix& data); + void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); + + void check(std::ostream& os, Matrix& refMat, bool printDiff = true); + + real getMin(); + real getMax(); + + void randomizeUniform(); + + void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); + + void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); + + void addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec); + + void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input); + + void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input); + + void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input); + + void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum); + + void subByBitCode(size_t numClasses_, IVector& codes); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); + void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void vol2Col(real* data, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW); + + void col2Vol(real* trg, + int channels, + int depth, + int height, + int width, + int filterD, + int filterH, + int filterW, + int strideD, + int strideH, + int strideW, + int paddingD, + int paddingH, + int paddingW, + real alpha, + real beta); + + template + void operator=(const ExpressionType& expr) { + TensorCpuApply(*this, expr); + } +}; + +class SharedCpuMatrix : public CpuMatrix { + public: +#ifndef PADDLE_MOBILE_INFERENCE + /* blockNum is number of partitions of the matrix */ + SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) + : CpuMatrix(height, width, trans) { + initShared(blockNum); + } + SharedCpuMatrix( + int blockNum, real* data, size_t height, size_t width, bool trans = false) + : CpuMatrix(data, height, width, trans) { + initShared(blockNum); + } + + SharedCpuMatrix(int blockNum, + CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : CpuMatrix(dataHandle, height, width, trans) { + initShared(blockNum); + } + + SharedCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + 
size_t width, + bool trans = false) + : CpuMatrix(dataHandle, height, width, trans) { + initBlock(1); + } + + ~SharedCpuMatrix() {} + + public: + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + virtual void add(Matrix& b, real p1, real p2); + virtual void add(real p1, real p2); + + private: + using Matrix::mul; + void initShared(int blockNum); + void initBlock(int blockNum); + + int blockNum_; + std::vector> blockLocks_; + ThreadLocal localBuf_; + ThreadLocal> localBufRows_; + ThreadLocal> blockSeq_; +#endif +}; + +typedef struct { unsigned int col; } sparse_non_value_t; + +typedef struct { + unsigned int col; + float value; +} sparse_float_value_t; + +} // namespace paddle +#include "ExecViaCpu.h" diff --git a/paddle/legacy/math/MatrixBitCode.cpp b/paddle/legacy/math/MatrixBitCode.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f35f266a30506110eb6c656f7b631d12d8f6ae90 --- /dev/null +++ b/paddle/legacy/math/MatrixBitCode.cpp @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Matrix.h" +#include "hl_gpu.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +namespace { + +struct SimpleCode { + SimpleCode(size_t code, size_t numClasses) : c_(code + numClasses) {} + inline size_t calcIndex(int bit) const { return (c_ >> (bit + 1)) - 1; } + inline bool calcBit(int bit) const { return c_ & (1 << bit); } + inline int getLength() const { return findLastSet(c_) - 1; } + + private: + size_t c_; +}; + +struct SimpleCodeTable { + explicit SimpleCodeTable(size_t numClasses) : numClasses_(numClasses) {} + SimpleCode operator()(size_t code) const { + return SimpleCode(code, numClasses_); + } + size_t size() const { return numClasses_; } + int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); } + + private: + size_t numClasses_; + int maxCodeLength_; +}; + +} // namespace + +/** + * CodeTable class should support 3 functions: + * + * size_t size() + * return the number of codes + * + * int getMaxCodeLength() + * return the maximal code length + * + * Code operator()(size_t i) + * return the i-th code. Code class is descriebed below. 
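A worked instance of the SimpleCode arithmetic above (assuming findLastSet returns the
1-based position of the highest set bit): with numClasses = 4 and code = 1, the stored
value is c_ = 1 + 4 = 5 (binary 101), so

    getLength()  == findLastSet(5) - 1 == 2
    calcIndex(0) == (5 >> 1) - 1 == 1      calcIndex(1) == (5 >> 2) - 1 == 0
    calcBit(0)   == true  (5 & 1)          calcBit(1)   == false (5 & 2)

i.e. for class 1 the two traversed internal nodes are 1 and 0.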
+ * + * Code class should support 3 functions: + * + * int getLength() + * return the length of the code + * + * bool calcIndex(int bit) + * bit ranges from 0 to getLength() - 1 + * return the index for the (1+bit) level parent + * + * bool calcBit(int bit) + * return true if the bit level parent is the right child of (1+bit) level + * parent + * + */ + +/* + for i: + for j < codeLength: + op(tmat(i, j), vec(0, index(i, j))) +*/ +template +static void addByBitCodeT( + Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) { + CHECK(!vec.useGpu()); + + size_t numClasses = codeTable.size(); + size_t maxCodeLength = codeTable.getMaxCodeLength(); + size_t numSamples = tmat.getHeight(); + size_t oWidth = tmat.getWidth(); + CHECK_EQ(tmat.getWidth(), maxCodeLength); + CHECK_EQ(codes.getSize(), numSamples); + CHECK_EQ(vec.getHeight(), (size_t)1); + CHECK_EQ(vec.getWidth(), numClasses - 1); + + auto data = tmat.getData(); + auto v = vec.getData(); + const int* c = codes.getData(); + for (size_t i = 0; i < numSamples; ++i) { + auto code = codeTable(c[i]); + int codeLength = code.getLength(); + for (int j = 0; j < codeLength; ++j) { + size_t index = code.calcIndex(j); + op(data[i * oWidth + j], v[index]); + } + } +} + +/* For j < codeLength: + this(i, j) += vec(0, index(i, j)) +*/ +void CpuMatrix::addByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& vec) { + auto op = [](real& t, real v) { t += v; }; + addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); +} + +/* For j < codeLength: + vec(0, index(i, j)) += this(i, j) +*/ +void CpuMatrix::addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec) { + auto op = [](real t, real& v) { v += t; }; + addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); +} + +/* + for i: + for j < codeLength: + op(tmat(i, j), mat.row(index(i, j)), input.row(i)) +*/ +template +void mulByBitCodeT(Op op, + CodeTable codeTable, + IVec& codes, + TMat& tmat, + WMat& weight, + InMat& input) { + CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu()); + + size_t numClasses = codeTable.size(); + size_t maxCodeLength = codeTable.getMaxCodeLength(); + size_t numSamples = tmat.getHeight(); + size_t inputDim = input.getWidth(); + size_t oWidth = tmat.getWidth(); + CHECK_EQ(tmat.getWidth(), maxCodeLength); + CHECK_EQ(codes.getSize(), numSamples); + CHECK_EQ(input.getHeight(), numSamples); + CHECK_EQ(weight.getHeight(), numClasses - 1); + CHECK_EQ(weight.getWidth(), inputDim); + + real* data = tmat.getData(); + const int* c = codes.getData(); + for (size_t i = 0; i < numSamples; ++i) { + auto code = codeTable(c[i]); + int codeLength = code.getLength(); + for (int j = 0; j < codeLength; ++j) { + size_t index = code.calcIndex(j); + op(data[i * oWidth + j], weight.rowBuf(index), input.rowBuf(i), inputDim); + } + } +} + +/* For j < codeLength: + this(i, j) += +*/ +void CpuMatrix::mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& weight, + const Matrix& input) { + auto op = []( + real& t, const real* weightRow, const real* inputRow, size_t inputDim) { + real sum = 0; + for (size_t k = 0; k < inputDim; ++k) { + sum += weightRow[k] * inputRow[k]; + } + t += sum; + }; + + mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); +} + +/* For index(i, j) >= 0: + weight.row(index(i, j)) += this(i, j) * input.row(i) +*/ +void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& weight, + const Matrix& input) { + auto op = []( + const 
real t, real* weightRow, const real* inputRow, size_t inputDim) { + for (size_t k = 0; k < inputDim; ++k) { + weightRow[k] += t * inputRow[k]; + } + }; + + mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); +} + +/* For j < codeLength: + input.row(i) += this(i, j) * weight.row(index(i, j)) +*/ +void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& weight, + Matrix& input) { + auto op = []( + const real t, const real* weightRow, real* inputRow, size_t inputDim) { + for (size_t k = 0; k < inputDim; ++k) { + inputRow[k] += t * weightRow[k]; + } + }; + + mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); +} + +template +void sumByBitCodeT(CodeTable codeTable, + IVector& codes, + const CpuMatrix& tmat, + Matrix& sum, + real scaleSum) { + size_t maxCodeLength = codeTable.getMaxCodeLength(); + size_t numSamples = tmat.getHeight(); + size_t oWidth = tmat.getWidth(); + CHECK_EQ(tmat.getWidth(), maxCodeLength); + CHECK_EQ(codes.getSize(), numSamples); + CHECK_EQ(sum.getHeight(), numSamples); + CHECK_EQ(sum.getWidth(), (size_t)1); + + const real* data = tmat.getData(); + real* s = sum.getData(); + int* c = codes.getData(); + for (size_t i = 0; i < numSamples; ++i) { + real sm = 0; + auto code = codeTable(c[i]); + int codeLength = code.getLength(); + for (int j = 0; j < codeLength; ++j) { + if (code.calcBit(j)) { + sm += data[i * oWidth + j]; + } + } + s[i] = scaleSum * sm; + } +} + +/* For j < codeLength: + sum(i, 0) = \sum_j bit(i, j) * this(i, j) +*/ +void CpuMatrix::sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum) { + sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum); +} + +template +void subByBitCodeT(CodeTable codeTable, IVector& codes, CpuMatrix& tmat) { + size_t maxCodeLength = codeTable.getMaxCodeLength(); + size_t numSamples = tmat.getHeight(); + size_t oWidth = tmat.getWidth(); + CHECK_EQ(tmat.getWidth(), maxCodeLength); + CHECK_EQ(codes.getSize(), numSamples); + + real* data = tmat.getData(); + int* c = codes.getData(); + for (size_t i = 0; i < numSamples; ++i) { + auto code = codeTable(c[i]); + int codeLength = code.getLength(); + for (int j = 0; j < codeLength; ++j) { + if (code.calcBit(j)) { + data[i * oWidth + j] -= 1; + } + } + } +} + +/* For j < codeLength + this(i, j) -= bit(i, j) +*/ +void CpuMatrix::subByBitCode(size_t numClasses, IVector& codes) { + subByBitCodeT(SimpleCodeTable(numClasses), codes, *this); +} + +} // namespace paddle diff --git a/paddle/math/MemoryHandle.cpp b/paddle/legacy/math/MemoryHandle.cpp similarity index 100% rename from paddle/math/MemoryHandle.cpp rename to paddle/legacy/math/MemoryHandle.cpp diff --git a/paddle/math/MemoryHandle.h b/paddle/legacy/math/MemoryHandle.h similarity index 100% rename from paddle/math/MemoryHandle.h rename to paddle/legacy/math/MemoryHandle.h diff --git a/paddle/math/NEONFunctions.cpp b/paddle/legacy/math/NEONFunctions.cpp similarity index 100% rename from paddle/math/NEONFunctions.cpp rename to paddle/legacy/math/NEONFunctions.cpp diff --git a/paddle/math/NEONFunctions.h b/paddle/legacy/math/NEONFunctions.h similarity index 100% rename from paddle/math/NEONFunctions.h rename to paddle/legacy/math/NEONFunctions.h diff --git a/paddle/math/PoolAllocator.cpp b/paddle/legacy/math/PoolAllocator.cpp similarity index 100% rename from paddle/math/PoolAllocator.cpp rename to paddle/legacy/math/PoolAllocator.cpp diff --git a/paddle/math/PoolAllocator.h 
b/paddle/legacy/math/PoolAllocator.h similarity index 100% rename from paddle/math/PoolAllocator.h rename to paddle/legacy/math/PoolAllocator.h diff --git a/paddle/legacy/math/RowBuffer.h b/paddle/legacy/math/RowBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..9dfd5eff06a39494cea6a8ce0b1f5ead6490b148 --- /dev/null +++ b/paddle/legacy/math/RowBuffer.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "MemoryHandle.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +/** + * @brief The RowBuffer class + * Represent the SparseRow Matrix Data. + * + * If not set memory handler, then the data could be auto growth. + */ +class RowBuffer { + public: + /** + * @brief RowBuffer create a auto-growth row buffer. The row length is width. + * @param width the length of each row, a.k.a matrix width. + */ + explicit RowBuffer(size_t width) : width_(width) {} + + /** + * @brief RowBuffer create a row buffer, which cannot be auto-growth. + * @param mem the pre-allocated memory. + * @param width the length of each row, a.k.a matrix width. + */ + RowBuffer(const CpuMemHandlePtr& mem, size_t width) + : preallocatedBuf_(mem), width_(width) {} + + /** + * @brief resize resize the buffer with rowCount + * @param rowCnt number of row. matrix height. + */ + inline void resize(int rowCnt) { + if (preallocatedBuf_) { + CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real)); + } else { + rowStore_.resize(rowCnt * width_); + } + } + + /** + * @brief get a row buffer with row index. + * @param row the index of row. + * @return row buffer. + */ + inline real* get(int row) const { + if (preallocatedBuf_) { + CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize()); + return reinterpret_cast(preallocatedBuf_->getBuf()) + row * width_; + } else { + CHECK_LE((row + 1) * width_, rowStore_.size()); + return const_cast(rowStore_.data() + row * width_); + } + } + + /** + * @brief get a row buffer with row index. If row index is larger than local + * buffer, the size of local buffer will grow. + * @param row the index of row. + * @return row buffer. + */ + inline real* getWithAutoGrowth(int row) { + if (preallocatedBuf_) { + return get(row); + } else { + if ((rowStore_.size() <= row * width_)) { + rowStore_.resize((row + 1) * width_); + } + return rowStore_.data() + row * width_; + } + } + + /** + * @return raw data buffer. + */ + inline real* data() { + if (preallocatedBuf_) { + return reinterpret_cast(preallocatedBuf_->getBuf()); + } else { + return rowStore_.data(); + } + } + + /** + * @brief clear local buffer. It only affect auto-growth buffer. + */ + inline void clear() { + // swap an empty vector to it to free the memory. + std::vector> empty; + rowStore_.swap(empty); + } + + /** + * @brief get current number of rows. + * @return number of rows. 
+ */ + inline size_t getRowCount() const { + if (preallocatedBuf_) { + return preallocatedBuf_->getSize() / sizeof(real) / width_; + } else { + return rowStore_.size() / width_; + } + } + + /** + * @brief get is this buffer can automatically grow or not. + * @return ture if can automacitally grow. + */ + inline bool isAutoGrowth() const { return !preallocatedBuf_; } + + /** + * @brief return the width of matrix. a.k.a length of row. + * @return width of matrix + */ + inline size_t getWidth() const { return width_; } + + private: + //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid + //! of std::vector here. + CpuMemHandlePtr preallocatedBuf_; + std::vector> rowStore_; + size_t width_; +}; +} // namespace paddle diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/legacy/math/SIMDFunctions.cpp similarity index 100% rename from paddle/math/SIMDFunctions.cpp rename to paddle/legacy/math/SIMDFunctions.cpp diff --git a/paddle/math/SIMDFunctions.h b/paddle/legacy/math/SIMDFunctions.h similarity index 100% rename from paddle/math/SIMDFunctions.h rename to paddle/legacy/math/SIMDFunctions.h diff --git a/paddle/legacy/math/SparseMatrix.cpp b/paddle/legacy/math/SparseMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f68252b0a74802946e899e6e13e1da681d76986 --- /dev/null +++ b/paddle/legacy/math/SparseMatrix.cpp @@ -0,0 +1,864 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
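A minimal usage sketch of the auto-growth RowBuffer introduced above; only the RowBuffer API declared in this header is assumed, and the helper function name is hypothetical:

```cpp
#include "paddle/legacy/math/RowBuffer.h"

#include <cstring>

namespace paddle {

// Illustrative only: touch a row that does not exist yet and let the buffer
// grow on demand (auto-growth mode, i.e. no CpuMemHandlePtr supplied).
void rowBufferSketch() {
  RowBuffer buf(/*width=*/8);

  // getWithAutoGrowth() extends the local store when the row index is past the
  // current row count, so rows may be touched in any order.
  real* row5 = buf.getWithAutoGrowth(5);
  std::memset(row5, 0, sizeof(real) * buf.getWidth());

  CHECK(buf.isAutoGrowth());         // true: no pre-allocated memory handle
  CHECK_EQ(buf.getRowCount(), 6UL);  // rows 0..5 now exist
}

}  // namespace paddle
```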
*/ + +#include "SparseMatrix.h" +#include +#include +#include +#include "hl_gpu.h" +#include "hl_top_k.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +GpuSparseMatrix::GpuSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(NULL, height, width, trans, true) { + resize(height, width, nnz, valueType, format); +} + +GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle, + hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle) + : Matrix(dataHandle, height, width, trans, true) { + CHECK(dataHandle && sMatrix) << "Invalid argument pointer"; + + size_t size = 0; + if (format == SPARSE_CSR) { + size = (height + 1) * sizeof(int) + nnz * sizeof(int); + } else { + size = (width + 1) * sizeof(int) + nnz * sizeof(int); + } + + if (NO_VALUE != valueType) { + size += nnz * sizeof(real); + } + CHECK_LE(size, dataHandle->getSize()); + + sMatrix_ = sMatrix; + + if (sMemoryHandle == NULL) { + sMemoryHandle_ = std::make_shared(dataHandle->getSize()); + } else { + CHECK_EQ(sMemoryHandle->getSize(), dataHandle->getSize()); + sMemoryHandle_ = sMemoryHandle; + } + + elementCnt_ = nnz; + valueType_ = valueType; + format_ = format; + if (format_ == SPARSE_CSR) + sparseResizeCSR(); + else + sparseResizeCSC(); +} + +GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle) + : Matrix(NULL, height, width, trans, true) { + CHECK(sMatrix) << "Invalid argument pointer"; + sMatrix_ = sMatrix; + sMemoryHandle_ = sMemoryHandle; + elementCnt_ = nnz; + format_ = format; + valueType_ = valueType; +} + +GpuSparseMatrix::GpuSparseMatrix(real* value, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans) + : Matrix(NULL, height, width, trans, true) { + size_t size = 0; + if (format == SPARSE_CSR) { + size = (height + 1) * sizeof(int) + nnz * sizeof(int); + } else { + size = (width + 1) * sizeof(int) + nnz * sizeof(int); + } + + if (NO_VALUE != valueType) { + size += nnz * sizeof(real); + } + elementCnt_ = nnz; + valueType_ = valueType; + format_ = format; + + sMemoryHandle_ = std::make_shared(size); + if (format_ == SPARSE_CSR) { + rows_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf())); + cols_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + + if (sMatrix_ == NULL) { + /* construct hl_sparse_matrix_s */ + hl_sparse_matrix_s tmp; + hl_construct_sparse_matrix( + &tmp, + value, + rows, + cols, + HL_SPARSE_CSR, + valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); + hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); + sMatrix_ = tmp2; + } + + } else { + cols_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf())); + rows_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + + if (sMatrix_ == NULL) { + /* construct hl_sparse_matrix_s */ + hl_sparse_matrix_s tmp; + hl_construct_sparse_matrix( + &tmp, + value, + rows, + cols, + HL_SPARSE_CSC, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); + hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); + sMatrix_ = tmp2; + } + } +} + +void GpuSparseMatrix::sparseResizeCSR() { + rows_ = + reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf())); + cols_ = + reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf()) + + (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + + if (sMatrix_ == NULL) { + /* construct hl_sparse_matrix_s */ + hl_sparse_matrix_s tmp; + hl_construct_sparse_matrix( + &tmp, + data_, + memoryHandle_->getSize(), + HL_SPARSE_CSR, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); + hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); + sMatrix_ = tmp2; + } +} + +void GpuSparseMatrix::sparseResizeCSC() { + cols_ = + reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf())); + rows_ = + reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int)); + if (NO_VALUE != valueType_) { + value_ = reinterpret_cast( + reinterpret_cast(sMemoryHandle_->getBuf()) + + (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); + } else { + value_ = NULL; + } + + if (sMatrix_ == NULL) { + /* construct hl_sparse_matrix_s */ + hl_sparse_matrix_s tmp; + hl_construct_sparse_matrix( + &tmp, + memoryHandle_->getBuf(), + memoryHandle_->getSize(), + HL_SPARSE_CSC, + valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); + hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); + sMatrix_ = tmp2; + } +} + +void GpuSparseMatrix::resize(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType, + SparseFormat format) { + if (format == SPARSE_CSR) { + resizeCSR(newHeight, newWidth, newNnz, valueType); + } else { + resizeCSC(newHeight, newWidth, newNnz, valueType); + } +} + +void GpuSparseMatrix::resizeCSR(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType) { + size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); + if (NO_VALUE != valueType) { + newSize += newNnz * sizeof(real); + } + + if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { + memoryHandle_ = std::make_shared(newSize); + data_ = reinterpret_cast(memoryHandle_->getBuf()); + sMemoryHandle_ = std::make_shared(newSize); + end_ = reinterpret_cast(sMemoryHandle_->getBuf()) + + sMemoryHandle_->getSize(); + sMatrix_ = NULL; + } else if (valueType != valueType_) { + sMatrix_ = NULL; + } else { + /* + * newNnz > elementCnt_ is necessary for the following condition: + * Firstly, height_ is 9 elementCnt_ is 56 + * Secondly, height_ is 11 elementCnt_ is 44 + * ==> height_ is bigger, sMatrix_ will resize, and total item is 44 now + * Then, height_ is 10 elementCnt_ is 52 + * ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail + */ + if ((ssize_t)((newHeight + 1) * sizeof(int)) > + ((char*)cols_ - (char*)rows_) || + newNnz > static_cast(sMatrix_->nnz)) { + sMatrix_ = NULL; + } else if (NO_VALUE == valueType) { + if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)cols_)) { + sMatrix_ = NULL; + } + } else { + if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)cols_) || + (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) { + sMatrix_ = NULL; + } + } + } + + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newNnz; + valueType_ = valueType; + format_ = SPARSE_CSR; + + if (sMatrix_ == NULL) { + sparseResizeCSR(); + } +} + +void GpuSparseMatrix::resizeCSC(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType) { + size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); + if (NO_VALUE != valueType) { + newSize += newNnz * sizeof(real); + } + + if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { + memoryHandle_ = std::make_shared(newSize); + data_ = reinterpret_cast(memoryHandle_->getBuf()); + sMemoryHandle_ = std::make_shared(newSize); + end_ = reinterpret_cast(sMemoryHandle_->getBuf()) + + sMemoryHandle_->getSize(); + sMatrix_ = NULL; + } else if (valueType != valueType_) { + sMatrix_ = NULL; + } else { + /* + * newNnz > elementCnt_ is necessary for the following condition: + * Firstly, height_ is 9 elementCnt_ is 56 + * Secondly, height_ is 11 elementCnt_ is 44 + * ==> height_ is bigger, sMatrix_ will resize, + * and total item is 44 now + * Then, height_ is 10 elementCnt_ is 52 + * ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail + */ + if ((ssize_t)((newWidth + 1) * sizeof(int)) > + ((char*)rows_ - (char*)cols_) || + newNnz > static_cast(sMatrix_->nnz)) { + sMatrix_ = NULL; + } else if (NO_VALUE == valueType) { + if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)rows_)) { + sMatrix_ = NULL; + } + } else { + if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)rows_) || + (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) { + sMatrix_ = NULL; + } + } + } + + height_ = 
newHeight; + width_ = newWidth; + elementCnt_ = newNnz; + valueType_ = valueType; + format_ = SPARSE_CSC; + + if (sMatrix_ == NULL) { + sparseResizeCSC(); + } +} + +void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { + resize(newHeight, newWidth, elementCnt_, valueType_, format_); +} + +MatrixPtr GpuSparseMatrix::getTranspose() { + CHECK(memoryHandle_.get() || sMatrix_) << "not supported"; + if (memoryHandle_.get()) { + MatrixPtr copy_T(new GpuSparseMatrix( + std::dynamic_pointer_cast(memoryHandle_), + sMatrix_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true, + sMemoryHandle_)); + return copy_T; + } else { + MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true, + sMemoryHandle_)); + return copy_T; + } +} + +void GpuSparseMatrix::copyRow(int offsets, + size_t colNum, + const sparse_non_value_t* row) { + memcpy(cols_ + offsets, row, sizeof(int) * colNum); +} + +void GpuSparseMatrix::copyRow(int offsets, + size_t colNum, + const sparse_float_value_t* row) { + for (size_t j = 0; j < colNum; j++) { + cols_[offsets + j] = row[j].col; + value_[offsets + j] = row[j].value; + } +} + +void GpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { + if (auto mat = dynamic_cast(&src)) { + copyFrom(*(const_cast(mat)), stream); + } else if (auto mat = dynamic_cast(&src)) { + copyFrom(*(const_cast(mat)), stream); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +void GpuSparseMatrix::copyFrom(const Matrix& src) { + copyFrom(src, HPPL_STREAM_1); + hl_stream_synchronize(HPPL_STREAM_1); +} + +template +void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + T* data, + hl_stream_t stream) { + CHECK_EQ(format_, SPARSE_CSR); + size_t nnz = 0; + for (size_t i = 0; i < height_; i++) { + int64_t id = ids[i]; + nnz += indices[id + 1] - indices[id]; + } + + resize(height_, + width_, + nnz, + sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE, + format_); + + rows_[0] = 0; + for (size_t i = 0; i < height_; i++) { + int64_t id = ids[i]; + size_t colNum = indices[id + 1] - indices[id]; + rows_[i + 1] = rows_[i] + colNum; + + T* row = data + indices[id]; + copyRow(rows_[i], colNum, row); + } + + sMatrix_->format = HL_SPARSE_CSR; + sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; + sMatrix_->rows = height_; + sMatrix_->cols = width_; + sMatrix_->nnz = nnz; + hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream); +} + +void GpuSparseMatrix::setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + CHECK_EQ(format_, SPARSE_CSR); + if (NO_VALUE == valueType_) { + CHECK_LT(row, height_); + CHECK(NULL != cols); + CHECK(NULL == values); + } else { + CHECK_LT(row, height_); + CHECK(NULL != cols); + CHECK(NULL != values); + } + if (0 == row) { + rows_[row] = 0; + } + rows_[row + 1] = rows_[row] + colNum; + + memcpy(cols_ + rows_[row], cols, sizeof(*cols) * colNum); + if (FLOAT_VALUE == valueType_) { + memcpy(value_ + rows_[row], values, sizeof(*values) * colNum); + } + + if (height_ - 1 == row) { + sMatrix_->format = HL_SPARSE_CSR; + sMatrix_->type = valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE; + sMatrix_->rows = height_; + sMatrix_->cols = width_; + sMatrix_->nnz = elementCnt_; + hl_memcpy_csr_matrix( + sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT); + } +} + +SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; } + +void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { + CHECK_EQ(format_, SPARSE_CSC); + int nnz = sMatrix_->nnz; + if (memAlloc) { + matTrans = std::make_shared( + width_, height_, nnz, valueType_, format_, false); + } else { + CHECK(matTrans != nullptr); + } + + CpuIVector rows(nnz); + CpuIVector cols(width_ + 1); + CpuIVector cols_full(nnz); + CpuVector value(nnz); + hl_stream_t stream = HPPL_STREAM_1; + hl_memcpy_from_csc_matrix(value.getData(), + nnz, + rows.getData(), + nnz, + cols.getData(), + width_ + 1, + sMatrix_.get(), + stream); + + hl_stream_synchronize(stream); + + /*for every non zero number, get its column index*/ + std::vector dataVec; + for (size_t i = 0; i < width_; i++) { + for (int j = cols.getData()[i]; j < cols.getData()[i + 1]; j++) { + cols_full.getData()[j] = i; + } + } + + /*sort row index and column index by the ascending order*/ + for (int i = 0; i < nnz; i++) { + dataVec.emplace_back( + rows.getData()[i], cols_full.getData()[i], value.getData()[i]); + } + std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) { + return a.row < b.row || (a.row == b.row && a.col < b.col); + }); + + /*get sorted data, row index, and col index, put them in the right place*/ + cols.resize(height_ + 1); + rows.resize(nnz); + value.resize(nnz); + + cols.getData()[0] = 0; + rows.getData()[0] = dataVec[0].col; + value.getData()[0] = dataVec[0].val; + for (int i = 1; i < nnz; i++) { + if (dataVec[i].row != dataVec[i - 1].row) { + for (int j = dataVec[i - 1].row + 1; j <= dataVec[i].row; j++) { + cols.getData()[j] = i; + } + } + rows.getData()[i] = dataVec[i].col; + value.getData()[i] = dataVec[i].val; + } + cols.getData()[height_] = nnz; + + /*copy back from cpu*/ + GpuSparseMatrixPtr dest = + std::dynamic_pointer_cast(matTrans); + hl_memcpy_csc_matrix((dest->sMatrix_).get(), + value.getData(), + rows.getData(), + cols.getData(), + stream); + hl_stream_synchronize(stream); +} + +void GpuSparseMatrix::mul(const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT) { + CHECK(a.useGpu_ && b.useGpu_) << "type not match"; + CHECK(!trans_) << "trans not supported"; + real* A_d = (real*)a.getData(); + real* B_d = (real*)b.getData(); + hl_sparse_matrix_s C_d = sMatrix_.get(); + hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N; + hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N; + + if (!a.trans_ && !b.trans_) { + CHECK(height_ == a.getHeight()); + CHECK(width_ == b.getWidth()); + CHECK(a.getWidth() == b.getHeight()); + } else if (a.trans_ && !b.trans_) { + CHECK(height_ == a.getWidth()); + CHECK(width_ == b.getWidth()); + CHECK(a.getHeight() == b.getHeight()); + } else if (!a.trans_ && b.trans_) { + CHECK(height_ == a.getHeight()); + CHECK(width_ == b.getHeight()); + CHECK(a.getWidth() == b.getWidth()); + } else { + LOG(INFO) << "Not support"; + } + int dimM = height_; + int dimN = width_; + int dimK = !b.trans_ ? 
b.getHeight() : b.getWidth(); + hl_sparse_matrix_mul( + A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT); +} + +void GpuSparseMatrix::mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + const auto a_ptr = dynamic_cast(&a); + const auto b_ptr = dynamic_cast(&b); + if (a_ptr && b_ptr) { + mul(*a_ptr, *b_ptr, scaleAB, scaleT); + } else { + LOG(FATAL) << "not supported"; + } +} + +template +void printBuf(std::ostream& os, T* a, size_t len, const char* name) { + os << "\n: " << name << " ["; + for (size_t i = 0; i < len; i++) { + os << a[i] << " "; + } + os << "]\n"; +} + +void GpuSparseMatrix::print(std::ostream& os) const { + if (format_ == SPARSE_CSC) { + int nnz = sMatrix_->nnz; + IVectorPtr rows = IVector::create(nnz, false); + IVectorPtr cols = IVector::create(width_ + 1, false); + VectorPtr value = Vector::create(nnz, false); + hl_stream_t stream = HPPL_STREAM_DEFAULT; + hl_memcpy_from_csc_matrix(value->getData(), + value->getSize(), + rows->getData(), + rows->getSize(), + cols->getData(), + cols->getSize(), + sMatrix_.get(), + stream); + hl_stream_synchronize(stream); + + printBuf(os, cols->getData(), width_ + 1, "col idx"); + printBuf(os, rows->getData(), elementCnt_, "row idx"); + printBuf(os, value->getData(), elementCnt_, "value"); + } +} + +void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) { + trans_ = src.trans_; + size_t nnz = src.getElementCnt(); + + resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); + // if have different value type, only copy rows and cols + SparseValueType vType = + valueType_ != src.getValueType() ? NO_VALUE : valueType_; + + sMatrix_->format = HL_SPARSE_CSR; + sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; + sMatrix_->rows = height_; + sMatrix_->cols = width_; + sMatrix_->nnz = nnz; + + hl_memcpy_csr_matrix(sMatrix_.get(), + vType == NO_VALUE ? NULL : src.getValue(), + src.getRows(), + src.getCols(), + stream); + + // restore type of sMatrix_ + sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; +} + +void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { + trans_ = src.trans_; + size_t nnz = src.getElementCnt(); + + resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); + + // if have different value type, only copy rows and cols + SparseValueType vType = + valueType_ != src.getValueType() ? NO_VALUE : valueType_; + + sMatrix_->format = HL_SPARSE_CSC; + sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; + sMatrix_->rows = height_; + sMatrix_->cols = width_; + sMatrix_->nnz = nnz; + + hl_memcpy_csc_matrix(sMatrix_.get(), + vType == NO_VALUE ? NULL : src.getValue(), + src.getRows(), + src.getCols(), + stream); + + // restore type of sMatrix_ + sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; +} + +void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) { + CHECK(trans_ == src.trans_); + CHECK(format_ == src.getFormat()); + resize(src.getHeight(), + src.getWidth(), + elementCnt_, + valueType_, + src.getFormat()); + + size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1; + size_t colSize = format_ == SPARSE_CSC ? 
width_ + 1 : elementCnt_; + + if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { + hl_memcpy_async( + getValue(), src.getValue(), sizeof(real) * elementCnt_, stream); + } + CHECK(getRows()); + CHECK(src.getRows()); + + hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream); + hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream); +} + +void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) { + if (format_ == SPARSE_CSR) { + copyFromCSR(src, stream); + } else { + copyFromCSC(src, stream); + } +} + +void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) { + trans_ = src.trans_; + int* srcCols = src.getCols(); + size_t nnz = std::count_if(srcCols, + srcCols + src.getElementCnt(), + [this](size_t n) { return n < this->width_; }); + resize(height_, width_, nnz, valueType_, format_); + + rows_[0] = 0; + size_t index = 0; + for (size_t r = 0; r < height_; ++r) { + for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) { + if (srcCols[i] < (int)width_) { + cols_[index] = srcCols[i]; + if (valueType_ == FLOAT_VALUE) { + value_[index] = src.getValue()[i]; + } + ++index; + } + } + rows_[r + 1] = index; + } + CHECK_EQ(index, nnz); + + sMatrix_->format = HL_SPARSE_CSR; + sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; + sMatrix_->rows = height_; + sMatrix_->cols = width_; + sMatrix_->nnz = nnz; + + hl_memcpy_csr_matrix(sMatrix_.get(), + valueType_ == NO_VALUE ? NULL : value_, + rows_, + cols_, + /*default stream = */ HPPL_STREAM_DEFAULT); +} + +void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) { + trans_ = src.trans_; + size_t nnz = src.getCols()[width_] - src.getCols()[0]; + resize(height_, width_, nnz, valueType_, format_); + + cols_[0] = 0; + for (size_t i = 0; i < width_; i++) { + cols_[i + 1] = cols_[i] + (int)(src.getRowNum(i)); + } + memcpy(rows_, src.getRows() + src.getCols()[0], sizeof(int) * nnz); + if (valueType_ == FLOAT_VALUE) { + memcpy(value_, src.getValue() + src.getCols()[0], sizeof(real) * nnz); + } + + sMatrix_->format = HL_SPARSE_CSC; + sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; + sMatrix_->rows = height_; + sMatrix_->cols = width_; + sMatrix_->nnz = nnz; + + hl_memcpy_csc_matrix(sMatrix_.get(), + valueType_ == NO_VALUE ? 
NULL : value_, + rows_, + cols_, + /*default stream = */ HPPL_STREAM_DEFAULT); +} + +void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { + if (format_ == SPARSE_CSR) { + trimFromCSR(src); + } else { + trimFromCSC(src); + } +} + +void GpuSparseMatrix::addBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + hl_sparse_matrix_s A_d = sMatrix_.get(); + hl_sparse_matrix_add_bias(A_d, b.getData(), scale); +} + +void GpuSparseMatrix::add3(GpuMatrix* b) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK(height_ == b->getHeight()); + CHECK(width_ == b->getWidth()); + real* B_d = b->getData(); + hl_sparse_matrix_s A_d = sMatrix_.get(); + hl_sparse_matrix_add_dense(A_d, B_d, height_, width_, 1, 0); +} + +void GpuSparseMatrix::add3(MatrixPtr b) { + if (dynamic_cast(b.get())) { + add3(dynamic_cast(b.get())); + } else { + LOG(FATAL) << "not supported"; + } +} + +void GpuSparseMatrix::zeroMem() { + CHECK(valueType_ == FLOAT_VALUE); + real* value = getValue(); + if (value == NULL) { + LOG(FATAL) << "value is nullptr"; + } + hl_matrix_zero_mem(value, elementCnt_); +} + +void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { +#ifdef PADDLE_WITH_CUDA + CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getHeight(); + size_t beam = maxVal.getWidth(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(format_, SPARSE_CSR) << "Only support SPARSE_CSR"; + + hl_sparse_matrix_top_k(maxVal.getData(), + maxVal.getStride(), + maxIds.getData(), + sMatrix_.get(), + beam, + numSamples); +#endif +} + +template void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + sparse_non_value_t* data, + hl_stream_t stream); +template void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + sparse_float_value_t* data, + hl_stream_t stream); +} // namespace paddle diff --git a/paddle/math/SparseMatrix.h b/paddle/legacy/math/SparseMatrix.h similarity index 100% rename from paddle/math/SparseMatrix.h rename to paddle/legacy/math/SparseMatrix.h diff --git a/paddle/legacy/math/SparseRowMatrix.cpp b/paddle/legacy/math/SparseRowMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..39bcdf22984db766283a3b4fbf56f224f730c5f8 --- /dev/null +++ b/paddle/legacy/math/SparseRowMatrix.cpp @@ -0,0 +1,282 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
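The resizeCSR()/resizeCSC() paths above pack the offset array, the index array, and the optional value array into one memory handle. A standalone sketch of that size rule (float stands in for real; the helper name is hypothetical):

```cpp
#include <cstddef>

// CSR: (height + 1) row offsets + nnz column indices [+ nnz values]
// CSC: (width  + 1) column offsets + nnz row indices [+ nnz values]
static size_t sparseBufferBytes(size_t height, size_t width, size_t nnz,
                                bool isCSR, bool hasValues) {
  size_t offsets = (isCSR ? height : width) + 1;
  size_t bytes = offsets * sizeof(int) + nnz * sizeof(int);
  if (hasValues) {
    bytes += nnz * sizeof(float);  // NO_VALUE matrices skip this term
  }
  return bytes;
}
```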
*/ + +#include "SparseRowMatrix.h" +#include "CpuSparseMatrix.h" + +#include + +#include "paddle/legacy/utils/Logging.h" + +#include "SIMDFunctions.h" + +#include "paddle/legacy/utils/Thread.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U; + +void SparseRowCpuMatrix::init(size_t height, size_t width) { + height_ = height; + if (!indexDictHandle_) { + indexDictHandle_.reset(new IndexDict); + indexDictHandle_->globalIndices.assign(height, kUnusedId_); + } + localIndices_ = &indexDictHandle_->localIndices; + globalIndices_ = indexDictHandle_->globalIndices.data(); +} + +void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CpuMatrix::mul(a, b, this, scaleAB, scaleT); +} + +void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) { + LOG(FATAL) << "This should not be called"; +} + +void SparseRowCpuMatrix::zeroMem() { + apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); }); + clearRows(); +} + +void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) { + apply([=](real* buf, size_t len) { + CpuVector value(0, nullptr); + value.subVecFrom(buf, 0, len); + value.applyL1(learningRate, decayRate); + }); +} + +void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, + IVector& t0, + real learningRate, + int currentTime, + real decayRate, + bool useL1, + bool fini) { + std::vector& localIndices = indexDictHandle_->localIndices; + + // t0 and value are vectors + CHECK_EQ(t0.getSize(), this->height_); + CHECK_EQ(value.width_, this->height_ * this->width_); + + if (decayRate == 0.0f) { + if (fini) { + return; + } + + for (size_t i = 0; i < localIndices.size(); ++i) { + real* g = getLocalRow(i); + real* v = value.rowBuf(localIndices[i]); + for (size_t j = 0; j < this->width_; ++j) { + v[j] -= learningRate * g[j]; + } + } + return; + } // else + + if (useL1) { // L1 decay + if (fini) { + for (size_t i = 0; i < this->height_; ++i) { + real* v = value.rowBuf(i); + int* t = t0.getData() + i; + if (t[0] < currentTime) { + // W(t0) -> W(t+1) + int tDiff = currentTime - t[0]; + real delta = tDiff * learningRate * decayRate; + simd::decayL1(v, v, delta, this->width_); + } + } + return; + } // else + + for (size_t i = 0; i < localIndices.size(); ++i) { + real* g = getLocalRow(i); + real* v = value.rowBuf(localIndices[i]); + int* t = t0.getData() + localIndices[i]; + if (t[0] < currentTime) { + // W(t0) -> W(t) + int tDiff = currentTime - t[0]; + real delta = tDiff * learningRate * decayRate; + simd::decayL1(v, v, delta, this->width_); + } + + // W(t) -> W(t+1) + for (size_t j = 0; j < this->width_; ++j) { + v[j] -= learningRate * g[j]; + } + simd::decayL1(v, v, learningRate * decayRate, this->width_); + + // state update to t+1 + t[0] = currentTime + 1; + } + + } else { // L2 decay + if (fini) { + for (size_t i = 0; i < this->height_; ++i) { + real* v = value.rowBuf(i); + int* t = t0.getData() + i; + if (t[0] < currentTime) { + // W(t0) -> W(t+1) + int tDiff = currentTime - t[0]; + real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate); + for (size_t j = 0; j < this->width_; ++j) { + v[j] *= recip; + } + } + } + return; + } // else + + real recipDecay = 1.0f / (1.0f + learningRate * decayRate); + + for (size_t i = 0; i < localIndices.size(); ++i) { + real* g = getLocalRow(i); + real* v = value.rowBuf(localIndices[i]); + int* t = t0.getData() + localIndices[i]; + if (t[0] < currentTime) { + // W(t0) -> W(t) + int tDiff = currentTime - t[0]; + real recip = 
1.0f / (1.0f + tDiff * learningRate * decayRate); + for (size_t j = 0; j < this->width_; ++j) { + v[j] *= recip; + } + } + + // W(t) -> W(t+1) + for (size_t j = 0; j < this->width_; ++j) { + v[j] = recipDecay * (v[j] - learningRate * g[j]); + } + + // state update to t+1 + t[0] = currentTime + 1; + } + } +} + +void SparseRowCpuMatrix::addTo(BaseMatrix& dest, + std::vector& ids, + size_t tid, + size_t numThreads) { + CHECK(!dest.useGpu_); + CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); + + std::vector& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < localIndices.size(); ++i) { + uint32_t id = localIndices[i]; + if (id % numThreads == tid) { + simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_); + ids.push_back(id); + } + } +} + +void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, + size_t tid, + size_t numThreads) { + CHECK(!dest.useGpu_); + CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); + + std::vector& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < localIndices.size(); ++i) { + uint32_t id = localIndices[i]; + if (id % numThreads == tid) { + dest.checkIndex(id); + simd::addTo(dest.getRow(id), getLocalRow(i), this->width_); + } + } +} + +void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) { + std::vector& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < localIndices.size(); ++i) { + uint32_t id = localIndices[i]; + if (id % numThreads == tid) { + memset(this->getLocalRow(i), 0, this->width_ * sizeof(real)); + } + } +} + +void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CpuMatrix::mul( + a, b, this, scaleAB, scaleT); +} + +void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CpuMatrix::mul(a, b, this, scaleAB, scaleT); +} + +void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { + std::vector& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < len; i++) { + CHECK_LT(*(ids + i), this->getHeight()) + << "id:" << *(ids + i) << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused invalid input data samples"; + } + localIndices.insert(localIndices.end(), ids, ids + len); +} + +void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) { + CpuSparseMatrix* mat = dynamic_cast(input.get()); + CHECK(mat) << "only support sparse matrix"; + addRows(reinterpret_cast(mat->getCols()), + mat->getElementCnt()); +} + +void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { + std::vector& localIndices = indexDictHandle_->localIndices; + size_t numSamples = ids->getSize(); + int* index = ids->getData(); + for (size_t i = 0; i < numSamples; ++i) { + if (index[i] == -1) continue; + + unsigned int id = (unsigned int)index[i]; + CHECK_LT(id, this->getHeight()) + << "id:" << id << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused invalid input data samples"; + localIndices.push_back(id); + } +} + +void SparsePrefetchRowCpuMatrix::setupIndices() { + auto& localIndices = indexDictHandle_->localIndices; + uniqueIds(localIndices); + // for each sparse row + for (size_t id = 0; id < localIndices.size(); ++id) { + globalIndices_[localIndices[id]] = id; // sparse row -> local id + } + checkStoreSize(); +} + +void SparseRowCpuMatrix::checkIndices() { + std::vector& localIndices = 
indexDictHandle_->localIndices; + for (size_t i = 0; i < localIndices.size(); ++i) { + CHECK_EQ(globalIndices_[localIndices[i]], i); + } + checkStoreSize(); +} + +} // namespace paddle diff --git a/paddle/legacy/math/SparseRowMatrix.h b/paddle/legacy/math/SparseRowMatrix.h new file mode 100644 index 0000000000000000000000000000000000000000..e206747a41c9f3a0f058bf3b0a94472bf4b2c349 --- /dev/null +++ b/paddle/legacy/math/SparseRowMatrix.h @@ -0,0 +1,341 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_MOBILE_INFERENCE + +#include +#include +#include +#include "Matrix.h" +#include "RowBuffer.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +/** + * Sparse Row + */ +class SparseRowCpuMatrix : public CpuMatrix { + public: + struct IndexDict { + // In the following, global id means the row id in the original matrix. + // Local id means the row id in the local storage which only contains + // the sparse rows. + std::vector localIndices; // local id -> global id + std::vector globalIndices; // global id -> local id + }; + typedef std::shared_ptr IndexDictPtr; + + /// heightStore is max number of rows of the sparse matrix. + SparseRowCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) + : CpuMatrix(nullptr, height, width, trans), + indexDictHandle_(indexDictHandle) { + init(height, width); + buf_.reset(new RowBuffer(dataHandle, width)); + } + + virtual ~SparseRowCpuMatrix() {} + + public: + /** + * Get the row buf + * + * @param row row id in the original matrix + */ + real* getRow(size_t row) { + CHECK_NE(globalIndices_[row], kUnusedId_); + return getLocalRow(globalIndices_[row]); + } + + /** + * Get the row buf + * + * @param row row id in local storage + */ + real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); } + + /** + * reserve the storage for rows according to current size of + * indexDictHandle. + * + * This is only used when SparseRowCpuMatrix is constructed with + * indexDictHandle. + */ + void reserveStore() { buf_->resize(localIndices_->size()); } + + // row is the row id in the original matrix + virtual real* getRowBuf(size_t row) { return getRow(row); } + + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + /** + * Fill data according to row indexs added, setup indices inside. + * + * *src* and *size* are data and size of normal dense CpuMatrix. + */ + virtual void copyFrom(const real* src, size_t size); + virtual void zeroMem(); + + /** + * apply L1 to all sparse rows, should be apply after indices ready. + */ + virtual void applyL1(real learningRate, real decayRate); + + void clearIndices() { clearRows(); } + void zeroMemThread(size_t tid, size_t numThreads); + + /** + * value -= grad * learningRate, this is gradient. + * + * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall. 
+ * + * t0 is a int vector used by L1/L2 decay, size = height of parameter + * matrix, + * store the time that each weight row last updated. + * + * Time is batchId, currentTime is current batchId. + * + * While pass finished, caller should call this func one more time + * with (fini=true) to let weight decay catch up current time. + */ + void sgdUpdate(BaseMatrix& value, + IVector& t0, + real learningRate, + int currentTime, + real decayRate, + bool useL1, + bool fini = false); + + /** + * merge rows in *this* to *dest* for designated thread + * + * values add to *dest* matrix + * + * ids occured in *this* append to *ids* + * filtered by (id % numThreads == tid) + */ + void addTo(BaseMatrix& dest, + std::vector& ids, + size_t tid, + size_t numThreads); + + /** + * the second version addTo(), *dest* is a SparseRowCpuMatrix. + * + * The dest's indices should be setup already, addTo() will + * check src ids is exist in dest's indices. + */ + void addTo(SparseRowCpuMatrix& dest, size_t tid, size_t numThreads); + + const IndexDictPtr& getIndexDictHandle() const { return indexDictHandle_; } + + /** + * check all local and global indices consistency + */ + void checkIndices(); + /** + * check whether row *i* exist in indices + */ + void checkIndex(size_t i) { + size_t localId = globalIndices_[i]; + CHECK_LT(localId, localIndices_->size()); + CHECK_EQ((*localIndices_)[localId], i); + } + + std::vector& getLocalIndices() const { + return indexDictHandle_->localIndices; + } + + protected: + template + void apply(Func f) { + f(buf_->data(), localIndices_->size() * width_); + } + + void init(size_t height, size_t width); + + /// clear row indices. + void clearRows() { + for (auto id : *localIndices_) { + globalIndices_[id] = kUnusedId_; + } + localIndices_->clear(); + buf_->clear(); + } + + inline void checkStoreSize() { + if (buf_->isAutoGrowth()) { + if (buf_->getRowCount() > 0.5 * height_) { + LOG(WARNING) << "There are more than 0.5*height (" + << localIndices_->size() << ") rows are used for sparse " + << "update, which is not efficient. Considering not use " + << "sparse_update."; + } + } else { + CHECK_LE(localIndices_->size(), buf_->getRowCount()); + } + } + + std::unique_ptr buf_; + IndexDictPtr indexDictHandle_; + std::vector* localIndices_; // =&indexDictHandle_->localIndices + unsigned int* globalIndices_; // =indexDictHandle_->globalIndices.data(); + static const unsigned int kUnusedId_; +}; + +class SyncThreadPool; + +/// For prefetching parameters from remote Parameter server +class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix { + public: + SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + SyncThreadPool* pool = nullptr, + bool trans = false) + : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans), + pool_(pool) {} + + /** + * Extract feature ids from *input*, to fill row indexs. + * + * *input* must be sparse matrix. + * + * Can call many times before setup. + */ + void addRows(MatrixPtr input); + void addRows(IVectorPtr ids); + + /** + * setup global indices of SparseRowMatrix after finish add rows. 
+ */ + void setupIndices(); + + protected: + void addRows(const unsigned int* ids, size_t len); + SyncThreadPool* pool_; +}; + +class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix { + public: + SparseAutoGrowRowCpuMatrix(size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) + : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {} + + real* getRow(size_t row) { + auto id = globalIndices_[row]; + if (id == kUnusedId_) { + id = globalIndices_[row] = localIndices_->size(); + localIndices_->push_back(row); + checkStoreSize(); + } + return getLocalRow(id); + } + + virtual real* getRowBuf(size_t row) { return getRow(row); } + + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); +}; + +class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix { + public: + CacheRowCpuMatrix(size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) + : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans), + sourceData_(nullptr) {} + + void setSourceData(CpuVectorPtr sourceVec) { + sourceDataVec_ = sourceVec; + sourceData_ = sourceVec->getData(); + } + + real* getRow(size_t row) { + auto id = globalIndices_[row]; + if (id == kUnusedId_) { + id = globalIndices_[row] = localIndices_->size(); + localIndices_->push_back(row); + checkStoreSize(); + memcpy( + getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_); + } + return getLocalRow(id); + } + + virtual real* getRowBuf(size_t row) { return getRow(row); } + + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + public: + CpuVectorPtr sourceDataVec_; + real* sourceData_; +}; + +/** + * Sparse Row Ids Matrix. + * + * mostly same as CpuMatrix, but maintain sparse row ids occured, + * ids are hashed by worker thread id. + */ +class SparseRowIdsCpuMatrix : public CpuMatrix { + public: + SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : CpuMatrix(dataHandle, height, width, trans) {} + + void setNumOfThreads(size_t numOfThreads) { idsArray_.resize(numOfThreads); } + + std::vector& getIds(size_t threadId) { return idsArray_[threadId]; } + + private: + std::vector> idsArray_; +}; + +} // namespace paddle + +#else +namespace paddle { + +class SparseRowCpuMatrix : public CpuMatrix { + public: + void reserveStore() {} + void clearIndices() {} +}; + +class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix { + public: + void setupIndices() {} + void addRows(MatrixPtr input) {} + void addRows(IVectorPtr ids) {} +}; + +class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {}; +class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {}; +class SparseRowIdsCpuMatrix : public CpuMatrix {}; + +} // namespace paddle + +#endif diff --git a/paddle/legacy/math/Storage.cpp b/paddle/legacy/math/Storage.cpp new file mode 100644 index 0000000000000000000000000000000000000000..65d53aeaa926690c7fe9e6fcac7affdfb68fede9 --- /dev/null +++ b/paddle/legacy/math/Storage.cpp @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
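The sgdUpdate() implementation in SparseRowMatrix.cpp above only visits rows that received gradients and uses t0 to let untouched rows catch up on weight decay later. A standalone sketch of the L2 catch-up step for one row (the L1 branch instead calls simd::decayL1 with delta = tDiff * learningRate * decayRate; the function name here is hypothetical):

```cpp
#include <cstddef>

// Bring a row forward from its last update time t0 to currentTime before the
// current batch's gradient is applied, mirroring the L2 branch of sgdUpdate().
static void catchUpL2(float* row, size_t width, int t0, int currentTime,
                      float learningRate, float decayRate) {
  int tDiff = currentTime - t0;
  if (tDiff <= 0) return;
  // A single multiplicative factor stands in for tDiff per-batch shrink steps.
  float recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
  for (size_t j = 0; j < width; ++j) {
    row[j] *= recip;
  }
}
```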
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Storage.h" +#include "Allocator.h" +#include "paddle/legacy/utils/StringUtil.h" +#include "paddle/legacy/utils/Util.h" + +#ifndef PADDLE_MOBILE_INFERENCE +DEFINE_int32(pool_limit_size, + 536870912, + "maximum memory size managed by a memory pool, default is 512M"); +#else +DEFINE_int32(pool_limit_size, 0, "default is 0"); +#endif + +namespace paddle { + +// Initialization StorageEngine singleton. +// Other modules may rely on storage management, +// so StorageEngine need to be initialized before other modules. +static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, + std::numeric_limits::max()); + +StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} + +StorageEngine::~StorageEngine() { + delete cpuAllocator_; + for (auto it : gpuAllocator_) { + delete it; + } +} + +StorageEngine* StorageEngine::singleton() { + static StorageEngine storage; + return &storage; +} + +PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { + { + // if gpuAllocator_ has been constructed + ReadLockGuard guard(lock_); + if (deviceId < static_cast(gpuAllocator_.size()) && + (gpuAllocator_[deviceId] != nullptr)) { + return gpuAllocator_[deviceId]; + } + } + + { + // Construct gpuAllocator_ + std::lock_guard guard(lock_); + if (deviceId >= static_cast(gpuAllocator_.size())) { + gpuAllocator_.resize(deviceId + 1); + } + if (gpuAllocator_[deviceId] == nullptr) { + std::string name = + "gpu" + str::to_string(deviceId) + std::string("_pool"); + gpuAllocator_[deviceId] = + new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); + } + return gpuAllocator_[deviceId]; + } +} + +PoolAllocator* StorageEngine::getCpuAllocator() { + { + // if cpuAllocator_ has been constructed + ReadLockGuard guard(lock_); + if (cpuAllocator_ != nullptr) { + return cpuAllocator_; + } + } + + { + // Construct cpuAllocator_ + std::lock_guard guard(lock_); + if (cpuAllocator_ == nullptr) { + if (FLAGS_use_gpu) { + cpuAllocator_ = new PoolAllocator( + new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool"); + } else { + cpuAllocator_ = new PoolAllocator( + new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool"); + } + } + return cpuAllocator_; + } +} + +} // namespace paddle diff --git a/paddle/legacy/math/Storage.h b/paddle/legacy/math/Storage.h new file mode 100644 index 0000000000000000000000000000000000000000..bd22dde2c85be5ba432cb3a259211c1900a17b6c --- /dev/null +++ b/paddle/legacy/math/Storage.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
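getGpuAllocator()/getCpuAllocator() above take a read lock for the common already-constructed case and only fall back to an exclusive lock to construct the allocator. A simplified stand-in for that pattern using the standard library (PoolAllocator and paddle's RWLock are replaced by hypothetical types):

```cpp
#include <mutex>
#include <shared_mutex>  // std::shared_timed_mutex / std::shared_lock (C++14)

struct Allocator {};  // stand-in for PoolAllocator

class AllocatorRegistry {
 public:
  Allocator* get() {
    {
      std::shared_lock<std::shared_timed_mutex> guard(lock_);  // readers
      if (allocator_ != nullptr) return allocator_;
    }
    std::unique_lock<std::shared_timed_mutex> guard(lock_);    // writer
    if (allocator_ == nullptr) {
      allocator_ = new Allocator();  // re-check under the exclusive lock
    }
    return allocator_;
  }

 private:
  std::shared_timed_mutex lock_;
  Allocator* allocator_ = nullptr;
};
```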
*/ + +#pragma once + +#include +#include +#include "PoolAllocator.h" +#include "paddle/legacy/utils/Locks.h" + +namespace paddle { + +/** + * @brief Storage manager for multiple devices. + */ +class StorageEngine { + public: + /** + * @return Storage singleton + */ + static StorageEngine* singleton(); + + /** + * @return return one gpu allocator by deviceId + */ + PoolAllocator* getGpuAllocator(int deviceId); + + /** + * @return return cpu allocator + */ + PoolAllocator* getCpuAllocator(); + + protected: + StorageEngine(); + ~StorageEngine(); + RWLock lock_; + std::vector gpuAllocator_; + PoolAllocator* cpuAllocator_; +}; + +} // namespace paddle diff --git a/paddle/math/TensorApply.h b/paddle/legacy/math/TensorApply.h similarity index 100% rename from paddle/math/TensorApply.h rename to paddle/legacy/math/TensorApply.h diff --git a/paddle/legacy/math/TensorAssign.h b/paddle/legacy/math/TensorAssign.h new file mode 100644 index 0000000000000000000000000000000000000000..efbfce6c4f88197f18285e3679698b8bbb1ed3b8 --- /dev/null +++ b/paddle/legacy/math/TensorAssign.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +/** + * \brief Tensor Assign Expression(return by lazyAssign, + * and evaluated by AssignEvaluate) + */ +template +class TensorAssignOp { + public: + explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs) + : lhs_(lhs), rhs_(rhs) { +#ifndef __CUDA_ARCH__ + CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); + CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); + CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); +#endif + } + + INLINE void apply(const int i, const int j) { + lhs_.applyRef(i, j) = rhs_.apply(i, j); + } + INLINE void apply(const int index) { + lhs_.applyRef(index) = rhs_.apply(index); + } + + INLINE size_t getWidth() const { return lhs_.getWidth(); } + INLINE size_t getHeight() const { return rhs_.getHeight(); } + INLINE bool isContiguous() const { + return lhs_.isContiguous() && rhs_.isContiguous(); + } + INLINE bool useGpu() const { return lhs_.useGpu(); } + + private: + TensorApply lhs_; + TensorApply rhs_; +}; + +template +void AssignCpuEvaluate(int height, + int width, + bool isContiguous, + Assign&& assign, + AssignOp&&... args) { + if (isContiguous) { + int size = height * width; + for (int index = 0; index < size; index++) { + assign.apply(index); + __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...}; + } + } else { + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + assign.apply(i, j); + __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...}; + } + } + } +} + +#ifdef __NVCC__ +template +__global__ void AssignGpuEvaluate1(const int border, + Assign assign, + AssignOp... 
args) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + assign.apply(idx); + __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...}; + } +} + +template +__global__ void AssignGpuEvaluate2(const int height, + const int width, + Assign assign, + AssignOp... args) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) { + assign.apply(i, j); + __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...}; + } + } +} +#endif + +/** + * \brief Evaluate one or more TensorAssignOp objects. + * + * \note At least one assignment expression is required + */ +template +void AssignEvaluate(Assign&& assign, AssignOp&&... args) { + const bool useGpu_ = assign.useGpu(); + bool isContiguous_ = assign.isContiguous(); + const size_t height = assign.getHeight(); + const size_t width = assign.getWidth(); + + const int packSize = sizeof...(args); + const bool packUseGpu[] = {((args)).useGpu()...}; + const bool packIsContiguous[] = {((args)).isContiguous()...}; + const size_t packHeight[] = {((args)).getHeight()...}; + const size_t packWidth[] = {((args)).getWidth()...}; + + for (int i = 0; i < packSize; i++) { + CHECK_EQ(useGpu_, packUseGpu[i]); + CHECK_EQ(height, packHeight[i]); + CHECK_EQ(width, packWidth[i]); + isContiguous_ = isContiguous_ && packIsContiguous[i]; + } + + if (useGpu_) { +#ifdef __NVCC__ + if (isContiguous_) { + int size = height * width; + int blockSize = size <= 1024 ? size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + AssignGpuEvaluate1<<>>( + size, assign, args...); + } else { + int blockSizeY = std::min(32, (int)height); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + AssignGpuEvaluate2<<>>( + height, width, assign, args...); + } + + CHECK_SYNC("AssignEvaluate failed"); +#endif + } else { + AssignCpuEvaluate(height, width, isContiguous_, assign, args...); + } +} + +} // namespace paddle diff --git a/paddle/legacy/math/TensorEvaluate.h b/paddle/legacy/math/TensorEvaluate.h new file mode 100644 index 0000000000000000000000000000000000000000..3029dd35fb05c893f99cde0689f816f4257f21c4 --- /dev/null +++ b/paddle/legacy/math/TensorEvaluate.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "hl_base.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +/** + * \brief The tensor cpu evaluate api. 
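AssignCpuEvaluate() above relies on the C++11 pack-expansion-into-a-dummy-array idiom to run every extra assignment expression at the same index inside a single loop. A standalone sketch of that idiom (the struct and function names are hypothetical):

```cpp
#include <cstdio>

struct Assign {
  float* dst;
  const float* src;
  void apply(int i) { dst[i] = src[i]; }
};

template <typename... Ops>
void applyAll(int n, Ops&&... ops) {
  for (int i = 0; i < n; ++i) {
    // Expands to {(op0.apply(i), 0), (op1.apply(i), 0), ...}; the array exists
    // only to sequence the calls (pre-C++17, no fold expressions).
    __attribute__((unused)) int dummy[] = {((ops).apply(i), 0)...};
  }
}

int main() {
  float a[3] = {1, 2, 3}, b[3] = {0}, c[3] = {0};
  applyAll(3, Assign{b, a}, Assign{c, a});  // two assignments, one pass over i
  std::printf("%g %g\n", b[2], c[2]);       // prints: 3 3
  return 0;
}
```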
+ */ +template +inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) { + TensorApply lhs_(lhs); + TensorApply rhs_(rhs); + CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); + CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); + CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); + + int height = lhs_.getHeight(); + int width = lhs_.getWidth(); + if (lhs_.isContiguous() && rhs_.isContiguous()) { + int size = height * width; + for (int index = 0; index < size; index++) { + lhs_.applyRef(index) = rhs_.apply(index); + } + } else { + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + lhs_.applyRef(i, j) = rhs_.apply(i, j); + } + } + } +} + +#ifdef __NVCC__ +template +__global__ void TensorElementWiseOp(LeftType lhs, + RightType rhs, + const int border) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + lhs.applyRef(idx) = rhs.apply(idx); + } +} + +template +__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) { + lhs.applyRef(i, j) = rhs.apply(i, j); + } + } +} + +/** + * \brief The tensor gpu evaluate api. + */ +template +inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) { + TensorApply lhs_(lhs); + TensorApply rhs_(rhs); + CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); + CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); + CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); + + int dimM = lhs_.getHeight(); + int dimN = lhs_.getWidth(); + + if (lhs_.isContiguous() && rhs_.isContiguous()) { + int size = dimM * dimN; + int blockSize = size <= 1024 ? size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + TensorElementWiseOp<<>>( + lhs_, rhs_, size); + } else { + int blockSizeY = std::min(32, dimM); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + TensorElementWiseOp<<>>(lhs_, rhs_); + } + + CHECK_SYNC("TensorGpuApply failed"); +} +#else +template +inline void TensorGpuApply(LeftType& lhs, RightType& rhs) { + LOG(FATAL) << "Since it is gcc compiled, " + "this calculation does not support GPU implementation."; +} +#endif + +} // namespace paddle diff --git a/paddle/legacy/math/TensorExpression.h b/paddle/legacy/math/TensorExpression.h new file mode 100644 index 0000000000000000000000000000000000000000..1c6cf07831487165445a3f59931c4ca9196375b9 --- /dev/null +++ b/paddle/legacy/math/TensorExpression.h @@ -0,0 +1,446 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
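TensorGpuApply() above derives its launch geometry from the matrix shape: contiguous tensors get a 1-D launch over height * width, everything else a capped 2-D grid that strides over the remainder. A host-side sketch of the 2-D sizing rule (plain C++, no kernel launch; the struct is hypothetical):

```cpp
#include <algorithm>

struct Launch {
  int blockX, blockY, gridX, gridY;
};

// Mirrors the non-contiguous branch of TensorGpuApply(): at most 32 rows per
// block, a grid capped at 32x32 blocks, kernel loops cover what remains.
static Launch launchFor2d(int dimM, int dimN) {
  Launch l;
  l.blockY = std::min(32, dimM);
  l.blockX = (32 / l.blockY) * 32;
  l.gridX = std::min(32, (dimN + l.blockX - 1) / l.blockX);
  l.gridY = std::min(32, (dimM + l.blockY - 1) / l.blockY);
  return l;
}
```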
*/ + +#pragma once +#include +#include +#include "hl_tensor_ops.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +template +class TensorConstant; +template +class TensorUnaryOp; +template +class TensorBinaryOp; +template +class TensorTernaryOp; + +template +class TensorAssignOp; + +/** + * \brief Tensor base class. + * + * This is the base class of all Tensor and Expression class. + */ +template +class TensorExpression { + public: + /** + * Element wise unary expression. + */ + template + const TensorUnaryOp unaryExpression( + const UnaryOp& op) const { + return TensorUnaryOp(op, derived()); + } + + const TensorUnaryOp, const Derived, T> operator+( + T p) const { + return unaryExpression(hppl::unary::add_scale(p)); + } + + const TensorUnaryOp, const Derived, T> operator-( + T p) const { + return unaryExpression(hppl::unary::sub_scale(p)); + } + + const TensorUnaryOp, const Derived, T> operator*( + T p) const { + return unaryExpression(hppl::unary::mul_scale(p)); + } + + const TensorUnaryOp, const Derived, T> operator/( + T p) const { + return unaryExpression(hppl::unary::div_scale(p)); + } + + const TensorUnaryOp, const Derived, T> operator-() const { + return unaryExpression(hppl::unary::neg()); + } + + const TensorUnaryOp, const Derived, T> exp() const { + return unaryExpression(hppl::unary::exp_op()); + } + + const TensorUnaryOp, const Derived, T> log() const { + return unaryExpression(hppl::unary::log_op()); + } + + const TensorUnaryOp, const Derived, T> sqrt() const { + return unaryExpression(hppl::unary::sqrt_op()); + } + + const TensorUnaryOp, const Derived, T> square() const { + return unaryExpression(hppl::unary::square()); + } + + const TensorUnaryOp, const Derived, T> reciprocal() + const { + return unaryExpression(hppl::unary::reciprocal()); + } + + const TensorUnaryOp, const Derived, T> abs() const { + return unaryExpression(hppl::unary::abs()); + } + + const TensorUnaryOp, const Derived, T> sign() const { + return unaryExpression(hppl::unary::sign()); + } + + const TensorUnaryOp, const Derived, T> pow(T p) const { + return unaryExpression(hppl::unary::pow_op(p)); + } + + const TensorUnaryOp, const Derived, T> min(T p) const { + return unaryExpression(hppl::unary::min(p)); + } + + const TensorUnaryOp, const Derived, T> max(T p) const { + return unaryExpression(hppl::unary::max(p)); + } + + const TensorUnaryOp, const Derived, T> operator==( + T p) const { + return unaryExpression(hppl::unary::cmp_eq(p)); + } + + const TensorUnaryOp, const Derived, T> operator!=( + T p) const { + return unaryExpression(hppl::unary::cmp_ne(p)); + } + + const TensorUnaryOp, const Derived, T> operator<=( + T p) const { + return unaryExpression(hppl::unary::cmp_le(p)); + } + + const TensorUnaryOp, const Derived, T> operator<( + T p) const { + return unaryExpression(hppl::unary::cmp_lt(p)); + } + + const TensorUnaryOp, const Derived, T> operator>=( + T p) const { + return unaryExpression(hppl::unary::cmp_ge(p)); + } + + const TensorUnaryOp, const Derived, T> operator>( + T p) const { + return unaryExpression(hppl::unary::cmp_gt(p)); + } + + const TensorUnaryOp, const Derived, T> operator&&( + T p) const { + return unaryExpression(hppl::unary::and_op(p)); + } + + const TensorUnaryOp, const Derived, T> operator||( + T p) const { + return unaryExpression(hppl::unary::or_op(p)); + } + + /** + * Element wise binary expression. 
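+ *
+ * Illustrative example (added for clarity): the operators below only build
+ * an expression object; evaluation happens element-wise when the expression
+ * is assigned to a tensor, e.g.
+ *
+ *   c = a * b + a.square();   // element-wise multiply, square and add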
+ */ + template + const TensorBinaryOp + binaryExpression(const BinaryOp& op, const ExpressionType& expr) const { + return TensorBinaryOp( + op, derived(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator==(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_eq(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator!=(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_ne(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator<=(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_le(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator<(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_lt(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator>=(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_ge(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator>(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_gt(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator&&(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::and_op(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator||(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::or_op(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator+(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::add(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator-(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::sub(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator*(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::mul(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + operator/(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::div(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + min(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::min(), expr); + } + + template + const TensorBinaryOp, + const Derived, + const ExpressionType, + T> + max(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::max(), expr); + } + + /** + * Element wise ternary expression. + * + * ternary conditional operator(?: operator). + * The conditional expression returns one of two values depending on + * the result of derived expression. + * If derived expression evaluates to true, then expression1 is evaluated. + * If derived expression evaluates to false, then expression2 is evaluated. 
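+ *
+ * Illustrative example (added for clarity): an element-wise max can be
+ * expressed with condition(),
+ *
+ *   u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
+ *   // i.e. u_t = max(beta2 * u_{t-1}, |g_t|), as in the AdaMax update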
+ */ + template + const TensorTernaryOp + condition(const ExprType1& expr1, const ExprType2& expr2) const { + return TensorTernaryOp( + derived(), expr1, expr2); + } + + template + const TensorTernaryOp< + const Derived, + const TensorConstant, const Derived, T>, + const ExprType, + T> + condition(T p, const ExprType& expr) const { + return condition(constant(p), expr); + } + + template + const TensorTernaryOp< + const Derived, + const ExprType, + const TensorConstant, const Derived, T>, + T> + condition(const ExprType& expr, T p) const { + return condition(expr, constant(p)); + } + + const TensorTernaryOp< + const Derived, + const TensorConstant, const Derived, T>, + const TensorConstant, const Derived, T>, + T> + condition(T p1, T p2) const { + return condition(constant(p1), constant(p2)); + } + + /** + * return a TensorConstant. A TensorConstant object hold a constant value. + */ + const TensorConstant, const Derived, T> constant( + T p) const { + return TensorConstant, const Derived, T>( + hppl::unary::constant(p), derived()); + } + + /** + * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more + * TensorAssignOp objects. + */ + template + TensorAssignOp lazyAssign( + const ExpressionType& expr) const { + return TensorAssignOp(derived(), expr); + } + + protected: + const Derived& derived() const { return *static_cast(this); } +}; + +/** + * \brief Unary Operator Expression + */ +template +class TensorUnaryOp + : public TensorExpression, T> { + public: + explicit TensorUnaryOp(const OP op, const ExprType& expr) + : op_(op), expr_(expr) {} + + const OP op_; + const ExprType expr_; +}; + +/** + * \brief Binary Operator Expression + */ +template +class TensorBinaryOp + : public TensorExpression, T> { + public: + explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs) + : op_(op), lhs_(lhs), rhs_(rhs) {} + + const OP op_; + const LhsType lhs_; + const RhsType rhs_; +}; + +/** + * \brief Ternary Operator Expression + */ +template +class TensorTernaryOp : public TensorExpression< + TensorTernaryOp, + T> { + public: + explicit TensorTernaryOp(const ExprType1& expr1, + const ExprType2& expr2, + const ExprType3& expr3) + : expr1_(expr1), expr2_(expr2), expr3_(expr3) {} + + const ExprType1 expr1_; + const ExprType2 expr2_; + const ExprType3 expr3_; +}; + +/** + * \brief Constant Expression + */ +template +class TensorConstant + : public TensorExpression, T> { + public: + explicit TensorConstant(const OP op, const ExprType& expr) + : op_(op), expr_(expr) {} + + const OP op_; + const ExprType expr_; +}; + +/** + * \brief operator+ overload + * \return a unary operator expression + */ +template +const TensorUnaryOp, const Derived, T> operator+( + T p, const TensorExpression& expr) { + return expr + p; +} + +/** + * \brief operator* overload + * \return a unary operator expression + */ +template +const TensorUnaryOp, const Derived, T> operator*( + T p, const TensorExpression& expr) { + return expr * p; +} + +} // namespace paddle + +#include "TensorApply.h" +#include "TensorEvaluate.h" diff --git a/paddle/legacy/math/TrainingAlgorithmOp.cu b/paddle/legacy/math/TrainingAlgorithmOp.cu new file mode 100644 index 0000000000000000000000000000000000000000..9e1eaa0f45ae94d12cf7763bbaff632fc473bcc8 --- /dev/null +++ b/paddle/legacy/math/TrainingAlgorithmOp.cu @@ -0,0 +1,356 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "BaseMatrix.h" +#include "TrainingAlgorithmOp.h" +#include "paddle/legacy/utils/Logging.h" + +#if __cplusplus > 199711L + +#include "TensorAssign.h" + +namespace paddle { + +void sparseMomentumApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& momU, + BaseMatrix& momV, + real alpha, + real beta, + real gamma, + real tau, + real learningRate) { + auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); + auto expr2 = + momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad); + auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU + + ((real)1 / beta) * momV); + + AssignEvaluate(expr1, expr2, expr3); +} + +void adadeltaApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& accum_update, + BaseMatrix& lr, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); + auto expr2 = + lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt()); + auto expr3 = accum_update.lazyAssign(rou * accum_update + + ((real)1 - rou) * (grad * lr).square()); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); + auto expr5 = value.lazyAssign(value + mom); + + AssignEvaluate(expr1, expr2, expr3, expr4, expr5); +} + +void adagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum_buffer, + BaseMatrix& accum, + BaseMatrix& lr, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + auto expr1 = accum.lazyAssign(accum + grad.square()); + auto expr2 = + lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal()); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); + auto expr4 = value.lazyAssign(value + mom); + + AssignEvaluate(expr1, expr2, expr3, expr4); +} + +void rmspropApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& g, + BaseMatrix& f, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); + auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); + auto expr4 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); + auto expr5 = value.lazyAssign(value + mom); + + if (firstTime) { + auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square()); + + AssignEvaluate(expr1, expr2, expr3, expr4, expr5); + } else { + auto expr1 = + g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square()); + + AssignEvaluate(expr1, expr2, expr3, expr4, expr5); + } +} + +void decayedAdagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real 
decayRate, + bool firstTime) { + auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); + auto expr3 = mom.lazyAssign(mom * momentum - + learningRate * lr * (grad + value * decayRate)); + auto expr4 = value.lazyAssign(value + mom); + + if (firstTime) { + auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square()); + + AssignEvaluate(expr1, expr2, expr3, expr4); + } else { + auto expr1 = accum.lazyAssign(accumulatedRou * accum + + ((real)1 - rou) * grad.square()); + + AssignEvaluate(expr1, expr2, expr3, expr4); + } +} + +void adamApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& v, // second moment + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate) { + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + + auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); + auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); + auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon)); + + AssignEvaluate(expr1, expr2, expr3); +} + +void adamaxApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& u, // weighted infinity norm + real beta1, + real beta2, + int64_t step, + real alpha) { + auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); + auto expr2 = + u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); + auto expr3 = value.lazyAssign( + value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); + + AssignEvaluate(expr1, expr2, expr3); +} + +} // namespace paddle + +#else + +namespace paddle { + +void sparseMomentumApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& momU, + BaseMatrix& momV, + real alpha, + real beta, + real gamma, + real tau, + real learningRate) { + /** + * \alpha_t = \alpha_{t-1} / k + * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t) + * u_t = u_{t-1} - \alpha_t \gamma_t g_t + * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t + * \tau_t = \tau_{t-1} + \beta_t / \alpha_t + */ + momU -= (alpha * gamma * learningRate) * grad; + momV += (tau * alpha * gamma * learningRate) * grad; + value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV; +} + +void adadeltaApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& accum_update, + BaseMatrix& lr, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + accum = rou * accum + ((real)1 - rou) * grad.square(); + + // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon )) + lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt(); + + // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 + accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square(); + + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void adagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum_buffer, + BaseMatrix& accum, + BaseMatrix& lr, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + accum += grad.square(); + lr = (accum_buffer + accum + epsilon).sqrt().reciprocal(); + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void rmspropApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& g, + BaseMatrix& f, + 
BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + if (firstTime) { + g = accumulatedRou * g + grad.square(); + } else { + g = accumulatedRou * g + ((real)1 - rou) * grad.square(); + } + + // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g + f = accumulatedRou * f + ((real)1 - rou) * grad; + + // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon ) + // Basiclly if the sign of the gradient changes more often, + // the learning rate will be decreased. + lr = (g - f.square() + epsilon).sqrt().reciprocal(); + + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void decayedAdagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + if (firstTime) { + accum = accumulatedRou * accum + grad.square(); + } else { + accum = accumulatedRou * accum + ((real)1 - rou) * grad.square(); + } + + // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) + // Basiclly if the bigger the magnitude gradient is, + // the smaller the learning rate will be. + lr = (accum + epsilon).sqrt().reciprocal(); + + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void adamApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& v, // second moment + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate) { + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + mom = beta1 * mom + ((real)1 - beta1) * grad; + + // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 + v = beta2 * v + ((real)1 - beta2) * grad.square(); + + value -= (mom * alpha) / (v.sqrt() + epsilon); +} + +void adamaxApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& u, // weighted infinity norm + real beta1, + real beta2, + int64_t step, + real alpha) { + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + mom = beta1 * mom + ((real)1 - beta1) * grad; + + // u_t = max(\beta_2*u_{t-1}, abs(g_t)) + u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()); + + // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t + value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u); +} + +} // namespace paddle + +#endif diff --git a/paddle/legacy/math/TrainingAlgorithmOp.h b/paddle/legacy/math/TrainingAlgorithmOp.h new file mode 100644 index 0000000000000000000000000000000000000000..921c2742cfe2576785768da40ab11c94234be966 --- /dev/null +++ b/paddle/legacy/math/TrainingAlgorithmOp.h @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "BaseMatrix.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +/** + * \brief Sparse Momentum optimizer. + */ +extern void sparseMomentumApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& momU, + BaseMatrix& momV, + real alpha, + real beta, + real gamma, + real tau, + real learningRate); + +/** + * \brief AdaDelta optimizer. + */ +extern void adadeltaApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& sum, + BaseMatrix& sum1, + BaseMatrix& mom, + BaseMatrix& lr, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate); + +/** + * \brief AdaGrad optimizer. + */ +extern void adagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& sum, + BaseMatrix& sum1, + BaseMatrix& mom, + BaseMatrix& lr, + real epsilon, + real learningRate, + real momentum, + real decayRate); + +/** + * \brief RMSProp optimizer. + */ +extern void rmspropApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& g, + BaseMatrix& f, + BaseMatrix& mom, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime); + +/** + * \brief Decayed AdaGrad optimizer. + */ +extern void decayedAdagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime); + +/** + * \brief Adam optimizer. + */ +extern void adamApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& v, + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate); + +/** + * \brief AdaMax optimizer. + */ +extern void adamaxApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& u, // weighted infinity norm + real beta1, + real beta2, + int64_t step, + real alpha); +} // namespace paddle diff --git a/paddle/legacy/math/Vector.cpp b/paddle/legacy/math/Vector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..87f48bb1622f28f8cb53e5afc924f5cadb14c528 --- /dev/null +++ b/paddle/legacy/math/Vector.cpp @@ -0,0 +1,1091 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Vector.h" +#include "paddle/legacy/utils/Util.h" + +#include +#include "Matrix.h" +#include "hl_gpu.h" +#include "hl_matrix.h" +#include "hl_table_apply.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Thread.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +namespace paddle { + +template +std::shared_ptr> VectorT::create(size_t size, bool useGpu) { + if (useGpu) { + return std::make_shared>(size); + } else { + return std::make_shared>(size); + } +} + +template +std::shared_ptr> VectorT::createParallelVector( + size_t size, bool useGpu, SyncThreadPool* pool) { + if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector && + size >= (size_t)FLAGS_enable_parallel_vector) { + return std::make_shared>( + size, pool ? pool : getGlobalSyncThreadPool()); + } else { + return create(size, useGpu); + } +} + +template +std::shared_ptr> VectorT::create(T* data, + size_t size, + bool useGpu) { + if (useGpu) { + return std::make_shared>(size, data); + } else { + return std::make_shared>(size, data); + } +} + +template +std::shared_ptr> VectorT::create(size_t size, + MemoryHandlePtr memoryHandle, + size_t offset) { + if (auto cpuMemHandle = + std::dynamic_pointer_cast(memoryHandle)) { + return std::make_shared>(size, cpuMemHandle, offset); + } else if (auto gpuMemHandle = + std::dynamic_pointer_cast(memoryHandle)) { + return std::make_shared>(size, gpuMemHandle, offset); + } else { + LOG(FATAL) << "Wrong"; + return NULL; + } +} + +template <> +MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { + LOG(FATAL) << "Wrong for real vector"; + return nullptr; +} + +template <> +MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { + size_t height = getSize(); + size_t width = idRange; + MatrixPtr mat = Matrix::createSparseMatrix( + height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu); + + CpuIVector cpuIds(height); + cpuIds.copyFrom(*this); + int* idData = cpuIds.getData(); + + for (decltype(height) i = 0; i < height; i++) { + const unsigned int id = idData[i]; + CHECK_LT(id, width); + mat->setRow(i, 1, &id, nullptr); + } + return mat; +} + +template <> +std::shared_ptr> VectorT::castToInt() { + std::shared_ptr> ret = IVector::create(this->getSize(), useGpu_); + if (useGpu_) { + hl_vector_cast2int(ret->getData(), this->getData(), this->getSize()); + } else { + for (size_t i = 0; i < getSize(); ++i) { + ret->getData()[i] = int(this->getData()[i]); + } + } + return ret; +} + +template +GpuVectorT::GpuVectorT(size_t size) + : VectorT(size, + std::make_shared(sizeof(T) * size), + 0, /* offset = 0 */ + true /* useGpu = true */) {} + +template +T GpuVectorT::getElement(size_t i) const { + T elem = 0; + hl_memcpy_device2host(&elem, const_cast(&this->getData()[i]), sizeof(T)); + return elem; +} +template +void GpuVectorT::setElement(size_t i, const T& value) { + hl_memcpy_host2device(&this->getData()[i], const_cast(&value), sizeof(T)); +} + +template +T* GpuVectorT::getPoint(const uint64_t beginPos) { + LOG(FATAL) << "Not implemented" << beginPos; + return NULL; +} + +template <> +int GpuVectorT::getAbsSum() { + LOG(FATAL) << "Not implemented"; + return 0; +} + +template <> +int GpuVectorT::getSum() { + LOG(FATAL) << "Not implemented"; + return 0; +} + +template <> +real GpuVectorT::getAbsSum() { + real* A = this->getData(); + real sum = 0; + hl_vector_abs_sum(A, &sum, this->getSize()); + return sum; +} + +template <> +real GpuVectorT::getSum() { + real* A = this->getData(); + 
real sum = 0; + hl_vector_sum(A, &sum, this->getSize()); + return sum; +} + +template <> +int GpuVectorT::getMax() { + CpuIVector cpuIVec = CpuIVector(this->getSize()); + copyTo(&cpuIVec); + return cpuIVec.getMax(); +} + +template <> +int GpuVectorT::getAbsMax() { + CpuIVector cpuIVec = CpuIVector(this->getSize()); + copyTo(&cpuIVec); + return cpuIVec.getAbsMax(); +} + +template +void GpuVectorT::isEqualTo(const VectorT& b, const T& value) { + BaseMatrixT::isEqualTo((BaseMatrixT&)b, value); +} + +template +void GpuVectorT::selectFrom(const VectorT& src, const VectorT& ids) { +#ifdef PADDLE_WITH_CUDA + hl_vector_select_from(this->getData(), + this->getSize(), + src.getData(), + src.getSize(), + ids.getData(), + ids.getSize()); +#endif +} + +template +real gpuRowFunc(Func f, GpuVector& v) { + static ThreadLocal>> local; + if (!*local) { + (*local).reset(new CpuVector(1)); + } + real* A = v.getData(); + f(A, (*local)->getData(), 1, v.getSize()); + return (*local)->getData()[0]; +} + +template <> +real GpuVectorT::getMax() { + return gpuRowFunc(hl_matrix_row_max, *this); +} + +template <> +real GpuVectorT::getAbsMax() { + return std::max(gpuRowFunc(hl_matrix_row_max, *this), + -gpuRowFunc(hl_matrix_row_min, *this)); +} + +template <> +int GpuVectorT::getMin() { + LOG(FATAL) << "Not implemented"; + return 0; +} + +template <> +real GpuVectorT::getMin() { + return gpuRowFunc(hl_matrix_row_min, *this); +} + +template +T GpuVectorT::get(size_t pos) { + T val = (T)0; + hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T)); + return val; +} + +template +void GpuVectorT::histogram(std::ostream& os, int type) { + LOG(FATAL) << "Not implemented"; +} + +template +void GpuVectorT::zeroMem() { + BaseMatrixT::zero(); +} + +template +void GpuVectorT::reset(const T& value) { + BaseMatrixT::assign(value); +} + +template +void GpuVectorT::fillSequence() { + LOG(FATAL) << "not implemented"; +} + +template +void GpuVectorT::copyFrom(const VectorT& src) { + src.copyTo(this); +} + +template +void GpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { + CHECK_EQ(src.getSize(), this->getSize()); + hl_memcpy_async((void*)this->getData(), + (void*)src.getData(), + sizeof(T) * this->getSize(), + stream); +} + +template +void GpuVectorT::copyFrom(const T* gpuSrc, size_t size) { + CHECK(gpuSrc != NULL); + CHECK_LE(size, this->size_); + + hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size); +} + +template +void GpuVectorT::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) { + CHECK(gpuSrc != NULL); + CHECK_LE(size, this->size_); + + hl_memcpy_async( + (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream); +} + +template +void GpuVectorT::copyTo(CpuVectorT* dest) const { + CHECK_EQ(this->getSize(), dest->getSize()); + + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), + sizeof(T) * this->getSize()); +} + +template +void GpuVectorT::copyTo(GpuVectorT* dest) const { + CHECK_EQ(this->getSize(), dest->getSize()); + + hl_memcpy_device2device((void*)dest->getData(), + (void*)this->getData(), + sizeof(T) * this->getSize()); +} + +template <> +void GpuVectorT::rand() { + LOG(FATAL) << "Not implemented"; +} + +template <> +void GpuVectorT::print(std::ostream& os, size_t num) const { + IVectorPtr dest = IVector::create(this->size_, false); + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), + sizeof(int) * this->getSize()); + dest->print(os, num); +} + +template <> +void GpuVectorT::print(std::ostream& os, size_t num) 
const { + VectorPtr dest = Vector::create(this->size_, false); + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), + sizeof(int) * this->getSize()); + dest->print(os, num); +} + +template <> +void GpuVectorT::printOneElement(std::ostream& os, size_t idx) const { + LOG(FATAL) << "Not implemented"; +} + +template <> +void GpuVectorT::printOneElement(std::ostream& os, size_t idx) const { + LOG(FATAL) << "Not implemented"; +} + +template <> +void CpuVectorT::rand() { + LOG(FATAL) << "Not implemented"; +} +template <> +void GpuVectorT::rand(size_t classNum) { + LOG(FATAL) << "Not implemented"; +} + +template <> +void CpuVectorT::rand(size_t classNum) { + LOG(FATAL) << "Not implemented"; +} + +template <> +void GpuVectorT::rand() { + VectorPtr cPtr = Vector::create(this->size_, false); + cPtr->rand(); + + hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real)); +} + +template <> +void GpuVectorT::rand(size_t classNum) { + IVectorPtr cPtr = IVector::create(this->size_, false); + cPtr->rand(classNum); + + hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int)); +} + +template <> +void CpuVectorT::rand(size_t classNum) { + size_t size = this->getSize(); + int* data = this->getData(); + for (size_t i = 0; i < size; i++) { + data[i] = + std::min(classNum - 1, + size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum)); + } +} + +template <> +void CpuVectorT::rand() { + size_t size = this->getSize(); + real* data = this->getData(); + for (size_t i = 0; i < size; i++) { + data[i] = ::rand() * (1. / (double)RAND_MAX); + // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) * + // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 ); + } +} + +template +void CpuVectorT::randnorm(real, real) { + LOG(FATAL) << "Not implemented"; +} + +template +void CpuVectorT::uniform(real, real) { + LOG(FATAL) << "Not implemented"; +} + +template +void GpuVectorT::randnorm(real, real) { + LOG(FATAL) << "Not implemented"; +} + +template +void GpuVectorT::uniform(real, real) { + LOG(FATAL) << "Not implemented"; +} + +template <> +void CpuVectorT::randnorm(real mean, real std) { + size_t size = this->getSize(); + real* data = this->getData(); + unsigned int* seed = ThreadLocalRand::getSeed(); + auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); }; + for (size_t i = 0; i < size - 1; i += 2) { + real r1 = rand1(); + r1 = std::sqrt(-2 * std::log(r1)); + real r2 = rand1(); + data[i] = mean + std * r1 * cos(2 * M_PI * r2); + data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2); + } + real r1 = rand1(); + r1 = std::sqrt(-2 * std::log(r1)); + real r2 = rand1(); + data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2); +} + +template <> +void CpuVectorT::uniform(real left, real right) { + size_t size = this->getSize(); + real* data = this->getData(); + real range = right - left; + unsigned int* seed = ThreadLocalRand::getSeed(); + auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. 
+ RAND_MAX)); }; + for (size_t i = 0; i < size; ++i) { + data[i] = rand1() * range + left; + } +} + +template <> +void GpuVectorT::randnorm(real mean, real std) { + CpuVector cpuVec = CpuVector(this->getSize()); + cpuVec.randnorm(mean, std); + + hl_memcpy_host2device( + data_, cpuVec.getData(), this->getSize() * sizeof(real)); +} + +template <> +void GpuVectorT::uniform(real left, real right) { + CpuVector cpuVec = CpuVector(this->getSize()); + cpuVec.uniform(left, right); + + hl_memcpy_host2device( + data_, cpuVec.getData(), this->getSize() * sizeof(real)); +} + +template +CpuVectorT::CpuVectorT(size_t size) + : VectorT(size, + std::make_shared(sizeof(T) * size), + 0, /* offset = 0 */ + false /* useGpu = false */) {} + +template +CpuVectorT::CpuVectorT(const VectorT& src) + : VectorT(src.getSize(), + src.getMemoryHandle(), + 0, /* offset = 0 */ + false /* useGpu = false */) { + if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) { + this->memoryHandle_ = + std::make_shared(sizeof(T) * this->getSize()); + this->data_ = reinterpret_cast(this->memoryHandle_->getBuf()); + } + src.copyTo(this); +} + +template +T CpuVectorT::getAbsSum() { + const T* A = this->getData(); + size_t size = this->getSize(); + T sum = 0; + for (size_t i = 0; i < size; i++) { + sum += (A[i] > 0) ? A[i] : -A[i]; + } + return sum; +} + +// cannot use above version, due to precision issue of float +template <> +real CpuVectorT::getAbsSum() { + const real* A = this->getData(); + size_t size = this->getSize(); + double sum = 0; + for (size_t i = 0; i < size; i++) { + sum += (A[i] > 0) ? A[i] : -A[i]; + } + return sum; +} + +template +T CpuVectorT::getSum() { + const T* A = this->getData(); + size_t size = this->getSize(); + T sum = 0; + for (size_t i = 0; i < size; i++) { + sum += A[i]; + } + return sum; +} + +template <> +real CpuVectorT::getSum() { + const real* A = this->getData(); + size_t size = this->getSize(); + double sum = 0; + for (size_t i = 0; i < size; i++) { + sum += A[i]; + } + return sum; +} + +template +T CpuVectorT::get(size_t pos) { + return this->getData()[pos]; +} + +template +T CpuVectorT::getMax() { + const T* A = this->getData(); + size_t size = this->getSize(); + T res = A[0]; + for (size_t i = 1; i < size; i++) { + if (res < A[i]) res = A[i]; + } + return res; +} + +template +T CpuVectorT::getAbsMax() { + const T* A = this->getData(); + size_t size = this->getSize(); + T res = std::abs(A[0]); + for (size_t i = 1; i < size; i++) { + if (res < std::abs(A[i])) res = std::abs(A[i]); + } + return res; +} + +template +T CpuVectorT::getMin() { + const T* A = this->getData(); + size_t size = this->getSize(); + T res = A[0]; + for (size_t i = 1; i < size; i++) { + if (res > A[i]) res = A[i]; + } + return res; +} + +template +void CpuVectorT::isEqualTo(const VectorT& b, const T& value) { + size_t size = this->getSize(); + CHECK_EQ(b.getSize(), size); + + const T* B = b.getData(); + T* A = this->getData(); + for (size_t i = 0; i < size; i++) { + A[i] = (B[i] == value); + } +} + +template +void CpuVectorT::selectFrom(const VectorT& src, const VectorT& ids) { + size_t size = this->getSize(); + CHECK_EQ(ids.getSize(), size); + + const int* indices = ids.getData(); + const T* B = src.getData(); + T* A = this->getData(); + for (size_t i = 0; i < size; i++) { + int index = indices[i]; + CHECK_LT(index, (int)src.getSize()); + A[i] = B[index]; + } +} + +static int getSignAndExponentOfFloat(float a) { + uint32_t* pa = reinterpret_cast(&a); + return *pa >> 23; +} + +template +void 
CpuVectorT::histogram(std::ostream& os, int type) { + LOG(FATAL) << "Not implemented"; +} + +template <> +void CpuVectorT::histogram(std::ostream& os, int type) { + int counters[512]; + memset(counters, 0, sizeof(counters)); + int counterZero = 0; + + const real* A = this->getData(); + size_t size = this->getSize(); + for (size_t i = 0; i < size; i++) { + if (A[i] == 0.0f) { + ++counterZero; + } else { + ++counters[getSignAndExponentOfFloat(A[i])]; + } + } + + int64_t sum = 0; + float sizeNonZero = size - counterZero; + os << "zero:" << counterZero; + for (int i = 0; i < 256; i++) { + int counter = counters[i]; + if (counter) { + os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%"; + sum += counter * (i - 127); + } + } + for (int i = 0; i < 256; i++) { + int counter = counters[i + 256]; + if (counter) { + os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%"; + sum += counter * (i - 127); + } + } + os << ", nonzero_exponent_avg=" << sum / sizeNonZero; +} + +template +void CpuVectorT::zeroMem() { + memset(this->getData(), 0, sizeof(T) * this->getSize()); +} + +template +void CpuVectorT::reset(const T& value) { + T* A = this->getData(); + size_t size = this->getSize(); + for (size_t i = 0; i < size; i++) { + A[i] = value; + } +} + +template +void CpuVectorT::fillSequence() { + T* A = this->getData(); + size_t size = this->getSize(); + for (size_t i = 0; i < size; i++) { + A[i] = i; + } +} + +template +void CpuVectorT::copyFrom(const VectorT& src) { + src.copyTo(this); +} + +template +void CpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { + if (typeid(src) == typeid(GpuVectorT)) { + hl_memcpy_async((void*)this->getData(), + (void*)src.getData(), + sizeof(T) * this->getSize(), + stream); + // There is a need to add synchronization to ensure that the data is copied. + hl_stream_synchronize(stream); + } else { + src.copyTo(this); + } +} + +template +void CpuVectorT::copyFrom(const T* hostSrc, size_t size) { + CHECK(hostSrc != NULL); + CHECK_LE(size, this->size_); + memcpy(this->data_, hostSrc, sizeof(T) * size); +} + +template +void CpuVectorT::copyFrom(const T* hostSrc, + size_t size, + hl_stream_t stream) { + (void)stream; + + CHECK(hostSrc != NULL); + CHECK_LE(size, this->size_); + memcpy(this->data_, hostSrc, sizeof(T) * size); +} + +template +void CpuVectorT::copyTo(CpuVectorT* dest) const { + CHECK_EQ(this->getSize(), dest->getSize()); + memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize()); +} + +template +void CpuVectorT::copyTo(GpuVectorT* dest) const { + CHECK_EQ(this->getSize(), dest->getSize()); + hl_memcpy_host2device((void*)dest->getData(), + (void*)this->getData(), + sizeof(T) * this->getSize()); +} + +template <> +void CpuVectorT::print(std::ostream& os, size_t num) const { + size_t w = size_ < num ? size_ : num; + os << "["; + for (size_t i = 0; i < w; ++i) { + os << data_[i] << " "; + } + os << "]" << std::endl; +} + +template <> +void CpuVectorT::print(std::ostream& os, size_t num) const { + size_t w = size_ < num ? 
size_ : num; + os << "["; + for (size_t i = 0; i < w; ++i) { + os << (int)data_[i] << " "; + } + os << "]" << std::endl; +} + +template <> +void CpuVectorT::printOneElement(std::ostream& os, size_t idx) const { + CHECK_LT(idx, size_); + os << data_[idx] << ";"; +} + +template <> +void CpuVectorT::printOneElement(std::ostream& os, size_t idx) const { + CHECK_LT(idx, size_); + os << (int)data_[idx] << ";"; +} + +template +void ParallelCpuVectorT::parallelExec(ExecFunc func) { + LOG(FATAL) << "Not implemented"; +} + +template <> +void ParallelCpuVectorT::parallelExec(ExecFunc func) { + pool_->exec([this, func](int tid, size_t numThreads) { + auto interval = calcSplitArrayInterval( + this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); + // setup sub bufs + CpuVector subVec(0, nullptr); + subVec.subVecFrom(*this, interval); + func(subVec); + }); +} + +template +void ParallelCpuVectorT::exec(SyncThreadPool::JobFunc func) { + LOG(FATAL) << "Not implemented"; +} + +template <> +void ParallelCpuVectorT::exec(SyncThreadPool::JobFunc func) { + pool_->exec(func); +} + +template +CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) { + if (!useGpu) { + cpuVectorT_ = std::make_shared>(size); + } else { + gpuVectorT_ = std::make_shared>(size); + } + setSync(useGpu); +} + +template +CpuGpuVectorT::CpuGpuVectorT(const std::shared_ptr>& src) + : sync_(nullptr) { + bool useGpu = src->useGpu(); + if (useGpu) { + gpuVectorT_ = src; + } else { + cpuVectorT_ = src; + } + setSync(useGpu); +} + +template +CpuGpuVectorT::CpuGpuVectorT(size_t size, T* data, bool useGpu) + : sync_(nullptr) { + if (!useGpu) { + cpuVectorT_ = std::make_shared>(size, data); + setSync(DATA_AT_CPU); + } else { + gpuVectorT_ = std::make_shared>(size, data); + setSync(DATA_AT_GPU); + } +} + +template +std::shared_ptr> CpuGpuVectorT::create(size_t size, + bool useGpu) { + return std::make_shared>(size, useGpu); +} + +template +void CpuGpuVectorT::resize(size_t size, bool useGpu) { + if (useGpu) { + CHECK(gpuVectorT_) << "gpuVectorT_ is null"; + // If memoryHandle_ is nullptr, + // the data may be owned by the caller when it was constructed. + // It should not resize for this case. + if (gpuVectorT_->getMemoryHandle()) { + gpuVectorT_->resize(size); + } else { + CHECK_EQ(gpuVectorT_->getSize(), size); + } + } else { + CHECK(cpuVectorT_) << "cpuVectorT_ is null"; + // If memoryHandle_ is nullptr, + // the data may be owned by the caller when it was constructed. + // It should not resize for this case. 
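+ // (Illustrative note, added for clarity; `n` and `rawPtr` are placeholder
+ // names.) A vector constructed from a caller-owned pointer has no memory
+ // handle and cannot be reallocated; resize() then only checks the size:
+ //   CpuGpuVectorT<int> ids(n, rawPtr, /*useGpu=*/false);
+ //   ids.resize(n, false);   // OK: same size; a different size would fail the check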
+ if (cpuVectorT_->getMemoryHandle()) { + cpuVectorT_->resize(size); + } else { + CHECK_EQ(cpuVectorT_->getSize(), size); + } + } + setSync(useGpu); +} + +template +void CpuGpuVectorT::resizeOrCreate(std::shared_ptr>& vec, + size_t size, + bool useGpu) { + if (vec) { + vec->resize(size, useGpu); + } else { + vec = create(size, useGpu); + } +} + +template +void CpuGpuVectorT::resizeOrCreate(size_t size, bool useGpu) { + if (useGpu && (!gpuVectorT_)) { + gpuVectorT_ = VectorT::create(size, true); + } else if ((!useGpu) && (!cpuVectorT_)) { + cpuVectorT_ = VectorT::create(size, false); + } else { + CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_)); + this->resize(size, useGpu); + } +} + +template +CpuGpuVectorT::CpuGpuVectorT(CpuGpuVectorT& src, + size_t offset, + size_t size) + : sync_(nullptr) { + CHECK_LE(offset + size, static_cast(src.getSize())); +#ifdef PADDLE_WITH_CUDA + SyncedFlag* flag = src.getSync(); + if (*flag == DATA_AT_CPU) { + src.copyToGpu(); // will set synchronous data between CPU and GPU + } else if (*flag == DATA_AT_GPU) { + src.copyToCpu(); // will set synchronous data between CPU and GPU + } +#endif + auto cMemHandle = (src.getVector(false))->getMemoryHandle(); + cpuVectorT_ = std::make_shared>( + size, std::dynamic_pointer_cast(cMemHandle), offset); +#ifdef PADDLE_WITH_CUDA + auto gMemHandle = (src.getVector(true))->getMemoryHandle(); + gpuVectorT_ = std::make_shared>( + size, std::dynamic_pointer_cast(gMemHandle), offset); + src.setSync(SYNCED); +#endif + setSync(src.getSync()); +} + +template +std::shared_ptr> CpuGpuVectorT::getVector( + bool useGpu) const { + auto* self = const_cast*>(this); + if (useGpu) { + self->copyToGpu(); + return std::const_pointer_cast>(gpuVectorT_); + } else { + self->copyToCpu(); + return std::const_pointer_cast>(cpuVectorT_); + } +} + +template +std::shared_ptr>& CpuGpuVectorT::getMutableVector(bool useGpu) { + setSync(useGpu); + if (useGpu) { + copyToGpu(); + return gpuVectorT_; + } else { + copyToCpu(); + return cpuVectorT_; + } +} + +template +const T* CpuGpuVectorT::getData(bool useGpu) const { + auto self = const_cast*>(this); + if (useGpu) { + self->copyToGpu(); + return gpuVectorT_->getData(); + } else { + self->copyToCpu(); + return cpuVectorT_->getData(); + } +} + +// Operation will change data and need to reset sync_ & syncFlag_. +#define MUTABLE_VECTOR_OP(OP, useGpu, args...) 
\ + do { \ + if (useGpu) { \ + copyToGpu(); \ + setSync(useGpu); \ + return gpuVectorT_->OP(args); \ + } else { \ + copyToCpu(); \ + setSync(useGpu); \ + return cpuVectorT_->OP(args); \ + } \ + } while (0) + +template +T* CpuGpuVectorT::getMutableData(bool useGpu) { + MUTABLE_VECTOR_OP(getData, useGpu); +} + +template +void CpuGpuVectorT::zeroMem(bool useGpu) { + MUTABLE_VECTOR_OP(zeroMem, useGpu); +} + +template +void CpuGpuVectorT::fillSequence(bool useGpu) { + MUTABLE_VECTOR_OP(fillSequence, useGpu); +} + +template +void CpuGpuVectorT::setElement(size_t i, const T& value, bool useGpu) { + MUTABLE_VECTOR_OP(setElement, useGpu, i, value); +} + +template +T CpuGpuVectorT::getElement(size_t i) const { + switch (*this->getSync()) { + case SYNCED: + case DATA_AT_CPU: + return cpuVectorT_->getElement(i); + break; + case DATA_AT_GPU: + return gpuVectorT_->getElement(i); + break; + default: + LOG(FATAL) << "Not support"; + break; + } +} + +template +void CpuGpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { + auto cVec = dynamic_cast*>(&src); + auto gVec = dynamic_cast*>(&src); + if (cVec) { + copyToCpu(cVec->getData(), cVec->getSize(), stream); + } else if (gVec) { + copyToGpu(gVec->getData(), gVec->getSize(), stream); + } else { + LOG(FATAL) << "Invalid type of src"; + } +} + +template +void CpuGpuVectorT::copyFrom(const T* data, size_t size, bool useGpu) { + if (useGpu) { + copyToGpu(data, size); + } else { + copyToCpu(data, size); + } +} + +template +void CpuGpuVectorT::copyFrom(const T* data, + size_t size, + hl_stream_t stream, + bool useGpu) { + if (useGpu) { + copyToGpu(data, size, stream); + } else { + copyToCpu(data, size, stream); + } +} + +template +void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, + size_t offset, + size_t size, + bool useGpu, + hl_stream_t stream) { + if (useGpu) { + VectorT::resizeOrCreate(gpuVectorT_, size, true); + gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream); + } else { + VectorT::resizeOrCreate(cpuVectorT_, size, false); + cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream); + } + setSync(useGpu); +} + +template +void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, hl_stream_t stream) { + switch (*src.getSync()) { + case DATA_AT_CPU: + copyFrom(*(src.getVector(false)), stream); + break; + case DATA_AT_GPU: + copyFrom(*(src.getVector(true)), stream); + break; + case SYNCED: + copyFrom(*(src.getVector(false)), stream); + copyFrom(*(src.getVector(true)), stream); + setSync(SYNCED); + break; + default: + LOG(FATAL) << "Not support"; + break; + } +} + +template +void CpuGpuVectorT::copyToCpu() { + switch (*this->getSync()) { + case DATA_AT_GPU: + CHECK(gpuVectorT_); + this->resizeOrCreate(gpuVectorT_->getSize(), false); + cpuVectorT_->copyFrom(*gpuVectorT_); + setSync(SYNCED); + break; + case DATA_AT_CPU: + case SYNCED: + CHECK(cpuVectorT_); + break; + default: + LOG(FATAL) << "Not support"; + break; + } +} + +template +void CpuGpuVectorT::copyToGpu() { + switch (*this->getSync()) { + case DATA_AT_CPU: + CHECK(cpuVectorT_); + this->resizeOrCreate(cpuVectorT_->getSize(), true); + gpuVectorT_->copyFrom(*cpuVectorT_); + setSync(SYNCED); + break; + case DATA_AT_GPU: + case SYNCED: + CHECK(gpuVectorT_); + break; + default: + LOG(FATAL) << "Not support"; + break; + } +} + +template class VectorT; +template class VectorT; +template class CpuVectorT; +template class CpuVectorT; +template class GpuVectorT; +template class GpuVectorT; +template class CpuGpuVectorT; +template class CpuGpuVectorT; + +} // namespace paddle diff --git 
a/paddle/legacy/math/Vector.h b/paddle/legacy/math/Vector.h new file mode 100644 index 0000000000000000000000000000000000000000..63cb4651c52219807e11e778db9c42667759a055 --- /dev/null +++ b/paddle/legacy/math/Vector.h @@ -0,0 +1,726 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include + +#include "BaseMatrix.h" +#include "MemoryHandle.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Thread.h" + +namespace paddle { + +template +class GpuVectorT; +template +class CpuVectorT; + +template +class BaseVector; + +class SyncThreadPool; + +class Matrix; + +template +class BaseVector : public BaseMatrixT { + public: + BaseVector(size_t size, T* data, bool useGpu) + : BaseMatrixT(1, size, data, false, useGpu), size_(this->width_) {} + + ~BaseVector() {} + + protected: + size_t& size_; +}; + +/** + * Copy or assignemnt constructor will share the data as opposed to making a + * copy of the original data. To make a copy of the orinal data, use copyFrom() + * instead. + */ +template +class VectorT : public BaseVector { + protected: + VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu) + : BaseVector(size, + reinterpret_cast(memoryHandle->getBuf()) + offset, + useGpu) { + memoryHandle_ = memoryHandle; + } + + // data is still owned by the caller. + // data should be valid during the life of this vector. + // Caller is responsible for release the memory. + VectorT(size_t size, T* data, bool useGpu) + : BaseVector(size, data, useGpu) {} + + public: + virtual ~VectorT() {} + + static std::shared_ptr> create(size_t size, bool useGpu); + + static std::shared_ptr> create(T* data, size_t size, bool useGpu); + + static std::shared_ptr> create(size_t size, + MemoryHandlePtr memoryHandle, + size_t offset = 0); + + // owner can set SyncThreadPool, + // if not set, will use globalSyncThreadPool, + // which can be used in main thread only. + static std::shared_ptr> createParallelVector( + size_t size, bool useGpu, SyncThreadPool* pool = nullptr); + + size_t getSize() const { return this->size_; } + const T* getData() const { return this->data_; } + T* getData() { return this->data_; } + + virtual void zeroMem() = 0; + // set all elements to value + virtual void reset(const T& value) = 0; + // fill data by 0, 1, 2, ... + virtual void fillSequence() = 0; + + MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } + + /** + * resizing to a big vector will not preserve old values. 
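+ *
+ * Illustrative sketch (added for clarity; behaviour as stated above):
+ *
+ *   CpuVector v(4);
+ *   v.fillSequence();   // 0 1 2 3
+ *   v.resize(8);        // may reallocate; old values are not preserved
+ *   v.zeroMem();        // re-initialize before use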
+ */ + void resize(size_t newSize) { + if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) { + memoryHandle_ = newMemory(newSize * sizeof(T)); + this->data_ = reinterpret_cast(memoryHandle_->getBuf()); + } + this->size_ = newSize; + } + + static void resizeOrCreate(std::shared_ptr>& vec, + size_t size, + bool useGpu) { + if (vec) { + vec->resize(size); + } else { + vec = create(size, useGpu); + } + } + + virtual MemoryHandlePtr newMemory(size_t size) = 0; + + /** + * form sub vector from *src*, shallow copy + */ + void subVecFrom(const VectorT& src, size_t start, size_t size) { + CHECK_EQ(BaseVector::useGpu_, src.useGpu_); + CHECK_LT(start, src.size_); + CHECK_LE(start + size, src.size_); + + BaseVector::size_ = size; + BaseVector::data_ = const_cast(src.data_) + start; + } + + std::shared_ptr> subVec(size_t start, size_t size) { + CHECK_LE(start + size, static_cast(getSize())); + return VectorT::create(getData() + start, size, BaseVector::useGpu_); + } + + /** + * form sub vector from *src*, shallow copy + */ + void subVecFrom(const T* src, size_t start, size_t size) { + BaseVector::size_ = size; + BaseVector::data_ = const_cast(src) + start; + } + + /** + * form sub vector from *src*, shallow copy + * in *interval* [interval.first, interval.second) + */ + void subVecFrom(const VectorT& src, std::pair interval) { + subVecFrom(src, interval.first, interval.second - interval.first); + } + + /** + * convert the vector to a sparse one_hot matrix of width idRange + * only applies to IVector + */ + std::shared_ptr toOneHotSparseMatrix(size_t idRange, bool useGpu); + + /** + * @brief cast vector of "real" elements to "int" elements. + * + * @note: float -> int must be casted, or you'll get wrong data. + */ + std::shared_ptr> castToInt(); + + /** + * This function will crash if the size of src and dest is different. + */ + virtual void copyFrom(const VectorT& src) = 0; + + /** + * If GpuVector, this function is an asynchronous interface, + * will push the copy-task to the specifed-stream and return immediately. + * + * If CpuVector, this function is an synchronous interface, + * same as the copyFrom(const VectorT& src). 
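+ *
+ * Illustrative usage (added for clarity; the stream constant shown is an
+ * assumption for the example):
+ *
+ *   gpuVec.copyFrom(cpuVec, HPPL_STREAM_DEFAULT);
+ *   hl_stream_synchronize(HPPL_STREAM_DEFAULT);   // wait before reading the result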
+ */ + virtual void copyFrom(const VectorT& src, hl_stream_t stream) = 0; + + /** + * copy size elements from src + * + * If this is GpuVector, src can be cpu or gpu memory + * + * If this is CpuVector, src is assumed to be cpu memory + */ + virtual void copyFrom(const T* src, size_t size) = 0; + + /** + * copy size elements from src + * + * If this is GpuVector, src can be cpu or gpu memory + * + * If this is CpuVector, src is assumed to be cpu memory, + */ + virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0; + + /** + * exec a func in single/multi thread + */ + virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); } + + /// Get the buffer point with beginPos + virtual T* getPoint(const uint64_t beginPos) = 0; + + /// Get the value for the i'th element + virtual T getElement(size_t i) const = 0; + virtual void setElement(size_t i, const T& value) = 0; + + //---------- math operations ---------------- + + // sum of the absolute value of each elements + virtual T getAbsSum() = 0; + + virtual T getSum() = 0; + virtual T getMax() = 0; + virtual T getAbsMax() = 0; + virtual T getMin() = 0; + + /// element-wise calc: this = (b == value) + virtual void isEqualTo(const VectorT& b, const T& value) = 0; + + /// select elements indexed by *ids* from vector *src* + virtual void selectFrom(const VectorT& src, const VectorT& ids) = 0; + + enum HistogramType { + HISTOGRAM_EXPONENT = 0, + }; + + /** + * @brief print histogram of vector values + * + * @note only exponent histogram supported currently + */ + virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0; + + /// generate uniform random value for each element + virtual void rand() = 0; + /** + * generate uniform random value for each element, + * data range is from 0 to (classes - 1). + */ + virtual void rand(size_t classes) = 0; + + /** + * Debug use only. Very inefficient for GPU vector. + * get the value at pos. + */ + virtual T get(size_t pos) = 0; + + /** + * generate univariate Gaussian distributed random numbers + * with given mean and standardDeviation. + */ + virtual void randnorm(real mean, real standardDeviation) = 0; + + /** + * generate uniform distributed random numbers + * with given range. + */ + virtual void uniform(real left, real right) = 0; + + /// print the first "num" elements of the Vector + virtual void print(std::ostream& os, size_t num) const = 0; + + /// print the "idx" element of the Vector + virtual void printOneElement(std::ostream& os, size_t idx) const = 0; + + template + void operator=(const ExpressionType& expr) { + if (BaseVector::useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + + protected: + friend class GpuVectorT; + friend class CpuVectorT; + virtual void copyTo(CpuVectorT* dest) const = 0; + virtual void copyTo(GpuVectorT* dest) const = 0; + MemoryHandlePtr memoryHandle_; +}; + +template +std::ostream& operator<<(std::ostream& os, const VectorT& vec) { + vec.print(os, vec.getSize()); + return os; +} + +template +class GpuVectorT : public VectorT { + public: + explicit GpuVectorT(size_t size); + GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset) + : VectorT(size, memHandle, offset, true) {} + + // data is still owned by the caller. + // data should be valid during the life of this vector. + // Caller is responsible for release the memory. 
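+ // (Illustrative, added for clarity; `devPtr` and `bufSize` are placeholder
+ // names.) Wrapping an externally owned device buffer creates a non-owning
+ // view; the pointer must remain valid for the vector's lifetime:
+ //   GpuVector view(bufSize, devPtr);   // no copy, no ownership taken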
+ GpuVectorT(size_t size, T* data) : VectorT(size, data, true) {} + + virtual MemoryHandlePtr newMemory(size_t size) { + return std::make_shared(size); + } + virtual void zeroMem(); + virtual void reset(const T& value); + virtual void fillSequence(); + + virtual void copyFrom(const T* src, size_t size); + virtual void copyFrom(const T* src, size_t size, hl_stream_t stream); + virtual void copyFrom(const VectorT& src); + virtual void copyFrom(const VectorT& src, hl_stream_t stream); + virtual T getElement(size_t i) const; + virtual void setElement(size_t i, const T& value); + virtual T* getPoint(const uint64_t beginPos); + + virtual T getAbsSum(); + virtual T getSum(); + virtual T getMax(); + virtual T getAbsMax(); + virtual T getMin(); + virtual void isEqualTo(const VectorT& b, const T& value); + virtual void selectFrom(const VectorT& src, const VectorT& ids); + virtual void histogram(std::ostream& os, int type); + virtual void rand(); + virtual void rand(size_t classes); + virtual void randnorm(real mean, real standardDeviation); + virtual void uniform(real left, real right); + virtual T get(size_t pos); + virtual void print(std::ostream& os, size_t num) const; + virtual void printOneElement(std::ostream& os, size_t idx) const; + + template + void operator=(const ExpressionType& expr) { + TensorGpuApply(*this, expr); + } + + protected: + virtual void copyTo(CpuVectorT* dest) const; + virtual void copyTo(GpuVectorT* dest) const; +}; + +template +class CpuVectorT : public VectorT { + public: + explicit CpuVectorT(size_t size); + CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset) + : VectorT(size, memoryHandle, offset, false) {} + + // data is still owned by the caller. + // data should be valid during the life of this vector. + // Caller is responsible for release the memory. 
+ CpuVectorT(size_t size, T* data) : VectorT(size, data, false) {} + + /** + * If src is a CpuVector, the new CpuVector will share the data with src + * + * If src is a GpuVector, the new CpuVector will copy data from src + */ + explicit CpuVectorT(const VectorT& src); + + virtual MemoryHandlePtr newMemory(size_t size) { + return std::make_shared(size); + } + + virtual void zeroMem(); + virtual void reset(const T& value); + virtual void fillSequence(); + virtual void copyFrom(const T* src, size_t size); + virtual void copyFrom(const T* src, size_t size, hl_stream_t stream); + virtual void copyFrom(const VectorT& src); + virtual void copyFrom(const VectorT& src, hl_stream_t stream); + virtual void copyTo(CpuVectorT* dest) const; + virtual void copyTo(GpuVectorT* dest) const; + + /// Get the buffer point with beginPos + virtual T* getPoint(const uint64_t beginPos) { + return this->getData() + beginPos; + } + + virtual T getElement(size_t i) const { return this->getData()[i]; } + virtual void setElement(size_t i, const T& value) { + this->getData()[i] = value; + } + + virtual T getAbsSum(); + virtual T getSum(); + virtual T getMax(); + virtual T getAbsMax(); + virtual T getMin(); + virtual void isEqualTo(const VectorT& b, const T& value); + virtual void selectFrom(const VectorT& src, const VectorT& ids); + virtual void histogram(std::ostream& os, int type); + virtual void rand(); + virtual void rand(size_t classes); + virtual void randnorm(real mean, real standardDeviation); + virtual void uniform(real left, real right); + virtual T get(size_t pos); + virtual void print(std::ostream& os, size_t num) const; + virtual void printOneElement(std::ostream& os, size_t idx) const; + + template + void operator=(const ExpressionType& expr) { + TensorCpuApply(*this, expr); + } +}; + +template +class ParallelCpuVectorT : public CpuVectorT { + public: + ParallelCpuVectorT(size_t size, SyncThreadPool* pool) + : CpuVectorT(size), pool_(pool) {} + + virtual void zeroMem() { + parallelExec([](CpuVectorT& vec) { vec.CpuVectorT::zeroMem(); }); + } + virtual void randnorm(real mean, real standardDeviation) { + parallelExec([=](CpuVectorT& vec) { + vec.CpuVectorT::randnorm(mean, standardDeviation); + }); + } + virtual void uniform(real left, real right) { + parallelExec( + [=](CpuVectorT& vec) { vec.CpuVectorT::uniform(left, right); }); + } + + virtual void exec(SyncThreadPool::JobFunc jobFunc); + + private: + typedef std::function& vec)> ExecFunc; + void parallelExec(ExecFunc func); + SyncThreadPool* pool_; +}; + +/** + * A class to do conversion between CpuVector and GpuVector automatically. + */ +template +class CpuGpuVectorT { + public: + /** + * @brief An enum type of SyncedFlag using to + * mark data memory is in CPU or GPU. + * + * DATA_AT_CPU: data is located in CPU. + * + * DATA_AT_GPU: data is located in GPU. + * + * SYNCED: data is located in CPU and GPU simultaneously. + */ + enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 }; + + /** + * @brief A constructor, create cpuVectorT_ or gpuVectorT_. + * + * @param[in] size data size. + * @param[in] useGpu use gpu or not. + */ + explicit CpuGpuVectorT(size_t size, bool useGpu); + + /** + * @brief A constructor, create CpuGpuVectorT by VectorT. + * + * If src is CpuVector, cpuVectorT_ is shared data with src. + * + * If src is GpuVector, gpuVectorT_ is shared data with src. + */ + explicit CpuGpuVectorT(const std::shared_ptr>& src); + + /** + * @brief A constructor. 
+ * + * If useGpu is true, data should be located in device and + * create gpuVectorT_ with data. + * + * If useGpu is false, data should be located in host and + * create cpuVectorT_ with data. + * + * @note Data is owned by the caller and should be valid during + * the life of this vector. + * Caller is responsible for release the memory. + */ + CpuGpuVectorT(size_t size, T* data, bool useGpu); + + CpuGpuVectorT(CpuGpuVectorT& src, size_t offset, size_t size); + + virtual ~CpuGpuVectorT() {} + + static std::shared_ptr> create(size_t size, bool useGpu); + + /** + * @brief resize vector. + * + * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU, + * + * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU. + */ + void resize(size_t size, bool useGpu); + + /** + * @brief resize or create CpuGpuVectorT. + */ + static void resizeOrCreate(std::shared_ptr>& vec, + size_t size, + bool useGpu); + + /** + * @brief return a const cpuVectorT_ or gpuVectorT_. + * + * If useGpu is true, return gpuVectorT_. + * + * If useGpu is false, return cpuVectorT_. + * + * @note Caller should not change the data. + * If caller changes const attribute, + * should set syncFlag_. + */ + std::shared_ptr> getVector(bool useGpu) const; + + /** + * @brief return a const cpuVectorT_ or gpuVectorT_. + * + * @note: This interface will change syncFlag_, so if you will + * not change the data, you should call getVector. + */ + std::shared_ptr>& getMutableVector(bool useGpu); + + /** + * @brief return const T* data. + * + * If useGpu is true, return device data. + * + * If useGpu is false, return host data. + */ + const T* getData(bool useGpu) const; + + // TODO(yuyang18): Make getData more c++ style. + // inline T* getData(bool useGpu) { + // return getMutableData(useGpu); + // } + + T* getMutableData(bool useGpu); + + /** + * If useGpu is true, gpuVectorT_->Op(). + * + * If useGpu is false, cpuVectorT_->Op(). + * + * Op is zeroMem, fillSequence, ... + */ + void zeroMem(bool useGpu); + void fillSequence(bool useGpu); + void setElement(size_t i, const T& value, bool useGpu); + + /** + * @brief return i-th element. + */ + T getElement(size_t i) const; + + /** + * @brief return vector size. + */ + size_t getSize() const { + size_t size = 0; + switch (*sync_) { + case SYNCED: + case DATA_AT_CPU: + size = cpuVectorT_->getSize(); + break; + case DATA_AT_GPU: + size = gpuVectorT_->getSize(); + break; + default: + LOG(FATAL) << "Not support"; + break; + } + return size; + } + + /// copy data to cpuVectorT_. + inline void copyToCpu(const T* data, size_t size) { + this->resizeOrCreate(size, false); + cpuVectorT_->copyFrom(data, size); + setSync(DATA_AT_CPU); + } + /// copy data to cpuVectorT_ using specifed-stream. + inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) { + this->resizeOrCreate(size, false); + cpuVectorT_->copyFrom(data, size, stream); + setSync(DATA_AT_CPU); + } + + /// copy data to gpuVectorT_. + inline void copyToGpu(const T* data, size_t size) { + this->resizeOrCreate(size, true); + gpuVectorT_->copyFrom(data, size); + setSync(DATA_AT_GPU); + } + /// copy data to gpuVectorT_ using specifed-stream. + inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) { + this->resizeOrCreate(size, true); + gpuVectorT_->copyFrom(data, size, stream); + setSync(DATA_AT_GPU); + } + + /** + * @brief copy from src using specifed-stream. + * + * If src is CpuVectorT, copy to cpuVectorT_. + * + * If src is GpuVectorT, copy to gpuVectorT_. 
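+   *
+   * Illustrative sketch (assuming `v` is a CpuGpuVectorT<real> and `cpuSrc`
+   * is a CpuVector of the same size):
+   *   v.copyFrom(cpuSrc, HPPL_STREAM_DEFAULT);  // fills the CPU side
+   * The GPU side can then be brought up to date through copyToGpu()
+   * (see the protected helpers below).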
+ */ + void copyFrom(const VectorT& src, hl_stream_t stream); + + /** + * @brief copy data. + * + * If useGpu is false, copy host data to cpuVectorT_. + * + * If useGpu is true, copy device data to gpuVectorT_. + * + * @note data address should consistent with useGpu. + */ + void copyFrom(const T* data, size_t size, bool useGpu); + void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu); + + /** + * @brief copy from (src + offset) using specifed-stream. + */ + void copyFrom(CpuGpuVectorT& src, + size_t offset, + size_t size, + bool useGpu, + hl_stream_t stream); + + /** + * @brief copy from src using specifed-stream. + */ + void copyFrom(CpuGpuVectorT& src, hl_stream_t stream); + + /** + * @brief return sync_. + */ + inline SyncedFlag* getSync() const { return sync_; } + + /** + * @brief set sync_. + */ + inline void setSync(SyncedFlag* sync) { sync_ = sync; } + + inline void setSync(SyncedFlag syncFlag) { + if (sync_) { + *sync_ = syncFlag; + } else { + syncFlag_ = syncFlag; + sync_ = &syncFlag_; + } + } + + inline void setSync(bool useGpu) { + SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU; + setSync(flag); + } + + protected: + void resizeOrCreate(size_t size, bool useGpu); + + /** + * @brief copy between cpuVectorT_ and gpuVectorT_. + * + * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing. + * + * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_ + * and set syncFlag_ to SYNCED. + */ + void copyToCpu(); + + /** + * @brief copy between cpuVectorT_ and gpuVectorT_. + * + * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing. + * + * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_ + * and set syncFlag_ to SYNCED. + */ + void copyToGpu(); + + /// host pointer. + std::shared_ptr> cpuVectorT_; + /// device pointer. + std::shared_ptr> gpuVectorT_; + /// specify current data address. + SyncedFlag syncFlag_; + SyncedFlag* sync_; +}; + +typedef VectorT Vector; +typedef CpuVectorT CpuVector; +typedef GpuVectorT GpuVector; + +typedef VectorT IVector; +typedef CpuVectorT CpuIVector; +typedef GpuVectorT GpuIVector; + +typedef std::shared_ptr VectorPtr; +typedef std::shared_ptr CpuVectorPtr; +typedef std::shared_ptr GpuVectorPtr; + +typedef std::shared_ptr IVectorPtr; +typedef std::shared_ptr CpuIVectorPtr; +typedef std::shared_ptr GpuIVectorPtr; + +typedef CpuGpuVectorT CpuGpuVector; +typedef CpuGpuVectorT ICpuGpuVector; +typedef std::shared_ptr CpuGpuVectorPtr; +typedef std::shared_ptr ICpuGpuVectorPtr; + +} // namespace paddle diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/legacy/math/tests/CMakeLists.txt similarity index 100% rename from paddle/math/tests/CMakeLists.txt rename to paddle/legacy/math/tests/CMakeLists.txt diff --git a/paddle/legacy/math/tests/OriginalOptimizerApi.h b/paddle/legacy/math/tests/OriginalOptimizerApi.h new file mode 100644 index 0000000000000000000000000000000000000000..f386e19958a21214151776e6d0ae7bb2a4530b6c --- /dev/null +++ b/paddle/legacy/math/tests/OriginalOptimizerApi.h @@ -0,0 +1,201 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/GlobalConstants.h" + +using namespace paddle; // NOLINT + +void SparseMomentumParameterOptimizer(const VectorPtr vecs[], + real alpha, + real beta, + real gamma, + real tau, + real learningRate) { + vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], + -alpha * gamma * learningRate); + vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], + tau * alpha * gamma * learningRate); + vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], + tau / beta + 1.0 / alpha, + *vecs[PARAMETER_MOMENTUM_VT], + 1.0 / beta); +} + +void AdagradParameterOptimizer(const VectorPtr vecs[], + real epsilon, + real learningRate, + real momentum, + real decayRate) { + vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT], + 1.0f); + vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM], + *vecs[PARAMETER_GRADIENT_SQURESUM1]); + vecs[PARAMETER_LEARNING_RATE]->add(epsilon); + vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); + + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate, + momentum, + decayRate); +} + +void AdaDeltaParameterOptimizer(const VectorPtr vecs[], + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou); + + // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) + vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], + *vecs[PARAMETER_GRADIENT_SQURESUM], + epsilon, + epsilon); + vecs[PARAMETER_LEARNING_RATE]->sqrt2(); + + // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 + vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( + *vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_LEARNING_RATE], + rou, + 1.0f - rou); + + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate, + momentum, + decayRate); +} + +void RMSPropParameterOptimizer(const VectorPtr vecs[], + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou); + + // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g + vecs[PARAMETER_GRADIENT_SQURESUM1]->add( + *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou); + + // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon ) + // Basiclly if the sign of the gradient changes more often, + // the learning rate will be decreased. 
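+  // (The four vector ops below evaluate this element-wise: assign E(g_t^2),
+  //  subtract (E(g_t))^2 via addSquare(.., -1), add epsilon, then invSqrt;
+  //  the result is the per-element learning rate that sgdUpdate consumes.)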
+ vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]); + vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1], + -1.0f); + vecs[PARAMETER_LEARNING_RATE]->add(epsilon); + vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); + + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate, + momentum, + decayRate); +} + +void DecayedAdagradParameterOptimizer(const VectorPtr vecs[], + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou); + + // learn_rate = 1/sqrt( E(g_t^2) + epsilon ) + // Basically, the bigger the magnitude of the gradient is, + // the smaller the learning rate will be. + vecs[PARAMETER_LEARNING_RATE]->assign(epsilon); + vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); + vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); + + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate, + momentum, + decayRate); +} + +void AdamParameterOptimizer(const VectorPtr vecs[], + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate) { + Vector* m = vecs[PARAMETER_MOMENTUM].get(); + Vector* g = vecs[PARAMETER_GRADIENT].get(); + Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get(); + Vector* theta = vecs[PARAMETER_VALUE].get(); + + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + m->add(*g, beta1, 1 - beta1); + + // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_t^2 + g->square2(); + v->add(*g, beta2, 1 - beta2); + + // tmp = m_t / ( \sqrt{v_t} + \epsilon ) + // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp + g->sqrt2(*v); + g->dotDiv(*m, *g, 0., epsilon); + real alpha = + learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + theta->add(*theta, 1.0, *g, -alpha); +} + +void AdamaxParameterOptimizer( + const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) { + Vector* m = vecs[PARAMETER_MOMENTUM].get(); + Vector* g = vecs[PARAMETER_GRADIENT].get(); + Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get(); + Vector* theta = vecs[PARAMETER_VALUE].get(); + + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + m->add(*g, beta1, 1 - beta1); + + // u_t = max(\beta_2*u_{t-1}, abs(g_t)) + u->mulScalar(beta2); + g->abs2(); + u->max2(*u, *g); + + // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t + g->dotDiv(*m, *u); + real learningRate = alpha / (1 - std::pow(beta1, step)); + theta->add(*theta, 1.0, *g, -learningRate); +} diff --git a/paddle/legacy/math/tests/PerfUtils.h b/paddle/legacy/math/tests/PerfUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..eaf4869e4c994e5ec739fe650d0228687d24853f --- /dev/null +++ b/paddle/legacy/math/tests/PerfUtils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// Performance Check +#ifdef PADDLE_DISABLE_TIMER + +#define EXPRESSION_PERFORMANCE(expression) expression; + +#else + +#include "paddle/legacy/utils/Stat.h" +using namespace paddle; // NOLINT + +#define EXPRESSION_PERFORMANCE(expression) \ + do { \ + char expr[30]; \ + strncpy(expr, #expression, 30); \ + if (expr[29] != '\0') { \ + expr[27] = '.'; \ + expr[28] = '.'; \ + expr[29] = '\0'; \ + } \ + expression; \ + for (int i = 0; i < 20; i++) { \ + REGISTER_TIMER(expr); \ + expression; \ + } \ + LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \ + << *globalStat.getStat(expr); \ + globalStat.reset(); \ + } while (0) + +#endif diff --git a/paddle/legacy/math/tests/TensorCheck.h b/paddle/legacy/math/tests/TensorCheck.h new file mode 100644 index 0000000000000000000000000000000000000000..41c8ece282e05f55d063e6ad0d8805629c847d34 --- /dev/null +++ b/paddle/legacy/math/tests/TensorCheck.h @@ -0,0 +1,216 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +/** + * This file provides a TensorCheck template function, which can be used to + * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on. 
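+ *
+ * Minimal usage sketch (illustrative; `cpuA` and `gpuA` are assumed to hold
+ * the same logical data):
+ *   autotest::TensorCheckEqual(cpuA, gpuA);  // exact element-wise comparison
+ *   autotest::TensorCheckErr(cpuA, gpuA);    // comparison with a small tolerance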
+ */ + +#include +#include "paddle/legacy/math/Matrix.h" + +namespace autotest { + +using paddle::Matrix; +using paddle::CpuMatrix; +using paddle::GpuMatrix; +using paddle::VectorT; +using paddle::CpuVectorT; +using paddle::GpuVectorT; + +class AssertEqual { + public: + AssertEqual(real err = 0) : err_(err) {} + + inline bool operator()(real a, real b) { + if (err_ == 0) { + if (a != b) { + return false; + } + } else { + if (std::fabs(a - b) > err_) { + if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) { + return false; + } + } + } + + return true; + } + + private: + real err_; +}; + +template +class CopyToCpu; + +template <> +class CopyToCpu { + public: + explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {} + const CpuMatrix& copiedArg() const { return arg_; } + + private: + const CpuMatrix& arg_; +}; + +template <> +class CopyToCpu { + public: + explicit CopyToCpu(const GpuMatrix& arg) + : arg_(arg.getHeight(), arg.getWidth()) { + arg_.copyFrom(arg); + } + CpuMatrix& copiedArg() { return arg_; } + + private: + CpuMatrix arg_; +}; + +template <> +class CopyToCpu { + public: + explicit CopyToCpu(const Matrix& arg) + : arg_(arg.getHeight(), arg.getWidth()) { + arg_.copyFrom(arg); + } + CpuMatrix& copiedArg() { return arg_; } + + private: + CpuMatrix arg_; +}; + +template +class CopyToCpu> { + public: + explicit CopyToCpu(const CpuVectorT& arg) : arg_(arg) {} + const CpuVectorT& copiedArg() const { return arg_; } + + private: + const CpuVectorT& arg_; +}; + +template +class CopyToCpu> { + public: + explicit CopyToCpu(const GpuVectorT& arg) : arg_(arg.getSize()) { + arg_.copyFrom(arg); + } + CpuVectorT& copiedArg() { return arg_; } + + private: + CpuVectorT arg_; +}; + +template +class CopyToCpu> { + public: + explicit CopyToCpu(const VectorT& arg) : arg_(arg.getSize()) { + arg_.copyFrom(arg); + } + CpuVectorT& copiedArg() { return arg_; } + + private: + CpuVectorT arg_; +}; + +template +void TensorCheck(AssertEq compare, + const CpuMatrix& matrix1, + const CpuMatrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (!compare(a, b)) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +template +void TensorCheck(AssertEq compare, + const CpuVectorT& vector1, + const CpuVectorT& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); + + const T* data1 = vector1.getData(); + const T* data2 = vector2.getData(); + size_t size = vector1.getSize(); + int count = 0; + for (size_t i = 0; i < size; i++) { + real a = data1[i]; + real b = data2[i]; + if (!compare(a, b)) { + count++; + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different elements."; +} + +template +void TensorCheck(AssertEq compare, + const Tensor1& tensor1, + const Tensor2& tensor2) { + TensorCheck(compare, + CopyToCpu(tensor1).copiedArg(), + CopyToCpu(tensor2).copiedArg()); +} + +template +void TensorCheck(AssertEq compare, real args1, real args2) { + EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1 + << ", args2 = " << args2; +} + +template +void TensorCheck(AssertEq compare, size_t args1, size_t args2) { + EXPECT_EQ(args1, args2) << "[Test error] args1 = " 
<< args1 + << ", args2 = " << args2; +} + +template +void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) { + AssertEqual compare(0); + TensorCheck(compare, + CopyToCpu(tensor1).copiedArg(), + CopyToCpu(tensor2).copiedArg()); +} + +template +void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) { +#ifndef PADDLE_TYPE_DOUBLE + AssertEqual compare(1e-3); +#else + AssertEqual compare(1e-10); +#endif + TensorCheck(compare, + CopyToCpu(tensor1).copiedArg(), + CopyToCpu(tensor2).copiedArg()); +} + +} // namespace autotest diff --git a/paddle/legacy/math/tests/TestUtils.h b/paddle/legacy/math/tests/TestUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..60e76359da61ac32346b093d9a9ff69104bfc494 --- /dev/null +++ b/paddle/legacy/math/tests/TestUtils.h @@ -0,0 +1,294 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +/** + * This file provides an AutoCompare class to simplify the comparison + * of CPU and GPU member functions. + * + * This takes two steps: + * 1. Construct an AutoCompare object. + * When constructing an AutoCompare object, you can set the err argument + * to specify the maximum error for CPU and GPU functions. + * + * 2. Use the template functions cmpWithArg or cmpWithoutArg. + * A. [cmpWithArg] Requires the caller to construct the CPU arguments. + * + * AutoCompare test; + * Init Argument arg1,arg2... + * test.cmpWithArg(function, arg1, arg2....) + * + * B. [cmpWithoutArg] The caller does not need to construct arguments. + * This only works if all matrices used as arguments are of the same size, + * such as the element-wise functions and the aggregate functions + * defined in BaseMatrix.cpp. + * + * AutoCompare test; + * test.cmpWithoutArg(function, height, width) + */ + +#include +#include "TensorCheck.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" + +namespace autotest { + +using paddle::BaseMatrix; +using paddle::CpuMatrix; +using paddle::GpuMatrix; +using paddle::CpuIVector; +using paddle::GpuIVector; +using paddle::CpuSparseMatrix; +using paddle::GpuSparseMatrix; + +template +class ReplaceType { + public: + typedef T1 type; +}; + +template <> +class ReplaceType { + public: + typedef CpuMatrix type; +}; + +template <> +class ReplaceType { + public: + typedef GpuMatrix type; +}; + +template <> +class ReplaceType { + public: + typedef CpuMatrix type; +}; + +template <> +class ReplaceType { + public: + typedef GpuMatrix type; +}; + +// construct an argument +template +T construct(int height, int width); + +template <> +float construct(int height, int width) { + return 0.5; +} + +template <> +double construct(int height, int width) { + return 0.5; +} + +template <> +size_t construct(int height, int width) { + size_t offset = std::rand() % (height < width ?
height : width); + return offset; +} + +template <> +CpuMatrix construct(int height, int width) { + CpuMatrix a(height, width); + return a; +} + +template <> +GpuMatrix construct(int height, int width) { + GpuMatrix a(height, width); + return a; +} + +// init a argument +template +void init(T& v) { + return; +} + +template <> +void init(CpuMatrix& v) { + v.randomizeUniform(); +} + +template <> +void init(GpuMatrix& v) { + v.randomizeUniform(); +} + +// init a tuple which contains a set of arguments. +template +inline typename std::enable_if::type initTuple( + std::tuple& t) {} + +template + inline typename std::enable_if < + I::type initTuple(std::tuple& t) { + init(std::get(t)); + initTuple(t); +} + +// copy a argument, copy src to dest +template +void copy(T1& dest, T2& src) { + dest = src; +} + +template <> +void copy(GpuMatrix& dest, CpuMatrix& src) { + dest.copyFrom(src); +} + +// copy a tuple, copy src to dest +template +inline typename std::enable_if::type copyTuple( + std::tuple& dest, std::tuple& src) {} + +template + inline typename std::enable_if < + I::type copyTuple(std::tuple& dest, + std::tuple& src) { + copy(std::get(dest), std::get(src)); + copyTuple(dest, src); +} + +// call member function +template +R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) { + return (obj.*f)(args...); +} + +template +class ReturnType { + public: + typedef T type; +}; + +template <> +class ReturnType { + public: + typedef GpuMatrix type; +}; + +template <> +class ReturnType { + public: + typedef GpuIVector type; +}; + +template <> +class ReturnType { + public: + typedef GpuSparseMatrix type; +}; + +template +typename ReturnType::type autoArgs(T& v) { + return v; +} + +template <> +GpuMatrix autoArgs(CpuMatrix& v) { + GpuMatrix a(v.getHeight(), v.getWidth()); + a.copyFrom(v); + return a; +} + +template <> +GpuIVector autoArgs(CpuIVector& v) { + GpuIVector a(v.getSize()); + a.copyFrom(v); + return a; +} + +template <> +GpuSparseMatrix autoArgs(CpuSparseMatrix& v) { + GpuSparseMatrix a(v.getHeight(), + v.getWidth(), + v.getElementCnt(), + v.getValueType(), + v.getFormat()); + a.copyFrom(v, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return a; +} + +class AutoCompare { + public: + /** + * err is the allowed calculation error. + * The smaller the value of err, + * the stricter the comparison is between CPU and GPU calculations. + */ + AutoCompare(size_t height, size_t width, real err = 1e-3) + : cpu(height, width), gpu(height, width), compare(err) { + init(cpu); + copy(gpu, cpu); + } + + template + void cmpWithArg(R (C::*f)(FArgs...), Args&&... 
args) { + static_assert(sizeof...(FArgs) == sizeof...(Args), + "size of parameter packs are not equal"); + call(cpu, f, args...); + call(gpu, f, autoArgs(args)...); + + TensorCheck(compare, cpu, gpu); + } + + template + void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) { + static_assert(sizeof...(I) == sizeof...(Args), + "size of parameter packs are not equal"); + (void)height; + (void)width; + auto tuple1 = std::make_tuple( + construct>::type>::type, + CpuMatrix>::type>(height, width)...); + + auto tuple2 = std::make_tuple( + construct>::type>::type, + GpuMatrix>::type>(height, width)...); + + initTuple(tuple1); + copyTuple(tuple2, tuple1); + + call(cpu, f, std::get(tuple1)...); + call(gpu, f, std::get(tuple2)...); + + TensorCheck(compare, cpu, gpu); + } + + protected: + CpuMatrix cpu; + GpuMatrix gpu; + AssertEqual compare; +}; + +} // namespace autotest diff --git a/paddle/legacy/math/tests/test_Allocator.cpp b/paddle/legacy/math/tests/test_Allocator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..122be9082a8db33caf55661091caad115f575099 --- /dev/null +++ b/paddle/legacy/math/tests/test_Allocator.cpp @@ -0,0 +1,122 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Util.h" +#define private public +#include "paddle/legacy/math/Allocator.h" +#include "paddle/legacy/math/MemoryHandle.h" +#include "paddle/legacy/math/PoolAllocator.h" + +using namespace paddle; // NOLINT + +template +void testPoolAllocator() { + PoolAllocator* pool = + new PoolAllocator(new Allocator(), /* sizeLimit */ 1024); + + /* alloc from system memory */ + void* ptr1 = pool->alloc(10); + void* ptr2 = pool->alloc(200); + void* ptr3 = pool->alloc(200); + pool->free(ptr1, 10); + pool->free(ptr2, 200); + pool->free(ptr3, 200); + pool->printAll(); + EXPECT_EQ((size_t)2, pool->pool_.size()); + EXPECT_EQ((size_t)1, pool->pool_[10].size()); + EXPECT_EQ((size_t)2, pool->pool_[200].size()); + EXPECT_EQ(ptr1, pool->pool_[10][0]); + EXPECT_EQ(ptr2, pool->pool_[200][0]); + EXPECT_EQ(ptr3, pool->pool_[200][1]); + + /* alloc from pool */ + void* ptr4 = pool->alloc(10); + void* ptr5 = pool->alloc(200); + pool->printAll(); + EXPECT_EQ((size_t)0, pool->pool_[10].size()); + EXPECT_EQ((size_t)1, pool->pool_[200].size()); + EXPECT_EQ(ptr1, ptr4); + EXPECT_EQ(ptr3, ptr5); + pool->free(ptr4, 10); + pool->free(ptr5, 200); + + /* alloc size > sizeLimit */ + void* ptr6 = pool->alloc(1024); + pool->free(ptr6, 1024); + EXPECT_LE((size_t)1024, pool->poolMemorySize_); + + void* ptr7 = pool->alloc(1); + EXPECT_EQ((size_t)0, pool->poolMemorySize_); + EXPECT_EQ((size_t)0, pool->pool_.size()); + pool->free(ptr7, 1); + + delete pool; +} + +TEST(Allocator, Pool) { + testPoolAllocator(); +#ifdef PADDLE_WITH_CUDA + testPoolAllocator(); +#endif +} + +TEST(MemoryHandle, Cpu) { + for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) { + CpuMemoryHandle handle(size); + EXPECT_LE(handle.getSize(), handle.getAllocSize()); + } + + void* ptr1; + void* ptr2; + { + CpuMemoryHandle handle(256); + ptr1 = handle.getBuf(); + } + { + CpuMemoryHandle handle(256); + ptr2 = handle.getBuf(); + } + EXPECT_EQ(ptr1, ptr2); +} + +#ifdef PADDLE_WITH_CUDA +TEST(MemoryHandle, Gpu) { + int numGpu = hl_get_device_count(); + + /* alloc from system memory */ + void* ptr3[numGpu]; + void* ptr4[numGpu]; + for (int i = 0; i < numGpu; i++) { + SetDevice device(i); + GpuMemoryHandle handle1(30); + GpuMemoryHandle handle2(30); + GpuMemoryHandle handle3(4000); + GpuMemoryHandle handle4(500); + ptr3[i] = handle3.getBuf(); + ptr4[i] = handle4.getBuf(); + } + + /* alloc from pool */ + for (int i = 0; i < numGpu; i++) { + SetDevice device(i); + GpuMemoryHandle handle1(30); + GpuMemoryHandle handle3(4000); + GpuMemoryHandle handle4(500); + EXPECT_EQ(ptr3[i], handle3.getBuf()); + EXPECT_EQ(ptr4[i], handle4.getBuf()); + } +} +#endif diff --git a/paddle/legacy/math/tests/test_BaseMatrix.cpp b/paddle/legacy/math/tests/test_BaseMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..488765c6ac203ad064146faaab7b8c423d53cf0b --- /dev/null +++ b/paddle/legacy/math/tests/test_BaseMatrix.cpp @@ -0,0 +1,247 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +/** + * This test file use autotest::AutoCompare and cmpWithoutArg to compares the + * implementation of CPU and GPU member function in + * BaseMatrix.cpp and Matrix.cpp. + */ + +#include +#include "TestUtils.h" +#include "paddle/legacy/math/BaseMatrix.h" + +using paddle::BaseMatrix; +using paddle::Matrix; +using autotest::AutoCompare; + +// Test all void (BaseMatrix::*)() function +TEST(BaseMatrix, void) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, width](void (BaseMatrix::*f)()) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg(f, height, width); + }; + + compare(&BaseMatrix::neg); + compare(&BaseMatrix::exp2); + compare(&BaseMatrix::log2); + compare(&BaseMatrix::sqrt2); + compare(&BaseMatrix::square2); + compare(&BaseMatrix::reciprocal2); + compare(&BaseMatrix::abs2); + compare(&BaseMatrix::sign2); + compare(&BaseMatrix::zero); + compare(&BaseMatrix::one); + } + } +} + +// Test all void (BaseMatrix::*)(real) function +TEST(BaseMatrix, real) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, width](void (BaseMatrix::*f)(real)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0>(f, height, width); + }; + + compare(&BaseMatrix::pow2); + compare(&BaseMatrix::subScalar); + compare(&BaseMatrix::mulScalar); + compare(&BaseMatrix::divScalar); + compare(&BaseMatrix::assign); + compare(&BaseMatrix::add); + compare(&BaseMatrix::biggerThanScalar); + compare(&BaseMatrix::downClip); + } + } +} + +// Test all void (BaseMatrix::*)(BaseMatrix&) function +TEST(BaseMatrix, BaseMatrix) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0>(f, height, width); + }; + + compare(&BaseMatrix::assign); + compare(&BaseMatrix::add); + compare(&BaseMatrix::relu); + compare(&BaseMatrix::reluDerivative); + compare(&BaseMatrix::softrelu); + compare(&BaseMatrix::softreluDerivative); + compare(&BaseMatrix::brelu); + compare(&BaseMatrix::breluDerivative); + compare(&BaseMatrix::square2); + compare(&BaseMatrix::squareDerivative); + compare(&BaseMatrix::tanh); + compare(&BaseMatrix::tanhDerivative); + compare(&BaseMatrix::reciprocal2); + compare(&BaseMatrix::reciprocalDerivative); + compare(&BaseMatrix::abs2); + compare(&BaseMatrix::absDerivative); + compare(&BaseMatrix::sigmoid); + compare(&BaseMatrix::sigmoidDerivative); + compare(&BaseMatrix::expDerivative); + compare(&BaseMatrix::sign2); + compare(&BaseMatrix::exp2); + compare(&BaseMatrix::log2); + compare(&BaseMatrix::sqrt2); + compare(&BaseMatrix::dotMul); + compare(&BaseMatrix::dotMulSquare); + compare(&BaseMatrix::dotSquareMul); + compare(&BaseMatrix::addColVector); + compare(&BaseMatrix::addRowVector); + compare(&BaseMatrix::mulRowVector); + compare(&BaseMatrix::divRowVector); + compare(&BaseMatrix::mulColVector); + compare(&BaseMatrix::divColVector); + compare(&BaseMatrix::addP2P); + compare(&BaseMatrix::invSqrt); + } + } +} + +// Test all void (BaseMatrix::*)(real, real) function +TEST(BaseMatrix, real_real) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = 
[height, width](void (BaseMatrix::*f)(real, real)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0, 1>(f, height, width); + }; + + compare(&BaseMatrix::add); + compare(&BaseMatrix::clip); + } + } +} + +// Test all void (BaseMatrix::*)(BaseMatrix&, real) function +TEST(BaseMatrix, BaseMatrix_real) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0, 1>(f, height, width); + }; + + compare(&BaseMatrix::addBias); + compare(&BaseMatrix::add); + compare(&BaseMatrix::sub); + compare(&BaseMatrix::pow2); + compare(&BaseMatrix::addScalar); + compare(&BaseMatrix::subScalar); + compare(&BaseMatrix::mulScalar); + compare(&BaseMatrix::divScalar); + compare(&BaseMatrix::scalarDiv); + compare(&BaseMatrix::addSquare); + compare(&BaseMatrix::isEqualTo); + } + } +} + +// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function +TEST(BaseMatrix, BaseMatrix_BaseMatrix) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + auto compare = [height, + width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) { + AutoCompare test(height, width, 1e-5); + test.cmpWithoutArg<0, 1>(f, height, width); + }; + + compare(&BaseMatrix::softCrossEntropy); + compare(&BaseMatrix::softCrossEntropyBp); + compare(&BaseMatrix::binaryLabelCrossEntropy); + compare(&BaseMatrix::binaryLabelCrossEntropyBp); + compare(&BaseMatrix::sub); + compare(&BaseMatrix::add2); + compare(&BaseMatrix::dotMul); + compare(&BaseMatrix::dotDiv); + compare(&BaseMatrix::logisticRegressionLoss); + compare(&BaseMatrix::logisticRegressionLossBp); + compare(&BaseMatrix::biggerThan); + compare(&BaseMatrix::max2); + compare(&BaseMatrix::dotMulSquare); + compare(&BaseMatrix::dotSquareSquare); + } + } +} + +void TestEelementWise(size_t height, size_t width) { + AutoCompare rowScale(height, width); + rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width); + + AutoCompare rowDotMul(height, width); + rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width); + + AutoCompare binaryClassificationError(height, width); + binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>( + &BaseMatrix::binaryClassificationError, height, width); + + AutoCompare sumOfSquaresBp(height, width); + sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width); +} + +void TestAggregateToRow(size_t height, size_t width) { + AutoCompare maxCols(1, width); + maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width); + + AutoCompare minCols(1, width); + minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width); + + AutoCompare addDotMulVMM(1, width); + addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width); + + AutoCompare sumCols(1, width); + sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width); + + AutoCompare collectBias(1, width); + collectBias.cmpWithoutArg<0, 1>( + static_cast(&Matrix::collectBias), + height, + width); +} + +void TestAggregateToCol(size_t height, size_t width) { + AutoCompare maxRows(height, 1); + maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width); + + AutoCompare minRows(height, 1); + minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width); + + AutoCompare sumRows(height, 1); + sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width); + + AutoCompare sumOfSquares(height, 1); + 
sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width); +} + +TEST(BaseMatrix, Other) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + TestEelementWise(height, width); + TestAggregateToRow(height, width); + TestAggregateToCol(height, width); + } + } +} + +#endif diff --git a/paddle/legacy/math/tests/test_CpuGpuVector.cpp b/paddle/legacy/math/tests/test_CpuGpuVector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..010fef534d1e19d2d7d134298eb97aa1b56e2270 --- /dev/null +++ b/paddle/legacy/math/tests/test_CpuGpuVector.cpp @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA + +#include +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Util.h" +#include "test_matrixUtil.h" + +using namespace paddle; // NOLINT + +TEST(CpuGpuVector, getData) { + size_t size = 500; + hl_stream_t stream(HPPL_STREAM_DEFAULT); + CpuVectorPtr cpuVec = std::make_shared(size); + GpuVectorPtr gpuVec = std::make_shared(size); + cpuVec->uniform(0.0, 10.0); + gpuVec->copyFrom(*cpuVec, stream); + hl_stream_synchronize(stream); + + CpuGpuVectorPtr vec = std::make_shared(gpuVec); + auto a = vec->getData(false); + auto b = cpuVec->getData(); + hl_stream_synchronize(stream); + checkDataEqual(a, b, size); +} + +TEST(CpuGpuVector, subCreate) { + size_t size1 = 1024; + size_t offset = 100; + size_t size2 = 500; + hl_stream_t stream(HPPL_STREAM_DEFAULT); + CpuGpuVectorPtr v1 = std::make_shared(size1, /*useGpu*/ false); + auto vec = v1->getMutableVector(false); + vec->uniform(0.0, 10.0); + auto v2 = std::make_shared(*v1, offset, size2); + CHECK_EQ(*v1->getSync(), *v2->getSync()); + + // check subVec equal + checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); + + CpuVectorPtr v1Check = std::make_shared(size1); + CpuVectorPtr v2Check = std::make_shared(size2); + v1Check->copyFrom(*(v1->getVector(true)), stream); + v2Check->copyFrom(*(v2->getVector(true)), stream); + hl_stream_synchronize(stream); + + checkDataEqual(v2->getData(false), v2Check->getData(), size2); + checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); + + CpuVectorPtr noise = std::make_shared(size2); + noise->uniform(0.0, 1.0); + auto v = v2->getMutableVector(false); // will change header + // add noise to subVec + v->add(*noise); + + // check v1_cpu_data == v2_cpu_data + checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); + + v1Check->copyFrom(*(v1->getVector(true)), stream); + v2Check->copyFrom(*(v2->getVector(true)), stream); + hl_stream_synchronize(stream); + + // check v1_gpu_data == v2_gpu_data + checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); +} + +#endif diff --git a/paddle/legacy/math/tests/test_ExecViaCpu.cpp b/paddle/legacy/math/tests/test_ExecViaCpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2ce0bc7ede133028fff8a855ff336ff83f55d82 --- 
/dev/null +++ b/paddle/legacy/math/tests/test_ExecViaCpu.cpp @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" + +using namespace paddle; // NOLINT + +const int height = 10; +const int width = 16; + +real f(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) { + CHECK(!mat1.useGpu()); + CHECK(!mat2.useGpu()); + CHECK(!vec1.useGpu()); + CHECK(!vec2.useGpu()); + mat1.copyFrom(mat2); + vec1.copyFrom(vec2); + + return scalar; +} + +class Functor { + public: + real operator()(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) { + a_ = f(mat1, mat2, vec1, vec2, scalar); + return a_; + } + + private: + real a_; +}; + +template +void testWrapper(F&& f) { + MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false); + MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false); + + IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false); + IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false); + + const real scalar = 1.23456; + + MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true); + MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true); + IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true); + IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true); + + cpumat2->randomizeUniform(); + cpuvec2->rand(width); + gpumat2->copyFrom(*cpumat2); + gpuvec2->copyFrom(*cpuvec2); + + real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456); + EXPECT_EQ(ret, scalar); + cpumat1->copyFrom(*gpumat1); + cpuvec1->copyFrom(*gpuvec1); + + for (int i = 0; i < height; ++i) { + EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i)); + for (int j = 0; j < width; ++j) { + EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j)); + } + } + gpumat1->resize(height, 1); + execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1); + + cpumat1->resize(height, 1); + cpumat1->selectElements(*cpumat2, *cpuvec1); + for (int i = 0; i < height; ++i) { + EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0)); + } +} + +#ifdef PADDLE_WITH_CUDA +TEST(ExecViaCpu, test1) { + testWrapper(f); + testWrapper(&f); + + auto lambda = [](Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) -> real { + return f(mat1, mat2, vec1, vec2, scalar); + }; + LOG(INFO) << "lambda is_class=" << std::is_class::value + << " is_function=" << std::is_function::value; + testWrapper(lambda); + + Functor functor; + testWrapper(functor); +} +#endif diff --git a/paddle/legacy/math/tests/test_FPException.cpp b/paddle/legacy/math/tests/test_FPException.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aa6aea71c8d959834ff11c04969e13bb36b630ff --- /dev/null +++ b/paddle/legacy/math/tests/test_FPException.cpp @@ -0,0 +1,93 @@ +/* Copyright 
(c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/** + * This test is about floating point calculation exceptions. + * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions. + * + * Some exceptions occur in the middle of a set of formulas + * and can be circumvented by some tricks. + * For example, + * calculating tanh + * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 + * + * If the result of (-2 * a) is too large, + * an FE_OVERFLOW exception occurs when calculating exp. + * But the tanh result itself does not overflow, + * so we can add some tricks to prevent exp from computing an excessive value. + * + */ + +#include +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/utils/Common.h" + +using namespace paddle; // NOLINT + +void SetTensorValue(Matrix& matrix, real value) { + int height = matrix.getHeight(); + int width = matrix.getWidth(); + int stride = matrix.getStride(); + real* data = matrix.getData(); + for (int i = 0; i < height; i++) { + int j = rand() % width; // NOLINT + if (typeid(matrix) == typeid(CpuMatrix)) { + data[i * stride + j] = value; + } else if (typeid(matrix) == typeid(GpuMatrix)) { + hl_memcpy(&data[i * stride + j], &value, sizeof(real)); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + +template +void testTanh(real illegal) { + MatrixPtr A = std::make_shared(10, 10); + MatrixPtr B = std::make_shared(10, 10); + A->randomizeUniform(); + B->randomizeUniform(); + + SetTensorValue(*A, illegal); + + A->tanh(*B); +} + +template +void testSigmoid(real illegal) { + MatrixPtr A = std::make_shared(10, 10); + MatrixPtr B = std::make_shared(10, 10); + A->randomizeUniform(); + B->randomizeUniform(); + + SetTensorValue(*A, illegal); + + A->sigmoid(*B); +} + +TEST(fp, overflow) { + for (auto illegal : {-90.0, 90.0}) { + LOG(INFO) << " illegal=" << illegal; + testTanh(illegal); + testSigmoid(illegal); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/math/tests/test_GpuProfiler.cpp b/paddle/legacy/math/tests/test_GpuProfiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee27109f218ca56df8f42ca6395b22621f5fbc11 --- /dev/null +++ b/paddle/legacy/math/tests/test_GpuProfiler.cpp @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef PADDLE_WITH_CUDA + +#include +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (fabs(a - b) > err) { + if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { + count++; + } + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void testBilinearFwdBwd(int numSamples, + int imgSizeH, + int imgSizeW, + int channels) { + int inWidth = imgSizeH * imgSizeW * channels; + int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; + real ratioH = 0.5; + real ratioW = 0.5; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + { + // nvprof: GPU Proflier + REGISTER_GPU_PROFILER("testBilinearFwdBwd"); + target->bilinearForward(*input, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + targetGpu->bilinearForward(*inputGpu, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + } + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheckGrad = + CpuMatrix::create(numSamples, inWidth, false, false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->bilinearBackward(*targetGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + inputGpuGrad->bilinearBackward(*targetGpuGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Profiler, testBilinearFwdBwd) { + auto numSamples = 10; + auto channels = 16; + auto imgSize = 64; + { + // nvprof: GPU Proflier + REGISTER_GPU_PROFILER("testBilinearFwdBwd"); + // Paddle built-in timer + REGISTER_TIMER_INFO( + "testBilinearFwdBwd", + "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); + testBilinearFwdBwd(numSamples, imgSize, imgSize, channels); + } + 
globalStat.printAllStatus(); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + + // nvprof: GPU Proflier + REGISTER_GPU_PROFILER( + "RecursiveProfilingTest", + "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); + + return RUN_ALL_TESTS(); +} + +#endif diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/legacy/math/tests/test_Matrix.cpp similarity index 100% rename from paddle/math/tests/test_Matrix.cpp rename to paddle/legacy/math/tests/test_Matrix.cpp diff --git a/paddle/legacy/math/tests/test_RowBuffer.cpp b/paddle/legacy/math/tests/test_RowBuffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2ef8cd303d65f50cd18adb7f80fa18a665b67340 --- /dev/null +++ b/paddle/legacy/math/tests/test_RowBuffer.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/legacy/math/RowBuffer.h" + +TEST(RowBuffer, testAutoGrow) { + paddle::RowBuffer buf(128); + ASSERT_EQ(128UL, buf.getWidth()); + ASSERT_TRUE(buf.isAutoGrowth()); + buf.resize(2); + ASSERT_EQ(2UL, buf.getRowCount()); + for (size_t i = 0; i < buf.getWidth() * 2; ++i) { + buf.data()[i] = i; + } + for (size_t i = 0; i < buf.getRowCount(); ++i) { + for (size_t j = 0; j < buf.getWidth(); ++j) { + ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); + } + } + + auto data = buf.getWithAutoGrowth(2); + for (size_t i = 0; i < buf.getWidth(); ++i) { + data[i] = i; + } + + ASSERT_EQ(3UL, buf.getRowCount()); + for (size_t i = 0; i < buf.getRowCount() - 1; ++i) { + for (size_t j = 0; j < buf.getWidth(); ++j) { + ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); + } + } + for (size_t i = 0; i < buf.getWidth(); ++i) { + ASSERT_NEAR(i, buf.get(2)[i], 1e-5); + } +} + +TEST(RowBuffer, testWithMemBuf) { + paddle::CpuMemHandlePtr mem = + std::make_shared(128 * 2 * sizeof(real)); + paddle::RowBuffer buf(mem, 128); + ASSERT_TRUE(!buf.isAutoGrowth()); + ASSERT_EQ(2UL, buf.getRowCount()); + for (size_t i = 0; i < buf.getWidth() * 2; ++i) { + buf.data()[i] = i; + } + for (size_t i = 0; i < buf.getRowCount(); ++i) { + for (size_t j = 0; j < buf.getWidth(); ++j) { + ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5); + } + } + + ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*"); +} diff --git a/paddle/legacy/math/tests/test_SIMDFunctions.cpp b/paddle/legacy/math/tests/test_SIMDFunctions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c6490f70e336dadcf6710c83ced2afddc13b7812 --- /dev/null +++ b/paddle/legacy/math/tests/test_SIMDFunctions.cpp @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/math/SIMDFunctions.h" +#include "paddle/legacy/utils/Util.h" + +#include + +#include +#include +#include +#include + +#include +#include + +static constexpr size_t VECTOR_LEN = 3072; +static constexpr size_t BATCH_SIZE = 64; +static constexpr size_t ALIGN = 32; +static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0"); +static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0"); +static constexpr float EPSILON = 1e-5; +static std::mt19937 RandomEngine(time(0)); + +inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, + size_t align = ALIGN) { + float* ptr; + CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); + return std::unique_ptr(ptr); +} + +inline static std::unique_ptr NewRandomVector(size_t len = VECTOR_LEN, + size_t align = ALIGN) { + std::uniform_real_distribution dist(-100.0f, 100.0f); + auto generator = std::bind(dist, RandomEngine); + auto retv = NewVector(len, align); + std::generate_n(retv.get(), len, generator); + return retv; +} + +TEST(SIMDFunction, addTo) { + typedef std::function AddToMethodType; + + AddToMethodType naive = paddle::simd::naive::addTo; + AddToMethodType simd = paddle::simd::addTo; + + auto A = NewRandomVector(); + auto B = NewRandomVector(); + + auto ACopy = NewVector(); + memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float)); + + naive(A.get(), B.get(), VECTOR_LEN); + simd(ACopy.get(), B.get(), VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(A[i], ACopy[i], EPSILON); + } +} + +TEST(SIMDFunction, batchAddTo) { + auto A = NewRandomVector(); + auto ACopy = NewVector(); + memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN); + + std::vector> B; + for (size_t i = 0; i < BATCH_SIZE; ++i) { + B.emplace_back(NewRandomVector()); + } + std::unique_ptr BRaw(new float*[BATCH_SIZE]); + for (size_t i = 0; i < BATCH_SIZE; ++i) { + BRaw[i] = B[i].get(); + } + + typedef std::function + BatchAddToMethodType; + + BatchAddToMethodType naive = paddle::simd::naive::batchAddTo; + BatchAddToMethodType simd = paddle::simd::batchAddTo; + + naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); + simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(A[i], ACopy[i], EPSILON); + } +} + +TEST(SIMDFunction, colMax) { + auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE); + auto naiveResult = NewVector(BATCH_SIZE); + auto simdResult = NewVector(BATCH_SIZE); + + typedef std::function ColMaxMethodType; + ColMaxMethodType naive = paddle::simd::naive::colMax; + ColMaxMethodType simd = paddle::simd::colMax; + + naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); + simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); + + for (size_t i = 0; i < BATCH_SIZE; ++i) { + ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON); + } +} + +TEST(SIMDFunction, decayL1_WithLR) { + auto dest = NewRandomVector(); + auto src = NewRandomVector(); + auto lr = NewRandomVector(); + auto lambda = 0.23f; + + auto simd_dest = NewVector(); + memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); + + typedef 
std::function + DecayL1MethodType; + + DecayL1MethodType naive = []( + float* d, float* s, float* lr, float l, size_t len) { + paddle::simd::naive::decayL1(d, s, lr, l, len); + }; + + DecayL1MethodType simd = []( + float* d, float* s, float* lr, float l, size_t len) { + paddle::simd::decayL1(d, s, lr, l, len); + }; + + naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN); + simd(simd_dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); + } +} + +TEST(SIMDFunction, decayL1_WithoutLR) { + auto dest = NewRandomVector(); + auto src = NewRandomVector(); + auto lambda = 0.23; + + auto simd_dest = NewVector(); + memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); + + typedef std::function DecayL1MethodType; + + DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) { + paddle::simd::naive::decayL1(d, s, l, len); + }; + + DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) { + paddle::simd::decayL1(d, s, l, len); + }; + + naive(dest.get(), src.get(), lambda, VECTOR_LEN); + simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN); + + for (size_t i = 0; i < VECTOR_LEN; ++i) { + ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); + } +} diff --git a/paddle/legacy/math/tests/test_SparseMatrix.cpp b/paddle/legacy/math/tests/test_SparseMatrix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..30896a945ec6d111c35eea94d8008a62593d2893 --- /dev/null +++ b/paddle/legacy/math/tests/test_SparseMatrix.cpp @@ -0,0 +1,565 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "test_matrixUtil.h" + +using namespace paddle; // NOLINT + +TEST(Matrix, CopyCpuMatrixToSparseMatrix) { + const size_t HEIGHT = 20; + const size_t WIDTH = 10; + const size_t WIDTH_TEST = 15; + MatrixPtr testMatrix( + new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 5, FLOAT_VALUE, SPARSE_CSR)); + MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH)); + testCpuMatrix->randomizeUniform(); + testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT); + MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST)); + mulCpuMatrix->randomizeUniform(); + MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)), + ret2(new CpuMatrix(HEIGHT, WIDTH_TEST)); + ret1->zeroMem(); + ret2->zeroMem(); + ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0); + ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0); + checkMatrixEqual(ret1, ret2); +} + +struct MatrixPara { + size_t height; + size_t width; + bool trans; + bool sparse; + size_t nnz; + SparseFormat format; +}; + +#ifdef PADDLE_WITH_CUDA +void test_sparse_matrix_mul(MatrixPara paraA, + MatrixPara paraB, + MatrixPara paraC) { + // for cpu sparse matrix mul + MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h; + // for gpu sparse matrix mul + MatrixPtr gpuMatrixA, gpuMatrixB, gpuMatrixC; + // for cpu dense matrix mul + MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC; + + if (paraA.sparse) { + cpuMatrixA = Matrix::createSparseMatrix(paraA.height, + paraA.width, + paraA.nnz, + FLOAT_VALUE, + paraA.format, + paraA.trans, + false); + gpuMatrixA = Matrix::createSparseMatrix(paraA.height, + paraA.width, + paraA.nnz, + FLOAT_VALUE, + paraA.format, + paraA.trans, + true); + } else { + cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); + gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true); + } + cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); + + if (paraB.sparse) { + cpuMatrixB = Matrix::createSparseMatrix(paraB.height, + paraB.width, + paraB.nnz, + FLOAT_VALUE, + paraB.format, + paraB.trans, + false); + gpuMatrixB = Matrix::createSparseMatrix(paraB.height, + paraB.width, + paraB.nnz, + FLOAT_VALUE, + paraB.format, + paraB.trans, + true); + } else { + cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); + gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true); + } + cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); + + if (paraC.sparse) { + cpuMatrixC = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + false); + gpuMatrixC = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + true); + gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + false); + } else { + cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false); + gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true); + gpuMatrixC_d2h = + Matrix::create(paraC.height, paraC.width, paraC.trans, false); + } + cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false); + + /*matrix init*/ + hl_stream_t stream(HPPL_STREAM_1); + cpuMatrixA->randomizeUniform(); + cpuMatrixB->randomizeUniform(); + cpuMatrixC->randomizeUniform(); + + gpuMatrixA->copyFrom(*cpuMatrixA, stream); + gpuMatrixB->copyFrom(*cpuMatrixB, stream); + gpuMatrixC->copyFrom(*cpuMatrixC, stream); + + cpuDenseA->copyFrom(*cpuMatrixA); 
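+  // cpuDenseA/B/C mirror the (possibly sparse) inputs; their product gives the dense reference result.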
+ cpuDenseB->copyFrom(*cpuMatrixB); + cpuDenseC->copyFrom(*cpuMatrixC); + + hl_stream_synchronize(stream); + + /*matrix mul*/ + cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0); + gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0); + cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0); + + gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream); + hl_stream_synchronize(stream); + + /*check result*/ + if (paraC.sparse) { + checkSMatrixEqual( + std::dynamic_pointer_cast(cpuMatrixC), + std::dynamic_pointer_cast(gpuMatrixC_d2h)); + checkSMatrixEqual2Dense( + std::dynamic_pointer_cast(cpuMatrixC), + std::dynamic_pointer_cast(cpuDenseC)); + } else { + checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h); + checkMatrixEqual(cpuMatrixC, cpuDenseC); + } +} + +TEST(Matrix, SparseMatrixMul) { + const size_t DIM_M = 4; + const size_t DIM_N = 4; + const size_t DIM_K = 8; + const size_t NNZ = 5; + for (auto format : {SPARSE_CSC, SPARSE_CSR}) { + std::string str_format = format == SPARSE_CSC ? "CSC" : "CSR"; + LOG(INFO) << "test dense mul " << str_format; + test_sparse_matrix_mul( + {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format}, + {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format}, + {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}); + + LOG(INFO) << "test dense mul " << str_format << " trans"; + test_sparse_matrix_mul( + {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format}, + {DIM_N, DIM_K, /*trans*/ true, /*sparse*/ true, NNZ, format}, + {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}); + + LOG(INFO) << "test dense mul dense 2 " << str_format; + test_sparse_matrix_mul( + {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format}, + {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}, + {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format}); + + LOG(INFO) << "test denseT mul dense 2 " << str_format; + test_sparse_matrix_mul( + {DIM_K, DIM_M, /*trans*/ true, /*sparse*/ false, NNZ, format}, + {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}, + {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format}); + } +} + +TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) { + const size_t HEIGHT = 20; + const size_t WIDTH = 10; + const size_t WIDTH_TEST = 15; + MatrixPtr testMatrix( + new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 2, FLOAT_VALUE, SPARSE_CSR)); + MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH)); + testCpuMatrix->randomizeUniform(); + testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT); + + MatrixPtr testGpuMatrix = testMatrix->clone(HEIGHT, WIDTH, true); + hl_stream_t gpuStream(HPPL_STREAM_3); + testGpuMatrix->copyFrom(*testMatrix, gpuStream); + hl_stream_synchronize(gpuStream); + + MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST)); + mulCpuMatrix->randomizeUniform(); + MatrixPtr mulGpuMatrix(new GpuMatrix(WIDTH, WIDTH_TEST)); + mulGpuMatrix->copyFrom(*mulCpuMatrix); + MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)); + MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST)); + ret1->zeroMem(); + ret2->zeroMem(); + ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0); + ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0); + checkMatrixEqual(ret1, ret2); +} + +#endif + +TEST(Matrix, SparseMatrixTranspose) { + for (auto height : {10, 50, 100}) { + for (auto width : {10, 50, 100}) { + auto nnz = height * width; + for (auto valueType : {FLOAT_VALUE, NO_VALUE}) { + for (auto format : {SPARSE_CSR, SPARSE_CSC}) { + for (auto sparseRate : {0.1, 0.2, 0.5}) { + MatrixPtr matA = Matrix::createSparseMatrix( + height, 
width, size_t(nnz * sparseRate), valueType, format); + MatrixPtr matB(new CpuSparseMatrix( + width, height, size_t(nnz * sparseRate), valueType, format)); + matA->randomizeUniform(); + matA->transpose(matB, false); + + /*dense matrix transpose*/ + CpuMatrixPtr matC(new CpuMatrix(height, width)); + matC->copyFrom(*matA); + MatrixPtr matD(new CpuMatrix(width, height)); + matC->transpose(matD, false); + + /*check result*/ + checkSMatrixEqual2Dense( + std::dynamic_pointer_cast(matB), + std::dynamic_pointer_cast(matD)); + } + } + } + } + } +} + +TEST(Matrix, CpuSparseMatrixSubMatrix) { + const size_t HEIGHT = 10; + const size_t WIDTH = 10; + const size_t NNZ = HEIGHT * WIDTH; + for (auto valueType : {FLOAT_VALUE, NO_VALUE}) { + size_t startRow = 3; + size_t rowNum = 2; + real sparseRate = 0.1; + /*sparse matrix init and get subMatrix*/ + CpuSparseMatrixPtr matA = std::make_shared( + HEIGHT, WIDTH, size_t(NNZ * sparseRate), valueType, SPARSE_CSR); + matA->randomizeUniform(); + CpuSparseMatrixPtr matB = std::dynamic_pointer_cast( + matA->subMatrix(startRow, rowNum)); + + int start = matA->getRows()[startRow]; + int end = matA->getRows()[startRow + rowNum]; + + /*compare two matrix*/ + ASSERT_EQ(matB->getElementCnt(), size_t(end - start)); + if (valueType == FLOAT_VALUE) { + for (size_t i = 0; i < matB->getElementCnt(); i++) { + ASSERT_FLOAT_EQ(matB->getValue()[start + i], + matA->getValue()[start + i]); + } + } + + for (size_t i = 0; i < matB->getElementCnt(); i++) { + ASSERT_EQ(matB->getCols()[start + i], matA->getCols()[start + i]); + } + for (size_t i = 0; i < rowNum; i++) { + ASSERT_EQ(matB->getRows()[i], matA->getRows()[startRow + i]); + } + } +} + +void sparseValid( + int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) { + CHECK_EQ(nnz, size_t(major[majorLen - 1])); + CHECK_EQ(nnz, minorLen); + for (size_t i = 0; i < majorLen - 1; i++) { + EXPECT_LE(major[i], major[i + 1]); + for (int j = major[i]; j < major[i + 1] - 1; j++) { + EXPECT_LE(minor[j], minor[j + 1]); + } + } +} + +TEST(Matrix, CpuSparseMatrixRandUniform) { + const size_t HEIGHT = 5; + const size_t WIDTH = 10; + const size_t NNZ = HEIGHT * WIDTH; + int* major = nullptr; + int* minor = nullptr; + size_t majorLen = 0; + size_t minorLen = 0; + size_t nnz = 0; + for (auto valueType : {NO_VALUE, FLOAT_VALUE}) { + for (auto format : {SPARSE_CSR, SPARSE_CSC}) { + CpuSparseMatrixPtr matA = std::make_shared( + HEIGHT, WIDTH, size_t(NNZ * 0.1), valueType, format); + matA->randomizeUniform(); + nnz = matA->getElementCnt(); + if (format == SPARSE_CSR) { + majorLen = matA->getHeight() + 1; + minorLen = matA->getElementCnt(); + major = matA->getRows(); + minor = matA->getCols(); + } else { + majorLen = matA->getWidth() + 1; + minorLen = matA->getElementCnt(); + major = matA->getCols(); + minor = matA->getRows(); + } + sparseValid(major, minor, nnz, majorLen, minorLen); + } + } +} + +TEST(Matrix, CpuSparseMatrixCopyFrom) { + size_t height = 10; + size_t width = 8; + int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 30, 32}; + sparse_non_value_t data[32]; + for (size_t i = 0; i < 32; i++) { + data[i].col = ::rand() % width; + } + CpuSparseMatrixPtr mat = std::make_shared( + height, width, 32, NO_VALUE, SPARSE_CSR, false); + mat->copyFrom(indices, data); + + /*compare indices*/ + size_t sum = 0; + CHECK_EQ(sum, size_t(mat->getRows()[0])); + for (size_t i = 1; i < height + 1; i++) { + sum += indices[i] - indices[i - 1]; + CHECK_EQ(sum, size_t(mat->getRows()[i])); + } + CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - 
indices[0])); + for (size_t i = 0; i < mat->getElementCnt(); i++) { + CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col)); + } +} + +TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { + size_t height = 10; + size_t width = 8; + int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32}; + sparse_float_value_t data[32]; + int value[32] = { + 1, // row_0 : 1 + 5, 3, 1, 6, // row_1 : 4 + 0, 1, 2, 3, // row_3 : 4 + 4, 5, 6, 7, // row_4 : 4 + 2, 3, // row_5 : 2 + 3, 5, // row_6 : 2 + 0, 1, // row_7 : 2 + 0, 1, 2, 3, 4, 5, 6, 7, // row_8 : 8 + 2, 4, 7, 3, 1 // row_9 : 5 + }; + for (size_t i = 0; i < 32; i++) { + data[i].col = value[i]; + data[i].value = float(value[i]); + } + CpuSparseMatrixPtr mat = std::make_shared( + height, width, 32, FLOAT_VALUE, SPARSE_CSR, false); + mat->copyFrom(indices, data); + + /*compare indices*/ + size_t sum = 0; + CHECK_EQ(sum, size_t(mat->getRows()[0])); + for (size_t i = 1; i < height + 1; i++) { + sum += indices[i] - indices[i - 1]; + CHECK_EQ(sum, size_t(mat->getRows()[i])); + } + CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0])); + for (size_t i = 0; i < mat->getElementCnt(); i++) { + CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col)); + } + + size_t trimedWidth = 4; + int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19}; + sparse_float_value_t trimedData[19]; + int trimedValue[19] = { + 1, // row_0 : 1 + 3, + 1, // row_1 : 2 + 0, + 1, + 2, + 3, // row_3 : 4 + 2, + 3, // row_5 : 2 + 3, // row_6 : 1 + 0, + 1, // row_7 : 2 + 0, + 1, + 2, + 3, // row_8 : 4 + 2, + 3, + 1 // row_9 : 3 + }; + for (size_t i = 0; i < 19; i++) { + trimedData[i].col = trimedValue[i]; + trimedData[i].value = float(trimedValue[i]); + } + CpuSparseMatrixPtr matA = std::make_shared( + height, trimedWidth, 19, FLOAT_VALUE, SPARSE_CSR, false); + matA->copyFrom(trimedIndices, trimedData); + + /*compare indices*/ + sum = 0; + CHECK_EQ(sum, size_t(matA->getRows()[0])); + for (size_t i = 1; i < height + 1; i++) { + sum += trimedIndices[i] - trimedIndices[i - 1]; + CHECK_EQ(sum, size_t(matA->getRows()[i])); + } + CHECK_EQ(matA->getElementCnt(), + size_t(trimedIndices[height] - trimedIndices[0])); + for (size_t i = 0; i < matA->getElementCnt(); i++) { + CHECK_EQ(size_t(matA->getCols()[i]), size_t(trimedData[i].col)); + } + + CpuSparseMatrixPtr matB = std::make_shared( + height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, false); + matB->trimFrom(*mat); + checkSMatrixEqual2(matA, matB); + +#ifdef PADDLE_WITH_CUDA + GpuSparseMatrixPtr matC = std::make_shared( + height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true); + matC->trimFrom(*mat); + + CpuSparseMatrixPtr matD = + std::make_shared(height, + trimedWidth, + matC->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSR, + false); + matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + checkSMatrixEqual2(matA, matD); +#endif +} + +TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { + size_t height = 8; + size_t width = 10; + int indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32}; + int value[32] = { + 1, // col_0 : 1 + 5, 3, 1, 6, // col_1 : 4 + 0, 1, 2, 3, // col_3 : 4 + 4, 5, 6, 7, // col_4 : 4 + 2, 3, // col_5 : 2 + 3, 5, // col_6 : 2 + 0, 1, // col_7 : 2 + 0, 1, 2, 3, 4, 5, 6, 7, // col_8 : 8 + 2, 4, 7, 3, 1 // col_9 : 5 + }; + std::vector rows(value, value + 32); + std::vector cols(indices, indices + 11); + std::vector values(value, value + 32); + CpuSparseMatrixPtr mat = std::make_shared( + height, width, 32, FLOAT_VALUE, SPARSE_CSC, false); + mat->copyFrom(rows, cols, 
values); + + /*compare indices*/ + size_t sum = 0; + CHECK_EQ(sum, size_t(mat->getCols()[0])); + for (size_t i = 1; i < width + 1; i++) { + sum += indices[i] - indices[i - 1]; + CHECK_EQ(sum, size_t(mat->getCols()[i])); + } + CHECK_EQ(mat->getElementCnt(), size_t(indices[width] - indices[0])); + for (size_t i = 0; i < mat->getElementCnt(); i++) { + CHECK_EQ(size_t(mat->getRows()[i]), size_t(value[i])); + } + + size_t trimedWidth = 5; + int trimedIndices[6] = {0, 1, 5, 5, 9, 13}; + int trimedValue[13] = { + 1, // col_0 : 1 + 5, + 3, + 1, + 6, // col_1 : 4 + 0, + 1, + 2, + 3, // col_3 : 4 + 4, + 5, + 6, + 7 // col_4 : 4 + }; + std::vector rowsA(trimedValue, trimedValue + 13); + std::vector colsA(trimedIndices, trimedIndices + 6); + std::vector valuesA(trimedValue, trimedValue + 13); + CpuSparseMatrixPtr matA = std::make_shared( + height, trimedWidth, 13, FLOAT_VALUE, SPARSE_CSC, false); + matA->copyFrom(rowsA, colsA, valuesA); + + /*compare indices*/ + sum = 0; + CHECK_EQ(sum, size_t(matA->getCols()[0])); + for (size_t i = 1; i < trimedWidth + 1; i++) { + sum += trimedIndices[i] - trimedIndices[i - 1]; + CHECK_EQ(sum, size_t(matA->getCols()[i])); + } + CHECK_EQ(matA->getElementCnt(), + size_t(trimedIndices[trimedWidth] - trimedIndices[0])); + for (size_t i = 0; i < matA->getElementCnt(); i++) { + CHECK_EQ(size_t(matA->getRows()[i]), size_t(rowsA[i])); + } + + CpuSparseMatrixPtr matB = std::make_shared( + height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, false); + matB->trimFrom(*mat); + checkSMatrixEqual2(matA, matB); + +#ifdef PADDLE_WITH_CUDA + GpuSparseMatrixPtr matC = std::make_shared( + height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true); + matC->trimFrom(*mat); + + CpuSparseMatrixPtr matD = + std::make_shared(height, + trimedWidth, + matC->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSC, + false); + matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + checkSMatrixEqual2(matA, matD); +#endif +} diff --git a/paddle/legacy/math/tests/test_Tensor.cu b/paddle/legacy/math/tests/test_Tensor.cu new file mode 100644 index 0000000000000000000000000000000000000000..3ce056d66140059be8145f7f49bb80cbff4686eb --- /dev/null +++ b/paddle/legacy/math/tests/test_Tensor.cu @@ -0,0 +1,1162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "TensorCheck.h" +#include "paddle/legacy/math/Matrix.h" + +using paddle::Matrix; +using paddle::CpuMatrix; +using paddle::GpuMatrix; +using paddle::CpuVector; +using paddle::GpuVector; +using paddle::CpuIVector; +using paddle::GpuIVector; +using autotest::TensorCheckEqual; +using autotest::TensorCheckErr; + +#define INIT_UNARY(A1, A2) \ + Tensor A1(height, width); \ + Tensor A2(height, width); \ + A1.randomizeUniform(); \ + A2.copyFrom(A1) +#define INIT_BINARY(A1, A2, B) \ + INIT_UNARY(A1, A2); \ + Tensor B(height, width); \ + B.randomizeUniform() +#define INIT_TERNARY(A1, A2, B, C) \ + INIT_BINARY(A1, A2, B); \ + Tensor C(height, width); \ + C.randomizeUniform() +#define INIT_QUATERNARY(A1, A2, B, C, D) \ + INIT_TERNARY(A1, A2, B, C); \ + Tensor D(height, width); \ + D.randomizeUniform() + +template +struct TestUnaryMatrix { + typedef std::function UnaryFunc; + + explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_UNARY(A1, A2); + testUnaryFunc(A1, A2); + } + } + } +}; + +template +struct TestBinaryMatrix { + typedef std::function BinaryFunc; + + explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_BINARY(A1, A2, B); + testBinaryFunc(A1, A2, B); + } + } + } +}; + +template +struct TestTernaryMatrix { + typedef std::function + TernaryFunc; + + explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_TERNARY(A1, A2, B, C); + testTernaryFunc(A1, A2, B, C); + } + } + } +}; + +template +struct TestQuaternaryMatrix { + typedef std::function + QuaternaryFunc; + + explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_QUATERNARY(A1, A2, B, C, D); + testQuaternaryFunc(A1, A2, B, C, D); + } + } + } +}; + +template +struct TestUnaryVectorT { + typedef std::function UnaryFunc; + + explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) { + for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) { + LOG(INFO) << " size=" << size; + Tensor A1(size); + Tensor A2(size); + if (typeid(T) == typeid(real)) { + A1.rand(); + } else { + A1.rand(1000); + } + A2.copyFrom(A1); + testUnaryFunc(A1, A2); + } + } +}; + +void SetTensorValue(Matrix& matrix, real value) { + int height = matrix.getHeight(); + int width = matrix.getWidth(); + int stride = matrix.getStride(); + real* data = matrix.getData(); + for (int i = 0; i < height; i++) { + int j = rand() % width; // NOLINT + if (typeid(matrix) == typeid(CpuMatrix)) { + data[i * stride + j] = value; + } else if (typeid(matrix) == typeid(GpuMatrix)) { + hl_memcpy(&data[i * stride + j], &value, sizeof(real)); + } else { + } + } +} + +template +void testTensorAddScalar(Tensor& A1, Tensor& A2) { + real p1 = 2.5; + real p2 = 3.0; + A1.add(p1); // a += p + A2 += p1; + TensorCheckEqual(A1, A2); + + A1.add(p1, p2); // a = a * p1 + p2 + A2 = A2 * p1 + p2; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSubScalar(Tensor& A1, Tensor& A2) { + real p = 2.5; + 
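+  // Each test applies the member-function form to A1 and the equivalent lazy tensor expression to A2, then checks that both results agree.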
A1.subScalar(p); // a -= p + A2 -= p; + TensorCheckEqual(A1, A2); +} + +template +void testTensorMulScalar(Tensor& A1, Tensor& A2) { + real p = 2.5; + A1.mulScalar(p); // a *= p + A2 *= p; + TensorCheckEqual(A1, A2); + + real learningRate = 0.7f; + real decayRate = 1.2f; + A1.applyL2(learningRate, decayRate); + A2 = A2 * (1.0f / (1.0f + learningRate * decayRate)); + TensorCheckEqual(A1, A2); +} + +template +void testTensorDivScalar(Tensor& A1, Tensor& A2) { + real p = 2.5; + A1.divScalar(p); // a /= p + A2 /= p; + TensorCheckEqual(A1, A2); +} + +template +void testTensorNeg(Tensor& A1, Tensor& A2) { + A1.neg(); // a = -a + A2 = -A2; + TensorCheckEqual(A1, A2); +} + +template +void testTensorAbs(Tensor& A1, Tensor& A2) { + A1.abs2(); // a = a > 0 ? a : -a + A2 = A2.abs(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSquare(Tensor& A1, Tensor& A2) { + A1.square2(); // a = a * a + A2 = A2.square(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocal(Tensor& A1, Tensor& A2) { + A1.reciprocal2(); // a = 1.0f / a + A2 = A2.reciprocal(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSign(Tensor& A1, Tensor& A2) { + A1.sign2(); // a = (a > 0) - (a < 0) + A2 = A2.sign(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorAssign(Tensor& A1, Tensor& A2) { + A1.assign(1.5); // a = p + A2 = A2.constant(1.5); + TensorCheckEqual(A1, A2); + + A1.one(); // a = 1 + A2 = A2.constant(1.0); + TensorCheckEqual(A1, A2); + + A1.zero(); // a = 0 + A2 = A2.constant(0.0); + TensorCheckEqual(A1, A2); +} + +template +void testUnaryBaseOp(Tensor& A1, Tensor& A2) { + testTensorAddScalar(A1, A2); + testTensorSubScalar(A1, A2); + testTensorMulScalar(A1, A2); + testTensorDivScalar(A1, A2); + testTensorNeg(A1, A2); + testTensorAbs(A1, A2); + testTensorSquare(A1, A2); + testTensorReciprocal(A1, A2); + testTensorSign(A1, A2); + testTensorAssign(A1, A2); +} + +template +void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { + A1.add(2); // a += p + A2 += 2; + TensorCheckEqual(A1, A2); + + A1.add(3, 2); // a = a * p1 + p2 + A2 = A2 * 3 + 2; + TensorCheckEqual(A1, A2); + + testTensorNeg(A1, A2); + testTensorAbs(A1, A2); +} + +TEST(Unary, BaseOp) { + TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); + TestUnaryVectorT testCpuVector(testUnaryBaseOp); + TestUnaryVectorT testCpuIVector( + testUnaryBaseOpInt); + +#ifdef PADDLE_WITH_GPU + TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); + TestUnaryVectorT testGpuVector(testUnaryBaseOp); + TestUnaryVectorT testGpuIVector( + testUnaryBaseOpInt); +#endif +} + +template +void testTensorExp(Tensor& A1, Tensor& A2) { + A1.exp2(); // a = exp(a) + A2 = A2.exp(); + TensorCheckErr(A1, A2); +} + +template +void testTensorLog(Tensor& A1, Tensor& A2) { + A1.log2(); // a = log(a) + A2 = A2.log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSqrt(Tensor& A1, Tensor& A2) { + A1.sqrt2(); // a = sqrt(a) + A2 = A2.sqrt(); + TensorCheckErr(A1, A2); +} + +template +void testTensorPow(Tensor& A1, Tensor& A2) { + A1.pow2(3.2); // a = pow(a, p) + A2 = A2.pow(3.2); + TensorCheckErr(A1, A2); +} + +template +void testUnayrMathOp(Tensor& A1, Tensor& A2) { + testTensorExp(A1, A2); + testTensorLog(A1, A2); + testTensorSqrt(A1, A2); + testTensorPow(A1, A2); +} + +TEST(Unary, MathOp) { + TestUnaryMatrix testCpu(testUnayrMathOp); + +#ifdef PADDLE_WITH_GPU + TestUnaryMatrix testGpu(testUnayrMathOp); +#endif +} + +template +void testTensorClip(Tensor& A1, Tensor& A2) { + real p1 = 0.003f; + real p2 = 0.877f; + A1.clip(p1, p2); // a = a < p1 ? p1 : (a > p2 ? 
p2 : a) + // A2 = A2.min(0.877f).max(0.003f); + A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2)); + TensorCheckEqual(A1, A2); +} + +template +void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { + real p = 0.5f; + A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f + A2 = (A2 > p).condition((real)1.0, (real)0.0); + TensorCheckEqual(A1, A2); +} + +template +void testTensorapplyL1(Tensor& A1, Tensor& A2) { + /** + * T lambda = p; + * a = (a > lambda) ? (a - lambda) + * : (a < -lambda) ? (a + lambda) : 0 + * + * p = learningRate * decayRate; + */ + real learningRate = 0.7f; + real decayRate = 0.6f; + A1.applyL1(learningRate, decayRate); + A2 = (A2 > (learningRate * decayRate)) + .condition( + (A2 - (learningRate * decayRate)), + (A2 < -(learningRate * decayRate)) + .condition((A2 + (learningRate * decayRate)), (real)0.0)); + TensorCheckEqual(A1, A2); +} + +template +void testUnayrCompareOp(Tensor& A1, Tensor& A2) { + testTensorClip(A1, A2); + testTensorBiggerThanScalar(A1, A2); + + A1.randomizeUniform(); + A1.subScalar(0.5f); + A2.copyFrom(A1); + testTensorapplyL1(A1, A2); +} + +TEST(Unary, CompareOp) { + TestUnaryMatrix testCpu(testUnayrCompareOp); + +#ifdef PADDLE_WITH_GPU + TestUnaryMatrix testGpu(testUnayrCompareOp); +#endif +} + +template +void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { + real p1 = 2.5; + real p2 = 3.2; + A1.add(B); // a += b + A2 += B; + TensorCheckEqual(A1, A2); + + A1.add(B, p1); // a += b * p + A2 += B * p1; + TensorCheckEqual(A1, A2); + + A1.add(B, p1, p2); // a = p1 * a + p2 * b + A2 = A2 * p1 + B * p2; + TensorCheckEqual(A1, A2); + + A1.addScalar(B, p1); // a = b + p + A2 = B + p1; + TensorCheckEqual(A1, A2); + + A1.addSquare(B, p1); // a += p * b * b + A2 += B.constant(p1) * B * B; + TensorCheckEqual(A1, A2); + + A1.decayAddSquare(B, p1, p2); // a = p1 * a + p2 * b * b + A2 = A2 * p1 + B.constant(p2) * B * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 2.5; + A1.sub(B); // a -= b + A2 -= B; + TensorCheckEqual(A1, A2); + + A1.sub(B, p); // a -= b * p + A2 -= B * p; + TensorCheckEqual(A1, A2); + + A1.subScalar(B, p); // a = b - p + A2 = B - p; + TensorCheckEqual(A1, A2); +} + +template +void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 2.5; + A1.mulScalar(B, p); // a = b * p + A2 = B * p; + TensorCheckEqual(A1, A2); + + A1.dotMulSquare(B); // a *= b * b + A2 *= B * B; + TensorCheckEqual(A1, A2); + + A1.dotSquareMul(B); // a = a * a * b + A2 = A2 * A2 * B; + TensorCheckEqual(A1, A2); + + A1.dotMul(B); // a *= b + A2 *= B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 2.5; + A1.divScalar(B, p); // a = b / p + A2 = B / p; + TensorCheckEqual(A1, A2); + + A1.scalarDiv(B, p); // a = p / b + A2 = B.constant(p) / B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { + A1.assign(B); // a = b + A2 = B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { + B.square2(A1); // b = a * a + A2 = B.square(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.squareDerivative(B); // a *= 2.0 * b + A2 = A2 * (real)2.0 * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { + B.reciprocal2(A1); // b = 1.0f / a + A2 = B.reciprocal(); + TensorCheckEqual(A1, A2); + + real p1 = 0.58; + real p2 = 
0.32; + A1.reciprocal2(B, p1, p2); // a = 1 / (p1 * b + p2) + A2 = (B * p1 + p2).reciprocal(); + TensorCheckEqual(A1, A2); + + real learningRate = 0.7f; + real decayRate = 1.2f; + A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) + A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) + .reciprocal(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.reciprocalDerivative(B); // a *= -b * b + A2 *= (-B) * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { + B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f + A2 = B.sign(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { + B.abs2(A1); // b = a > 0.0f ? a : -a + A2 = B.abs(); + TensorCheckEqual(A1, A2); +} + +template +void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { + testTensorAdd(A1, A2, B); + testTensorSub(A1, A2, B); + testTensorMul(A1, A2, B); + testTensorDiv(A1, A2, B); + testTensorSquare(A1, A2, B); + testTensorSquareDerivative(A1, A2, B); + testTensorReciprocal(A1, A2, B); + testTensorReciprocalDerivative(A1, A2, B); + testTensorAbs(A1, A2, B); + testTensorSign(A1, A2, B); + testTensorAssign(A1, A2, B); +} + +TEST(Binary, BaseOp) { + TestBinaryMatrix testCpu(testBinaryBaseOp); + +#ifdef PADDLE_WITH_GPU + TestBinaryMatrix testGpu(testBinaryBaseOp); +#endif +} + +template +void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { + // a = exp(b) + A1.exp2(B); + A2 = B.exp(); + TensorCheckErr(A1, A2); +} + +template +void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.expDerivative(B); // a *= b + A2 *= B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { + // a = log(b) + A1.log2(B); + A2 = B.log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { + // a = sqrt(b) + A1.sqrt2(B); + A2 = B.sqrt(); + TensorCheckErr(A1, A2); +} + +template +void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { + // a = 1.0f / sqrt(b) + A1.invSqrt(B); + A2 = B.sqrt().reciprocal(); + TensorCheckErr(A1, A2); +} + +template +void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { + A1.pow2(B, 2.5f); // a = pow(b, p) + A2 = B.pow(2.5f); + TensorCheckErr(A1, A2); +} + +template +void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { + /* + * const T THRESHOLD = 40.0; + * b = log(1.0 + + * exp((a > THRESHOLD) ? THRESHOLD + * : ((a < -THRESHOLD) ? (-THRESHOLD) : a))) + */ + B.softrelu(A1); + + real THRESHOLD = 40.0; + A2 = (B.constant(1.0f) + + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) + .exp()) + .log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + /* + * const T THRESHOLD = 40.0; + * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) + * ? THRESHOLD + * : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); + */ + A1.softreluDerivative(B); + real THRESHOLD = 40.0; + A2 = A2 * + (B.constant(1.0f) - + (B.constant(-1.0f) * + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) + .exp()); + TensorCheckErr(A1, A2); +} + +template +void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { + /* + const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? 
THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))) + */ + B.sigmoid(A1); + + const real THRESHOLD_MIN = -40.0; + const real THRESHOLD_MAX = 13.0; + auto tmp = (B < THRESHOLD_MIN) + .condition(THRESHOLD_MIN, + (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); + A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.sigmoidDerivative(B); // a *= b * (1 - b) + A2 *= B * (B.constant(1.0f) - B); + TensorCheckEqual(A1, A2); +} + +template +void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { + B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 + A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; + TensorCheckErr(A1, A2); +} + +template +void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.tanhDerivative(B); // a *= 1 - b * b + A2 *= B.constant(1.0f) - B * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { + real p1 = 2.5; + real p2 = 3.1; + // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) + B.scaledTanh(A1, p1, p2); + A2 = B.constant(p1) * + (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - + (real)1.0); + TensorCheckErr(A1, A2); +} + +template +void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + real p1 = 2.5; + real p2 = 3.1; + // a *= (p2 / p1) * (p1 * p1 - b * b)); + A1.scaledTanhDerivative(B, p1, p2); + A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B)); + TensorCheckEqual(A1, A2); +} + +template +void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { + testTensorTanhDerivative(A1, A2, B); + testTensorScaledTanhDerivative(A1, A2, B); + testTensorSigmoidDerivative(A1, A2, B); + testTensorExpDerivative(A1, A2, B); + testTensorScaledTanh(A1, A2, B); + testTensorTanh(A1, A2, B); + testTensorExp(A1, A2, B); + testTensorLog(A1, A2, B); + testTensorSqrt(A1, A2, B); + testTensorInvSqrt(A1, A2, B); + testTensorPow(A1, A2, B); + + testTensorSoftrelu(A1, A2, B); + testTensorSoftreluDerivative(A1, A2, B); + testTensorSigmoid(A1, A2, B); +} + +TEST(Binary, MathOp) { + TestBinaryMatrix testCpu(testBinaryMathOp); + +#ifdef PADDLE_WITH_GPU + TestBinaryMatrix testGpu(testBinaryMathOp); +#endif +} + +template +void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { + B.relu(A1); // b = a > 0.0f ? a : 0.0f + A2 = (B > (real)0.0f).condition(B, (real)0.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) + A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); + TensorCheckEqual(A1, A2); +} + +template +void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { + /* + * b = a > p1 ? a : p1 + * b = b < p2 ? b : p2 + * int p1 = 0, p2 = 24; + */ + SetTensorValue(B, 32.0f); + B.brelu(A1); + auto tmp = (B > (real)0.0f).condition(B, (real)0.0f); + A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + SetTensorValue(B, 32.0f); + /* + * a *= (b > p1 && b < p2) ? 1.0 : 0.0 + * int p1 = 0, p2 = 24; + */ + A1.breluDerivative(B); + A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? 
-a : 0 + A2 = (B > (real)0.0f) + .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); + TensorCheckEqual(A1, A2); +} + +template +void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 0.613; + SetTensorValue(B, p); + A1.isEqualTo(B, p); // a = (b == p) + A2 = (B == p); + TensorCheckEqual(A1, A2); +} + +template +void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { + /** + * T lambda = p * b; + * a = (a > lambda) ? (a - lambda) + * : (a < -lambda) ? (a + lambda) : 0 + * + * p = learningRate * decayRate; + */ + real learningRate = 0.7f; + real decayRate = 0.6f; + A1.applyL1(B, learningRate, decayRate); + auto lambda = B.constant(learningRate * decayRate) * B; + A2 = (A2 > lambda) + .condition((A2 - lambda), + (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); + TensorCheckEqual(A1, A2); +} + +template +void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { + B.subScalar(0.5f); + SetTensorValue(B, 0.0f); + testTensorReluDerivative(A1, A2, B); + + A1.randomizeUniform(); + A2.copyFrom(A1); + testTensorBreluDerivative(A1, A2, B); + + testTensorAbsDerivative(A1, A2, B); + testTensorRelu(A1, A2, B); + testTensorBrelu(A1, A2, B); + testTensorIsEqualTo(A1, A2, B); +} + +TEST(Binary, CompareOp) { + TestBinaryMatrix testCpu(testBinaryCompareOp); + +#ifdef PADDLE_WITH_GPU + TestBinaryMatrix testGpu(testBinaryCompareOp); +#endif +} + +template +void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.add(B, C); // a = b + c + A2 = B + C; + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + real p3 = 3.8; + A1.add(B, p1, C, p2); // a = p1 * b + p2 * c + A2 = B * p1 + C * p2; + TensorCheckEqual(A1, A2); + + A1.add2(B, C); // a = a + b + c + A2 = A2 + B + C; + TensorCheckEqual(A1, A2); + + A1.add2(B, C, p1, p2, p3); // a = p1 * a + p2 * b + p3 * c + A2 = A2 * p1 + B * p2 + C * p3; + TensorCheckEqual(A1, A2); + + A1.decayAddSquareMul(B, C, p1, p2); // a = p1 * a + p2 * b * b * c * c + A2 = A2 * p1 + B.constant(p2) * B * B * C * C; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.sub(B, C); // a = b - c + A2 = B - C; + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + A1.sub(B, p1, C, p2); // a = p1 * b - p2 * c + A2 = B * p1 - C * p2; + TensorCheckEqual(A1, A2); +} + +template +void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.dotMul(B, C); // a = b * c + A2 = B * C; + TensorCheckEqual(A1, A2); + + A1.dotMulSquare(B, C); // a = b * c * c + A2 = B * C * C; + TensorCheckEqual(A1, A2); + + A1.dotSquareSquare(B, C); // a = b * b * c * c + A2 = B * B * C * C; + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + + /* + * T tmp = p1 * b + p2 * c; + * a *= tmp * tmp + */ + A1.dotMulSquareSum(B, C, p1, p2); + auto tmp = B * p1 + C * p2; + A2 *= tmp * tmp; + TensorCheckEqual(A1, A2); + + /* + * T tmp = p1 * b + p2 * c; + * a = tmp * tmp + */ + A1.dotSquareSum(B, C, p1, p2); + auto tmp2 = B * p1 + C * p2; + A2 = tmp2 * tmp2; + TensorCheckEqual(A1, A2); + + // a *= p1 * b + p2 * c + A1.dotMulSum(B, C, p1, p2); + A2 *= B * p1 + C * p2; + TensorCheckEqual(A1, A2); + + // a = p1 * a + p2 * b * c + A1.addDotMul(B, C, p1, p2); + A2 = A2 * p1 + B.constant(p2) * B * C; + TensorCheckEqual(A1, A2); +} + +template +void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.dotDiv(B, C); // a = (b == 0.0) ? 
0.0 : b / c + A2 = (B == (real)0.0).condition((real)0.0, B / C); + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + A1.dotDiv(B, C, p1, p2); // a = (b + p1) / (c + p2) + A2 = (B + p1) / (C + p2); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + real p1 = 1.5; + real p2 = 2.5; + real p3 = 3.5; + A1.reciprocalSum(B, C, p1, p2, p3); // a = 1 / (p1 * b + p2 * c + p3) + A2 = (B * p1 + C * p2 + p3).reciprocal(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) + A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSoftCrossEntropyBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + A1.softCrossEntropyBp(B, C); // a += (b - c) / (b * (1 - b)) + A2 += (B - C) / (B * (B.constant(1.0f) - B)); + TensorCheckEqual(A1, A2); +} + +template +void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + testTensorAdd(A1, A2, B, C); + testTensorSub(A1, A2, B, C); + testTensorMul(A1, A2, B, C); + testTensorDiv(A1, A2, B, C); + testTensorReciprocal(A1, A2, B, C); + testTensorSoftCrossEntropyBp(A1, A2, B, C); + + testTensorSoftCrossEntropy(A1, A2, B, C); +} + +TEST(Ternary, BaseOp) { + TestTernaryMatrix testCpu(testTernaryBaseOp); + +#ifdef PADDLE_WITH_GPU + TestTernaryMatrix testGpu(testTernaryBaseOp); +#endif +} + +template +void testTensorBinaryLabelCrossEntropy(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) + A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); + TensorCheckErr(A1, A2); +} + +template +void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) + A1.binaryLabelCrossEntropyBp(B, C); + A2 += (C > (real)0.5) + .condition((B.constant(-1.0f) / B), + (B.constant(1.0f) - B).reciprocal()); + TensorCheckErr(A1, A2); +} + +template +void testTensorLogisticRegressionLoss(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + SetTensorValue(B, 50.0f); + SetTensorValue(B, -50.0f); + /** + * const T THRESHOLD = 40.0; + * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) + * ? -THRESHOLD + * : b; + * a = log(1 + exp(x)) - c * x + */ + A1.logisticRegressionLoss(B, C); + real THRESHOLD = 40.0; + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; + TensorCheckErr(A1, A2); +} + +template +void testTensorLogisticRegressionLossBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + SetTensorValue(B, 50.0f); + SetTensorValue(B, -50.0f); + /** + * const T THRESHOLD = 40.0; + * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) + * ? -THRESHOLD + * : b; + * x = exp(x); a = x / (1 + x) - c + */ + A1.logisticRegressionLossBp(B, C); + real THRESHOLD = 40.0; + auto tmp = + (B > THRESHOLD) + .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp2 = tmp.exp(); + A2 = tmp2 / (C.constant(1.0) + tmp2) - C; + TensorCheckErr(A1, A2); +} + +template +void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.biggerThan(B, C); // a = (b > c) ? 
1.0f : 0.0f + A2 = (B > C).condition((real)1.0f, (real)0.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.max2(B, C); // a = (b > c) ? b : c + A2 = (B > C).condition(B, C); + TensorCheckEqual(A1, A2); +} + +template +void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); + testTensorBinaryLabelCrossEntropy(A1, A2, B, C); + testTensorBiggerThan(A1, A2, B, C); + testTensorMax(A1, A2, B, C); + + testTensorLogisticRegressionLoss(A1, A2, B, C); + testTensorLogisticRegressionLossBp(A1, A2, B, C); +} + +TEST(Ternary, CompareOp) { + TestTernaryMatrix testCpu(testTernaryCompareOp); + +#ifdef PADDLE_WITH_GPU + TestTernaryMatrix testGpu(testTernaryCompareOp); +#endif +} + +template +void testQuaternaryAdd( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { + // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d + // A2 = B * 1.5f + C * 2.5f + D * 3.5f; + // TensorCheckEqual(A1, A2); + + /* + * T tmp = p1 * b + p2 * c + p3 * d; + * a += tmp * tmp + */ + real p1 = 1.5f; + real p2 = 2.5f; + real p3 = 3.5f; + A1.addSquareSum(B, C, D, p1, p2, p3); + auto tmp = B * p1 + C * p2 + D * p3; + A2 += tmp * tmp; + TensorCheckEqual(A1, A2); +} + +TEST(Quaternary, BaseOp) { + TestQuaternaryMatrix testCpu(testQuaternaryAdd); + +#ifdef PADDLE_WITH_GPU + TestQuaternaryMatrix testGpu(testQuaternaryAdd); +#endif +} + +template +void testTensorBiggerThan( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { + // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); + A1.biggerThan(B, C, D); + A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) + .condition((real)1.0, (real)0.0); + TensorCheckEqual(A1, A2); +} + +template +void testTensorRankLoss( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { + /** + * const T THRESHOLD = 40.0; a = b - c; + * a = (a > THRESHOLD) + * ? THRESHOLD + * : ((a < -THRESHOLD) ? (-THRESHOLD) : a); + * a = log(1 + exp(a)) - a * d + */ + A1.rankLoss(B, C, D); + + real THRESHOLD = 40.0; + auto tmp = B - C; + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; + + TensorCheckErr(A1, A2); +} + +template +void testTensorRankLossBp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { + /** + * const T THRESHOLD = 40.0; a = b - c; + * a = (a > THRESHOLD) + * ? THRESHOLD + * : ((a < -THRESHOLD) ? 
(-THRESHOLD) : a); + * a = exp(a); a = (a / (1 + a) - d) + */ + A1.rankLossBp(B, C, D); + real THRESHOLD = 40.0; + auto tmp = B - C; + auto tmp2 = + (tmp > THRESHOLD) + .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp3 = tmp2.exp(); + A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; + + TensorCheckErr(A1, A2); +} + +template +void testQuaternaryCompareOp( + Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { + testTensorBiggerThan(A1, A2, B, C, D); + testTensorRankLoss(A1, A2, B, C, D); + testTensorRankLossBp(A1, A2, B, C, D); +} + +TEST(Quaternary, CompareOp) { + TestQuaternaryMatrix testCpu(testQuaternaryCompareOp); + +#ifdef PADDLE_WITH_GPU + TestQuaternaryMatrix testGpu(testQuaternaryCompareOp); +#endif +} diff --git a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..214ae8971ae953ce0266f03dc3bba8c6160f1cf6 --- /dev/null +++ b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp @@ -0,0 +1,461 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "OriginalOptimizerApi.h" +#include "PerfUtils.h" +#include "TensorCheck.h" +#include "paddle/legacy/math/TrainingAlgorithmOp.h" +#include "paddle/legacy/utils/Util.h" + +using namespace paddle; // NOLINT + +#ifndef PADDLE_TYPE_DOUBLE +DEFINE_double(max_diff, 1e-5, "max diff allowed"); +#else +DEFINE_double(max_diff, 1e-13, "max diff allowed"); +#endif + +class SetMaxDiff { + public: + explicit SetMaxDiff(double max_diff) { + max_diff_ = FLAGS_max_diff; + FLAGS_max_diff = max_diff; + } + ~SetMaxDiff() { FLAGS_max_diff = max_diff_; } + + private: + double max_diff_; +}; + +#define COPY_VECTOR_TO_CPU(cpuVec, vector) \ + do { \ + if (vector->useGpu()) { \ + cpuVec = Vector::create(vector->getSize(), false); \ + cpuVec->copyFrom(*vector); \ + } else { \ + cpuVec = vector; \ + } \ + } while (0) + +int VectorCheckErr(const Vector& vector1, const Vector& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); + + const real* data1 = vector1.getData(); + const real* data2 = vector2.getData(); + size_t size = vector1.getSize(); + int count = 0; + for (size_t i = 0; i < size; i++) { + real a = data1[i]; + real b = data2[i]; + if (fabs(a - b) > FLAGS_max_diff) { + if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) { + count++; + } + } + } + + return count; +} + +int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) { + VectorPtr tmp1; + VectorPtr tmp2; + COPY_VECTOR_TO_CPU(tmp1, vector1); + COPY_VECTOR_TO_CPU(tmp2, vector2); + return VectorCheckErr(*tmp1, *tmp2); +} + +#ifdef PADDLE_DISABLE_TIMER + +#define CHECK_VECTORPTR(vector1, vector2) \ + EXPECT_EQ(VectorCheckErr(vector1, vector2), 0) + +#else + +#define CHECK_VECTORPTR(vector1, vector2) + +#endif + +typedef std::function testMatrixFunc; + +void testCase(testMatrixFunc matrixFunc) { +#ifdef PADDLE_WITH_CUDA + for (auto useGpu : {false, true}) { +#else + 
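+  // CPU-only build: exercise just the useGpu == false path.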
for (auto useGpu : {false}) { +#endif + for (auto size : {1, + 32, + 64, + 128, + 512, + 1024, + 4096, + 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152}) { + LOG(INFO) << " size=" << size << " useGpu=" << useGpu; + matrixFunc(size, useGpu); + } + } +} + +#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \ + vec1[type] = Vector::create(size, useGpu); \ + vec2[type] = Vector::create(size, useGpu); \ + vec1[type]->rand(); \ + vec2[type]->copyFrom(*vec1[type]); + +void testAdagrad(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(AdagradParameterOptimizer( + bufs1, epsilon, learningRate, momentum, decayRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(adagradApply(value, + grad, + mom, + accum_buffer, + accum, + lr, + epsilon, + learningRate, + momentum, + decayRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], + bufs2[PARAMETER_GRADIENT_SQURESUM1]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, Adagrad) { testCase(testAdagrad); } + +void testAdaDelta(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + real rou = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer( + bufs1, rou, epsilon, learningRate, momentum, decayRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(adadeltaApply(value, + grad, + mom, + accum, + 
accum_update, + lr, + rou, + epsilon, + learningRate, + momentum, + decayRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], + bufs2[PARAMETER_GRADIENT_SQURESUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], + bufs2[PARAMETER_GRADIENT_SQURESUM1]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, AdaDelta) { testCase(testAdaDelta); } + +template <bool isFirstTime> +void testRMSProp(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + /* make sure 'g - f.square()' greater than 0 */ + bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0); + bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom( + *bufs1[PARAMETER_GRADIENT_SQURESUM]); + + real rou = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + real accumulatedRou = rou; + + EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1, + accumulatedRou, + rou, + epsilon, + learningRate, + momentum, + decayRate, + isFirstTime)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(rmspropApply(value, + grad, + mom, + sum, + sum1, + lr, + accumulatedRou, + rou, + epsilon, + learningRate, + momentum, + decayRate, + isFirstTime)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], + bufs2[PARAMETER_GRADIENT_SQURESUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], + bufs2[PARAMETER_GRADIENT_SQURESUM1]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, RMSProp) { + testCase(testRMSProp<false>); + testCase(testRMSProp<true>); +} + +template <bool isFirstTime> +void testDecayedAdagrad(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + real rou = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + real accumulatedRou = rou; + + if
(isFirstTime) { + bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); + bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); + } + + EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1, + accumulatedRou, + rou, + epsilon, + learningRate, + momentum, + decayRate, + isFirstTime)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(decayedAdagradApply(value, + grad, + mom, + sum, + lr, + accumulatedRou, + rou, + epsilon, + learningRate, + momentum, + decayRate, + isFirstTime)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], + bufs2[PARAMETER_GRADIENT_SQURESUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, DecayedAdagrad) { + testCase(testDecayedAdagrad<false>); + testCase(testDecayedAdagrad<true>); +} + +void testAdam(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu); + + real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT + real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT + real beta1_power = (real)rand() / (real)RAND_MAX; // NOLINT + real beta2_power = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(AdamParameterOptimizer( + bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM]; + + EXPRESSION_PERFORMANCE(adamApply(value, + grad, + mom, + v, + beta1, + beta2, + beta1_power, + beta2_power, + epsilon, + learningRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM], + bufs2[PARAMETER_SECOND_MOMENTUM]); +} + +TEST(Training, Adam) { testCase(testAdam); } + +void testAdamax(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu); + + real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT + real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT + real alpha = (real)rand() / (real)RAND_MAX; // NOLINT + int64_t step = 2; + + EXPRESSION_PERFORMANCE( + AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]; + + EXPRESSION_PERFORMANCE( + adamaxApply(value, grad,
mom, u, beta1, beta2, step, alpha)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM], + bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]); +} + +TEST(Training, Adamax) { +#ifndef PADDLE_TYPE_DOUBLE + SetMaxDiff diff(1e-4); +#endif + testCase(testAdamax); +} + +void testSparseMomentum(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu); + + real alpha = (real)rand() / (real)RAND_MAX; // NOLINT + real beta = (real)rand() / (real)RAND_MAX; // NOLINT + real gamma = (real)rand() / (real)RAND_MAX; // NOLINT + real tau = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer( + bufs1, alpha, beta, gamma, tau, learningRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT]; + BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT]; + + EXPRESSION_PERFORMANCE(sparseMomentumApply( + value, grad, momU, momV, alpha, beta, gamma, tau, learningRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]); +} + +TEST(Training, SparseMomentum) { testCase(testSparseMomentum); } diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/legacy/math/tests/test_batchTranspose.cpp similarity index 100% rename from paddle/math/tests/test_batchTranspose.cpp rename to paddle/legacy/math/tests/test_batchTranspose.cpp diff --git a/paddle/legacy/math/tests/test_lazyAssign.cu b/paddle/legacy/math/tests/test_lazyAssign.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf8c3d77199571dff314446a1e1b14e9b746e947 --- /dev/null +++ b/paddle/legacy/math/tests/test_lazyAssign.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <gtest/gtest.h> +#include "PerfUtils.h" +#include "TensorCheck.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/TensorAssign.h" + +using paddle::BaseMatrix; +using paddle::CpuMatrix; +using paddle::GpuMatrix; +using autotest::TensorCheckEqual; +using autotest::TensorCheckErr; + +typedef std::function<void(int height, int width)> testMatrixFunc; +void testMatrixCase(testMatrixFunc matrixFunc) { + for (auto height : {1}) { + for (auto width : {1, + 32, + 64, + 128, + 512, + 1024, + 4096, + 32768, + 65536, + 131072, + 262144, + 524288, + 1048576, + 2097152, + 4194304, + 8388608}) { + matrixFunc(height, width); + } + } +} + +template <typename Tensor> +void testLazyAssign(int height, int width) { + Tensor A1(height, width); + Tensor A2(height, width); + Tensor B(height, width); + Tensor C(height, width); + Tensor D(height, width); + A1.randomizeUniform(); + B.randomizeUniform(); + C.randomizeUniform(); + D.randomizeUniform(); + A2.copyFrom(A1); + + EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); + + EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); + auto expr2 = A2.lazyAssign(A2 * D); + AssignEvaluate(expr1, expr2);); + + TensorCheckErr(A1, A2); +} + +TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); } + +#ifdef PADDLE_WITH_GPU +TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); } +#endif + +template <typename Tensor> +void sgdUpdateTensor( + Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { + C = C * p2 - D * (B + A * p3) * p1; + A += C; +} + +void sgdUpdateLazyAssign(BaseMatrix& A, + BaseMatrix& B, + BaseMatrix& C, + BaseMatrix& D, + real p1, + real p2, + real p3) { + auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); + auto expr2 = A.lazyAssign(A + C); + AssignEvaluate(expr1, expr2); +} + +template <typename Tensor> +void testSgdUpdate(int height, int width) { + Tensor A1(height, width); + Tensor A2(height, width); + Tensor A3(height, width); + A1.randomizeUniform(); + A2.copyFrom(A1); + A3.copyFrom(A1); + + Tensor B(height, width); + B.randomizeUniform(); + + Tensor C1(height, width); + Tensor C2(height, width); + Tensor C3(height, width); + C1.randomizeUniform(); + C2.copyFrom(C1); + C3.copyFrom(C1); + + Tensor D(height, width); + D.randomizeUniform(); + + real p1 = 0.2; + real p2 = 0.3; + real p3 = 0.5; + + /** + * c = p2 * c - p1 * (b + p3 * a); + * a = a + c; + */ + // BaseMatrix API + EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); + + // Tensor expression + EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); + + // lazyAssign + EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); + + TensorCheckErr(A1, A2); + TensorCheckErr(A1, A3); + TensorCheckErr(C1, C2); + TensorCheckErr(C1, C3); +} + +TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); } + +#ifdef PADDLE_WITH_GPU +TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); } +#endif diff --git a/paddle/legacy/math/tests/test_matrixCompare.cpp b/paddle/legacy/math/tests/test_matrixCompare.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a43adde46fc6526cc3ff5affec2ce1c7c3a44214 --- /dev/null +++ b/paddle/legacy/math/tests/test_matrixCompare.cpp @@ -0,0 +1,1698 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when +/// only cpu version. + +#include +#include "TensorCheck.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseMatrix.h" +#include "paddle/legacy/utils/DynamicLoader.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" +#include "paddle/testing/TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT +using autotest::TensorCheckEqual; +using autotest::TensorCheckErr; + +void testMatrixMaxSequence(int batchSize, int inputDim) { + // forward + MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + + IVectorPtr cpuSequence; + generateSequenceStartPositions(batchSize, cpuSequence); + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); + gpuSequence->copyFrom(*cpuSequence); + + int newBatchSize = cpuSequence->getSize() - 1; + MatrixPtr cpuOutput = std::make_shared(newBatchSize, inputDim); + MatrixPtr gpuOutput = std::make_shared(newBatchSize, inputDim); + cpuOutput->zero(); + gpuOutput->zero(); + + IVectorPtr cpuIndex = nullptr; + IVectorPtr gpuIndex = nullptr; + IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false); + IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true); + cpuIndex->zeroMem(); + gpuIndex->zeroMem(); + + cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex); + gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex); + + TensorCheckEqual(*cpuOutput, *gpuOutput); + TensorCheckEqual(*cpuIndex, *gpuIndex); + + // backward + MatrixPtr cpuOutputGrad = std::make_shared(newBatchSize, inputDim); + MatrixPtr gpuOutputGrad = std::make_shared(newBatchSize, inputDim); + cpuOutputGrad->randomizeUniform(); + gpuOutputGrad->copyFrom(*cpuOutputGrad); + + MatrixPtr cpuInputGrad = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInputGrad = std::make_shared(batchSize, inputDim); + cpuInputGrad->randomizeUniform(); + gpuInputGrad->copyFrom(*cpuInputGrad); + + cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex); + gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex); + + TensorCheckEqual(*cpuInputGrad, *gpuInputGrad); +} + +TEST(Matrix, maxSequence) { + for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128 + VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; + testMatrixMaxSequence(batchSize, inputDim); + } + } +} + +void testMatrixGetSum(int height, int width) { + MatrixPtr cpuInput = std::make_shared(height, width); + MatrixPtr gpuInput = std::make_shared(height, width); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + +#ifndef PADDLE_TYPE_DOUBLE + int x = log10(height * width); + real err = 1e-6 * pow(10, x); +#else + real err = 1e-8; +#endif + + real cpuSum = cpuInput->getSum(); + real gpuSum = 
gpuInput->getSum(); + + EXPECT_LE(fabs(cpuSum - gpuSum), err); +} + +void testMatrixGetMinMax(int height, int width) { + MatrixPtr cpuInput = std::make_shared(height, width); + MatrixPtr gpuInput = std::make_shared(height, width); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + + real cpuMin = cpuInput->getMin(); + real gpuMin = gpuInput->getMin(); + real cpuMax = cpuInput->getMax(); + real gpuMax = gpuInput->getMax(); + + EXPECT_EQ(cpuMin, gpuMin); + EXPECT_EQ(cpuMax, gpuMax); +} + +void testMatrixZeroAtOffset(int height, int width) { + MatrixPtr cpuA = std::make_shared(height, width); + MatrixPtr gpuA = std::make_shared(height, width); + MatrixPtr cpuTest = std::make_shared(height, width); + + cpuA->randomizeUniform(); + gpuA->copyFrom(*cpuA); + cpuTest->copyFrom(*cpuA); + + int columnOffset = rand() % width; // NOLINT we just use rand() for test. + int numColumns = rand() % (width - columnOffset); // NOLINT + + if (numColumns == 0) return; + + cpuA->zeroAtOffset(columnOffset, numColumns); + gpuA->zeroAtOffset(columnOffset, numColumns); + + /* cpuTest */ + real* a = cpuTest->getData() + columnOffset; + for (int64_t i = 0; i < height; ++i) { + for (int64_t j = 0; j < numColumns; ++j) { + a[i * width + j] = 0; + } + } + + TensorCheckEqual(*cpuA, *gpuA); + TensorCheckEqual(*cpuA, *cpuTest); +} + +void testMatrixDeepSwap(int height, int width) { + MatrixPtr cpuA = std::make_shared(height, width); + MatrixPtr cpuB = std::make_shared(height, width); + MatrixPtr cpuCopyA = std::make_shared(height, width); + MatrixPtr cpuCopyB = std::make_shared(height, width); + + cpuA->randomizeUniform(); + cpuB->randomizeUniform(); + cpuCopyA->copyFrom(*cpuA); + cpuCopyB->copyFrom(*cpuB); + + // swap matrix cpuA and cpuB + cpuA->deepSwap(*cpuB); + + TensorCheckEqual(*cpuA, *cpuCopyB); + TensorCheckEqual(*cpuB, *cpuCopyA); +} + +void testMatrixTranspose(int height, int width) { + MatrixPtr cpu = std::make_shared(height, width); + MatrixPtr gpu = std::make_shared(height, width); + MatrixPtr cpuT = std::make_shared(width, height); + MatrixPtr gpuT = std::make_shared(width, height); + + cpu->randomizeUniform(); + gpu->copyFrom(*cpu); + cpu->transpose(cpuT, false); + gpu->transpose(gpuT, true); + + TensorCheckEqual(*cpuT, *gpuT); +} + +void testMatrixRotate(int height, int width) { + MatrixPtr cpu = std::make_shared(height, width); + MatrixPtr gpu = std::make_shared(height, width); + MatrixPtr cpuR = std::make_shared(width, height); + MatrixPtr gpuR = std::make_shared(width, height); + + cpu->randomizeUniform(); + gpu->copyFrom(*cpu); + + cpu->rotate(cpuR, false, true); + gpu->rotate(gpuR, true, true); + TensorCheckEqual(*cpuR, *gpuR); + + cpu->rotate(cpuR, true, false); + gpu->rotate(gpuR, false, false); + TensorCheckEqual(*cpuR, *gpuR); +} + +void testMatrixInverse(int height) { + MatrixPtr cpu = std::make_shared(height, height); + MatrixPtr gpu = std::make_shared(height, height); + MatrixPtr cpuI = std::make_shared(height, height); + MatrixPtr gpuI = std::make_shared(height, height); + + /* Make matrix well conditioned: cpu * cpuT + Identity */ + cpu->randomizeUniform(); + MatrixPtr cpuT = cpu->getTranspose(); + MatrixPtr outputCheck = std::make_shared(height, height); + outputCheck->mul(*cpu, *cpuT); + cpu->setDiag(1.0); + cpu->add(*outputCheck); + + gpu->copyFrom(*cpu); + cpu->inverse(cpuI, true); + gpu->inverse(gpuI, false); + + TensorCheckErr(*cpuI, *gpuI); + + outputCheck->mul(*cpu, *cpuI); + cpu->setDiag(1.0); + TensorCheckErr(*cpu, *outputCheck); +} + +TEST(Matrix, unary) { + for (auto 
height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { + VLOG(3) << " height=" << height << " width=" << width; + + testMatrixDeepSwap(height, width); + testMatrixZeroAtOffset(height, width); + testMatrixGetSum(height, width); + testMatrixTranspose(height, width); + testMatrixRotate(height, width); + } +#ifdef LAPACK_FOUND + // inverse matrix + testMatrixInverse(height); +#else + LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK" + << "support so we cannot test matrix inverse. To test " + << "matrix inverse, please install LAPACKE " + << "and MKL/Openblas, and re-build PaddlePaddle."; +#endif + } +} + +void testMatrixSoftmax(int height, int width) { + MatrixPtr cpuInput = std::make_shared(height, width); + MatrixPtr cpuOutput = std::make_shared(height, width); + MatrixPtr gpuInput = std::make_shared(height, width); + MatrixPtr gpuOutput = std::make_shared(height, width); + + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + cpuOutput->zero(); + gpuOutput->zero(); + cpuInput->softmax(*cpuOutput); + gpuInput->softmax(*gpuOutput); + + TensorCheckErr(*cpuOutput, *gpuOutput); +} + +void testSequenceSoftmax(int batchSize) { + // forward + int inputDim = 1; + MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + + IVectorPtr cpuSequence; + generateSequenceStartPositions(batchSize, cpuSequence); + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); + gpuSequence->copyFrom(*cpuSequence); + + cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence); + gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence); + + TensorCheckErr(*cpuInput, *gpuInput); +} + +void testMatrixSoftmaxThreshold(int height, int width) { + MatrixPtr cpuInput = std::make_shared(height, width); + MatrixPtr cpuOutput = std::make_shared(height, width); + MatrixPtr gpuInput = std::make_shared(height, width); + MatrixPtr gpuOutput = std::make_shared(height, width); + + cpuInput->randomizeUniform(); + cpuInput->getData()[0] = 100.0; + gpuInput->copyFrom(*cpuInput); + cpuOutput->zero(); + gpuOutput->zero(); + cpuInput->softmax(*cpuOutput); + gpuInput->softmax(*gpuOutput); + + MatrixPtr outputCheck = std::make_shared(height, width); + outputCheck->copyFrom(*gpuOutput); + // check output zero + int cpuCount = 0; + int gpuCount = 0; + auto zeroNum = [](MatrixPtr out, int& count) { + for (size_t i = 0; i < out->getHeight(); i++) { + for (size_t j = 0; j < out->getWidth(); j++) { + if (out->getElement(i, j) == 0) count++; + } + } + }; + zeroNum(cpuOutput, cpuCount); + zeroNum(outputCheck, gpuCount); + EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0"; + EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0"; +} + +void testMatrixSoftmaxBp(int height, int width) { + MatrixPtr cpuInput = std::make_shared(height, width); + MatrixPtr cpuOutput = std::make_shared(height, width); + MatrixPtr gpuInput = std::make_shared(height, width); + MatrixPtr gpuOutput = std::make_shared(height, width); + + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + cpuOutput->randomizeUniform(); + gpuOutput->copyFrom(*cpuOutput); + gpuOutput->softmaxBackward(*gpuInput); + + MatrixPtr sftMaxSum = std::make_shared(height, 1); + MatrixPtr sftMaxDot = std::make_shared(height, width); + sftMaxDot->dotMul(*cpuOutput, *cpuInput); + sftMaxSum->colMerge(*sftMaxDot); + cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum); + + 
TensorCheckErr(*cpuOutput, *gpuOutput); +} + +TEST(Matrix, softmax) { + for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127 + for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256 + VLOG(3) << " height=" << height << " width=" << width; + + testMatrixSoftmax(height, width); + testMatrixSoftmaxBp(height, width); + testMatrixSoftmaxThreshold(height, width); + } + testSequenceSoftmax(height); + } +} + +void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) { + MatrixPtr cpuTable = std::make_shared(tableSize, inputDim); + MatrixPtr gpuTable = std::make_shared(tableSize, inputDim); + cpuTable->randomizeUniform(); + gpuTable->copyFrom(*cpuTable); + + IVectorPtr cpuIds; + IVectorPtr gpuIds; + cpuIds = VectorT::create(numSamples, false); + gpuIds = VectorT::create(numSamples, true); + cpuIds->rand(tableSize); + gpuIds->copyFrom(*cpuIds); + + MatrixPtr cpuOutput = std::make_shared(numSamples, inputDim); + MatrixPtr gpuOutput = std::make_shared(numSamples, inputDim); + cpuOutput->randomizeUniform(); + gpuOutput->copyFrom(*cpuOutput); + + cpuOutput->addToRows(*cpuTable, *cpuIds); + gpuOutput->addToRows(*gpuTable, *gpuIds); + + TensorCheckErr(*cpuTable, *gpuTable); +} + +TEST(Matrix, tableProjection) { + for (auto numSamples : {10, 100, 1000, 10000, 80000}) { + for (auto tableSize : {10, 100}) { + for (auto inputDim : {20, 50}) { + VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize + << " inputDim=" << inputDim; + testMatrixAddToRows(numSamples, tableSize, inputDim); + } + } + } +} + +void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { + int heightA = transa == false ? dimM : dimK; + int widthA = transa == false ? dimK : dimM; + int heightB = transb == false ? dimK : dimN; + int widthB = transb == false ? dimN : dimK; + int heightC = dimM; + int widthC = dimN; + + MatrixPtr cpuA = std::make_shared(heightA, widthA, transa); + MatrixPtr cpuB = std::make_shared(heightB, widthB, transb); + MatrixPtr cpuC = std::make_shared(heightC, widthC); + MatrixPtr gpuA = std::make_shared(heightA, widthA, transa); + MatrixPtr gpuB = std::make_shared(heightB, widthB, transb); + MatrixPtr gpuC = std::make_shared(heightC, widthC); + + real alpha = 1.5; + real beta = 2.0; + cpuA->randomizeUniform(); + cpuB->randomizeUniform(); + cpuC->randomizeUniform(); + gpuA->copyFrom(*cpuA); + gpuB->copyFrom(*cpuB); + gpuC->copyFrom(*cpuC); + + cpuC->mul(*cpuA, *cpuB, alpha, beta); + gpuC->mul(*gpuA, *gpuB, alpha, beta); + + TensorCheckErr(*cpuC, *gpuC); +} + +void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { + int heightA = transa == false ? dimM : dimK; + int widthA = transa == false ? dimK : dimM; + int heightB = transb == false ? dimK : dimN; + int widthB = transb == false ? 
dimN : dimK; + int heightC = dimM; + int widthC = dimN; + + MatrixPtr cpuA = std::make_shared(heightA, widthA, transa); + MatrixPtr cpuB = std::make_shared(heightB, widthB, transb); + MatrixPtr cpuC = std::make_shared(heightC, widthC); + MatrixPtr gpuA = std::make_shared(heightA, widthA, transa); + MatrixPtr gpuB = std::make_shared(heightB, widthB, transb); + MatrixPtr gpuC = std::make_shared(heightC, widthC); + + real alpha = 1.5; + real beta = 2.0; + cpuA->randomizeUniform(); + cpuB->randomizeUniform(); + cpuC->randomizeUniform(); + gpuA->copyFrom(*cpuA); + gpuB->copyFrom(*cpuB); + gpuC->copyFrom(*cpuC); + + auto subSize = [](int& start, int& end, int dim) { + if (dim == 1) { + start = 0; + end = dim; + } else { + int subDim = rand() % (dim - 1) + 1; // NOLINT + start = rand() % (dim - subDim); // NOLINT + end = start + subDim; + } + }; + + auto subMatrix = [](MatrixPtr& sub, + MatrixPtr matrix, + size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol) { + if (!matrix->isTransposed()) { + sub = matrix->subMatrix(startRow, endRow, startCol, endCol); + } else { + sub = matrix->subMatrix(startCol, endCol, startRow, endRow); + } + }; + + int startM, endM; + int startN, endN; + int startK, endK; + subSize(startM, endM, dimM); + subSize(startN, endN, dimN); + subSize(startK, endK, dimK); + + MatrixPtr subCpuA; + MatrixPtr subCpuB; + MatrixPtr subGpuA; + MatrixPtr subGpuB; + subMatrix(subCpuA, cpuA, startM, endM, startK, endK); + subMatrix(subGpuA, gpuA, startM, endM, startK, endK); + subMatrix(subCpuB, cpuB, startK, endK, startN, endN); + subMatrix(subGpuB, gpuB, startK, endK, startN, endN); + MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN); + MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN); + + subCpuC->mul(*subCpuA, *subCpuB, alpha, beta); + subGpuC->mul(*subGpuA, *subGpuB, alpha, beta); + + TensorCheckErr(*cpuC, *gpuC); +} + +TEST(Matrix, mul) { + for (auto transa : {false, true}) { + for (auto transb : {false, true}) { + for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) { + for (auto dimN : {1, 5, 37, 256, 1024}) { + for (auto dimK : {8, 45, 346, 784, 1025}) { + if (true == transa && true == transb) { + continue; + } + VLOG(3) << setiosflags(ios::left) << setfill(' ') + << " transa=" << transa << " transb=" << transb + << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) + << dimN << " dimK=" << setw(5) << dimK; + + testMatrixMul(transa, transb, dimM, dimN, dimK); + testSubMatrixMul(transa, transb, dimM, dimN, dimK); + } + } + } + } + } +} + +void testVectorRowFunc(int size) { + CpuVectorPtr cpu = std::make_shared>(size); + GpuVectorPtr gpu = std::make_shared>(size); + + cpu->rand(); + gpu->copyFrom(*cpu); + + EXPECT_EQ(cpu->getMax(), gpu->getMax()); + EXPECT_EQ(cpu->getMin(), gpu->getMin()); + EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax()); +} + +TEST(Vector, rowFunc) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + VLOG(3) << " size=" << size; + testVectorRowFunc(size); + } +} + +template +void testVectorReset(int size) { + std::shared_ptr> cpu = std::make_shared>(size); + std::shared_ptr> gpu = std::make_shared>(size); + + T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100)); + cpu->reset(value); + gpu->reset(value); + + TensorCheckEqual(*cpu, *gpu); +} + +template +void testVecortSelectFrom(int size) { + std::shared_ptr> cpuDst = std::make_shared>(size); + std::shared_ptr> gpuDst = std::make_shared>(size); + std::shared_ptr> cpuSrc = + std::make_shared>(size * 2); + std::shared_ptr> gpuSrc = + 
std::make_shared>(size * 2); + CpuIVectorPtr cpuIds = std::make_shared>(size); + GpuIVectorPtr gpuIds = std::make_shared>(size); + + if (std::is_same::value) { + cpuSrc->rand(); + } else { + cpuSrc->rand(100000); + } + gpuSrc->copyFrom(*cpuSrc); + cpuIds->rand(size); + gpuIds->copyFrom(*cpuIds); + + cpuDst->selectFrom(*cpuSrc, *cpuIds); + gpuDst->selectFrom(*gpuSrc, *gpuIds); + + TensorCheckEqual(*cpuDst, *gpuDst); +} + +template +void testVecotrZeroMem(int size) { + std::shared_ptr> cpu = std::make_shared>(size); + std::shared_ptr> gpu = std::make_shared>(size); + + cpu->zeroMem(); + gpu->zeroMem(); + + TensorCheckEqual(*cpu, *gpu); +} + +template +void testVectorIsEqual(int size) { + std::shared_ptr> cpuA = std::make_shared>(size); + std::shared_ptr> cpuB = std::make_shared>(size); + std::shared_ptr> gpuA = std::make_shared>(size); + std::shared_ptr> gpuB = std::make_shared>(size); + + if (std::is_same::value) { + cpuB->rand(); + } else { + cpuB->rand(100000); + } + gpuB->copyFrom(*cpuB); + + T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100)); + cpuA->isEqualTo(*cpuB, value); + gpuA->isEqualTo(*gpuB, value); + + TensorCheckEqual(*cpuA, *gpuA); +} + +TEST(Vector, Equal) { + for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + VLOG(3) << " size=" << size; + testVectorReset(size); + testVectorReset(size); + testVecortSelectFrom(size); + testVecortSelectFrom(size); + testVecotrZeroMem(size); + testVecotrZeroMem(size); + testVectorIsEqual(size); + testVectorIsEqual(size); + } +} + +void testMatrixTopK(int samples, int dim, int beamSize) { + MatrixPtr cpuSrc = std::make_shared(samples, dim); + MatrixPtr gpuSrc = std::make_shared(samples, dim); + MatrixPtr cpuVal = std::make_shared(samples, beamSize); + MatrixPtr gpuVal = std::make_shared(samples, beamSize); + IVectorPtr cpuIds = std::make_shared(samples * beamSize); + IVectorPtr gpuIds = std::make_shared(samples * beamSize); + + cpuSrc->randomizeUniform(); + gpuSrc->copyFrom(*cpuSrc); + + cpuSrc->rowMax(*cpuIds, *cpuVal); + gpuSrc->rowMax(*gpuIds, *gpuVal); + + TensorCheckEqual(*cpuVal, *gpuVal); +} + +TEST(Matrix, topK) { + for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127 + for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 + for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { + if (beamSize > dim) continue; + VLOG(3) << " samples=" << samples << " beamSize=" << beamSize + << " dim=" << dim; + testMatrixTopK(samples, dim, beamSize); + } + } + } +} + +void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { + int nnz = samples * dim * ratio; + if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this. 
+ MatrixPtr cpuSrc = std::make_shared(samples, dim, nnz); + MatrixPtr gpuSrc = std::make_shared(samples, dim, nnz); + MatrixPtr cpuVal = std::make_shared(samples, beamSize); + MatrixPtr gpuVal = std::make_shared(samples, beamSize); + IVectorPtr cpuIds = std::make_shared(samples * beamSize); + IVectorPtr gpuIds = std::make_shared(samples * beamSize); + + cpuSrc->randomizeUniform(); + gpuSrc->copyFrom(*cpuSrc); + cpuVal->zero(); + cpuIds->zero(); + gpuVal->zero(); + gpuIds->zero(); + + cpuSrc->rowMax(*cpuIds, *cpuVal); + gpuSrc->rowMax(*gpuIds, *gpuVal); + + TensorCheckEqual(*cpuVal, *gpuVal); + + IVectorPtr outCheckIds = std::make_shared(samples * beamSize); + outCheckIds->copyFrom(*gpuIds); + + const int* data1 = cpuIds->getData(); + const int* data2 = outCheckIds->getData(); + size_t size = cpuIds->getSize(); + for (size_t i = 0; i < size; i++) { + if (data1[i] == -1 && data1[i] != data2[i]) { + EXPECT_EQ(data1[i], data2[i]); + } + } +} + +TEST(SMatrix, topK) { + for (auto samples : {1, 3, 61}) { + for (auto dim : {1, 3, 61}) { + for (auto beamSize : {1, 3, 61}) { + for (auto ratio : {0.01, 0.001}) { + if (beamSize > dim) continue; + VLOG(3) << " samples=" << samples << " beamSize=" << beamSize + << " dim=" << dim << " ratio=" << ratio; + testSMatrixTopK(samples, dim, beamSize, ratio); + } + } + } + } +} + +void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) { + MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + + IVectorPtr cpuSequence; + generateSequenceStartPositions(batchSize, cpuSequence); + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); + gpuSequence->copyFrom(*cpuSequence); + + int newBatchSize = cpuSequence->getSize() - 1; + MatrixPtr cpuOutput = std::make_shared(newBatchSize, inputDim); + MatrixPtr gpuOutput = std::make_shared(newBatchSize, inputDim); + cpuOutput->zero(); + gpuOutput->zero(); + + cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode); + gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode); + + TensorCheckErr(*cpuOutput, *gpuOutput); + + MatrixPtr cpuInGrad = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInGrad = std::make_shared(batchSize, inputDim); + cpuInGrad->randomizeUniform(); + gpuInGrad->copyFrom(*cpuInGrad); + + cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode); + gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode); + + TensorCheckErr(*cpuInGrad, *gpuInGrad); +} + +TEST(Matrix, sequenceAvg) { + for (auto batchSize : {10, 128, 6000}) { + for (auto inputDim : {32, 100, 512}) { + for (auto mode : {0, 1, 2}) { + VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim + << " mode=" << mode; + testMatrixSequenceAvg(batchSize, inputDim, mode); + } + } + } +} + +void testParamReluBackwardDiff(int height, + int width, + int w_height, + int w_width) { + MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); + MatrixPtr input = CpuMatrix::create(height, width, false, false); + MatrixPtr diff = CpuMatrix::create(height, width, false, false); + MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); + + oGrad->randomizeUniform(); + input->randomizeUniform(); + w->randomizeUniform(); + diff->randomizeUniform(); + input->add(-0.5); + + MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true); + MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true); + MatrixPtr diffGpu = CpuMatrix::create(height, 
width, false, true); + MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true); + + oGradGpu->copyFrom(*oGrad); + inputGpu->copyFrom(*input); + wGpu->copyFrom(*w); + diffGpu->copyFrom(*diff); + + diff->paramReluBackwardDiff(*oGrad, *input, *w); + diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu); + + TensorCheckErr(*diff, *diffGpu); +} + +TEST(Matrix, paramReluBackwardDiff) { + for (auto height : {10, 40, 100}) { + for (auto width : {10, 40, 100}) { + for (auto w_height : {1, 2}) { + for (auto w_width : {1, 2}) { + if (width % (w_height * w_width)) continue; + testParamReluBackwardDiff(height, width, w_height, w_width); + } + } + } + } +} + +void testClassificationError(int numSamples, int dim, int topkSize) { + MatrixPtr cpuError = std::make_shared(numSamples, 1); + MatrixPtr gpuError = std::make_shared(numSamples, 1); + MatrixPtr cpuOutput = std::make_shared(numSamples, dim); + MatrixPtr gpuOutput = std::make_shared(numSamples, dim); + IVectorPtr cpuLabel = std::make_shared(numSamples); + IVectorPtr gpuLabel = std::make_shared(numSamples); + + cpuOutput->randomizeUniform(); + cpuLabel->rand(dim); + gpuOutput->copyFrom(*cpuOutput); + gpuLabel->copyFrom(*cpuLabel); + + cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize); + gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize); + + TensorCheckEqual(*cpuError, *gpuError); +} + +TEST(Matrix, classificationError) { + for (auto numSamples : {1, 3, 31}) { + for (auto dim : {1, 3, 31}) { + for (auto topkSize : {1, 3, (int)rand() % dim + 1}) { + if (topkSize > dim) continue; + VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize + << " dim= " << dim; + testClassificationError(numSamples, dim, topkSize); + } + } + } +} + +void testMaxPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->maxPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->maxPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + targetCheck->copyFrom(*targetGpu); + checkMatrixEqual(target, targetCheck); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxPoolBackward(*input, + imgSizeH, + imgSizeW, 
+ *targetGrad, + *target, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->maxPoolBackward(*inputGpu, + imgSizeH, + imgSizeW, + *targetGpuGrad, + *targetGpu, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); + targetBwdCheck->copyFrom(*inputGpuGrad); + checkMatrixEqual(inputGrad, targetBwdCheck); +} + +void testAvgPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->avgPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->avgPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + + TensorCheckErr(*target, *targetGpu); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->avgPoolBackward(*targetGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->avgPoolBackward(*targetGpuGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + + TensorCheckErr(*inputGrad, *inputGpuGrad); +} + +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. They are no help to locate defects at all. 
+TEST(Matrix, PoolFwdBwd) { + for (auto numSamples : {1, 3}) { + for (auto channels : {1, 3}) { + for (auto imgSizeH : {13, 17}) { + for (auto imgSizeW : {17, 19}) { + for (auto sizeX : {2, 3}) { + for (auto sizeY : {2, 3}) { + for (auto sH : {1, 2}) { + for (auto sW : {1, 2}) { + for (auto pH : {0, (sizeY - 1) / 2}) { + for (auto pW : {0, (sizeX - 1) / 2}) { + VLOG(3) << " numSamples=" << numSamples + << " channels=" << channels + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX + << " sizeY=" << sizeY << " strideH=" << sH + << " strideW=" << sW << " padingH=" << pH + << " padingW=" << pW; + testMaxPoolFwdBwd(numSamples, + channels, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sH, + sW, + pH, + pW); + testAvgPoolFwdBwd(numSamples, + channels, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sH, + sW, + pH, + pW); + } + } + } + } + } + } + } + } + } + } +} + +void testMaxOutFwdBwd( + int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) { + int inWidth = imgSizeH * imgSizeW * channels; + int outChannels = channels / groups; + int outWidth = imgSizeH * imgSizeW * outChannels; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + IVectorPtr id = CpuIVector::create(numSamples * outWidth, false); + IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + target->maxoutForward(*input, *id, outChannels, groups); + targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups); + + TensorCheckErr(*target, *targetGpu); + TensorCheckEqual(*id, *idGpu); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups); + inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups); + + TensorCheckErr(*inputGrad, *inputGpuGrad); +} + +TEST(Matrix, MaxOutFwdBwd) { + for (auto numSamples : {5, 10}) { + for (auto channels : {8, 16}) { + for (auto imgSizeH : {14, 28}) { + for (auto imgSizeW : {16, 30}) { + for (auto groups : {2, 4}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW + << " groups=" << groups; + testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); + } + } + } + } + } +} + +TEST(CpuMatrix, copyFrom) { + const size_t height = 31; + const size_t width = 53; + CpuMatrix cpu(height, width); + GpuMatrix gpu(height, width); + CpuMatrix copy(height, width); + + cpu.randomizeUniform(); + gpu.copyFrom(cpu); + copy.copyFrom(gpu, HPPL_STREAM_DEFAULT); + + TensorCheckEqual(cpu, copy); +} + +void testBatch2seqPadding(int batchSize, int inputDim) { + MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); + MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); + cpuInput->randomizeUniform(); + gpuInput->copyFrom(*cpuInput); + 
+ IVectorPtr cpuSequence; + generateSequenceStartPositions(batchSize, cpuSequence); + for (int i = 0; i < int(cpuSequence->getSize()); ++i) { + (cpuSequence->getData())[i] += 1; // so no way that maxSeqLen is 0; + } + + IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); + gpuSequence->copyFrom(*cpuSequence); + + size_t numSeq = cpuSequence->getSize() - 1; + size_t maxSeqLen = *std::max_element(cpuSequence->getData(), + cpuSequence->getData() + numSeq); + + printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen); + MatrixPtr cBatch = std::make_shared(numSeq * maxSeqLen, inputDim); + MatrixPtr gBatch = std::make_shared(numSeq * maxSeqLen, inputDim); + MatrixPtr cCheck = std::make_shared(numSeq * maxSeqLen, inputDim); + + // hl_sequence2batch_copy_padding(gBatch->getData(), + // gpuInput->getData(), + // cpuSequence->getData(), + // inputDim, + // maxSeqLen, + // numSeq, + // false, + // true); + // cCheck->copyFrom(*gBatch); + + // int* seqStart = cpuSequence->getData(); + // float* batchData = cBatch->getData(); + // float* seqData = cpuInput->getData(); + // for (size_t i = 0; i < maxSeqLen; i++) { + // for (size_t j = 0; j < numSeq; j++) { + // size_t sequenceStart = seqStart[j]; + // size_t sequenceLength = seqStart[j + 1] - seqStart[j]; + // if (i < sequenceLength) { + // memcpy(batchData + (i * numSeq + j) * inputDim, + // seqData + (sequenceStart + i) * inputDim, + // inputDim * sizeof(real)); + // } else { + // memset(batchData + (i * numSeq + j) * inputDim, + // 0, + // inputDim * sizeof(real)); + // } + // } + // } + + // TensorCheckErr(*cBatch, *cCheck); +} + +TEST(Matrix, warpCTC) { + for (auto batchSize : {1, 3, 17}) { + for (auto inputDim : {1, 3, 31}) { + VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; + testBatch2seqPadding(batchSize, inputDim); + } + } +} + +void testMaxPool3DFwdBwd(int numSamples, + int channels, + int imgSizeD, + int imgSizeH, + int imgSizeW, + int ksizeD, + int ksizeH, + int ksizeW, + int strideD, + int strideH, + int strideW, + int padD, + int padH, + int padW) { + int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = channels * imgSizeD * imgSizeH * imgSizeW; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outD * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->maxPool3DForward(*input, + *maxIdx, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + targetGpu->maxPool3DForward(*inputGpu, + *maxIdxGpu, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + targetCheck->copyFrom(*targetGpu); + checkMatrixEqual(target, targetCheck); + + 
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxPool3DBackward(*targetGrad, + *maxIdx, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + inputGpuGrad->maxPool3DBackward(*targetGpuGrad, + *maxIdxGpu, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); + targetBwdCheck->copyFrom(*inputGpuGrad); + checkMatrixEqual(inputGrad, targetBwdCheck); +} + +void testAvgPool3DFwdBwd(int numSamples, + int channels, + int imgSizeD, + int imgSizeH, + int imgSizeW, + int ksizeD, + int ksizeH, + int ksizeW, + int strideD, + int strideH, + int strideW, + int padD, + int padH, + int padW) { + int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = imgSizeD * imgSizeH * imgSizeW * channels; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outD * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->avgPool3DForward(*input, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + + targetGpu->avgPool3DForward(*inputGpu, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + + TensorCheckErr(*target, *targetGpu); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->avgPool3DBackward(*targetGrad, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + + inputGpuGrad->avgPool3DBackward(*targetGpuGrad, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + TensorCheckErr(*inputGrad, *inputGpuGrad); +} + +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. 
They are no help to locate defects at all. +TEST(Matrix, Pool3DFwdBwd) { + for (auto numSamples : {1, 3}) { + for (auto channels : {3}) { + for (auto imgSizeD : {9, 16}) { + for (auto imgSizeH : {9, 32}) { + for (auto imgSizeW : {9, 32}) { + for (auto sizeX : {3}) { + for (auto sizeY : {3}) { + for (auto sizeZ : {3}) { + for (auto sD : {2}) { + for (auto sH : {2}) { + for (auto sW : {2}) { + for (auto pD : {0, (sizeZ - 1) / 2}) { + for (auto pH : {0, (sizeY - 1) / 2}) { + for (auto pW : {0, (sizeX - 1) / 2}) { + VLOG(3) << " numSamples=" << numSamples + << " channels=" << channels + << " imgSizeD=" << imgSizeD + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW + << " sizeX=" << sizeX + << " sizeY=" << sizeY + << " sizeZ=" << sizeZ << " strideD=" << sD + << " strideH=" << sH << " strideW=" << sW + << " padingD=" << pD << " padingH=" << pH + << " padingW=" << pW; + + testMaxPool3DFwdBwd(numSamples, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sizeZ, + sD, + sH, + sW, + pD, + pH, + pW); + testAvgPool3DFwdBwd(numSamples, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sizeZ, + sD, + sH, + sW, + pD, + pH, + pW); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + + // for (auto numSamples : {1, 3}) { + // for (auto channels : {1, 3}) { + // for (auto imgSizeD : {9,16}) { + // for (auto imgSizeH : {9, 32}) { + // for (auto imgSizeW : {9, 32}) { + // for (auto sizeX : {2, 3}) { + // for (auto sizeY : {2, 3}) { + // for (auto sizeZ : {2,3}){ + // for (auto sD : {1, 2}) { + // for (auto sH : {1, 2}) { + // for (auto sW : {1, 2}) { + // for (auto pD : {0, (sizeZ - 1) / 2}){ + // for (auto pH : {0, (sizeY - 1) / 2}) { + // for (auto pW : {0, (sizeX - 1) / 2}) { + // VLOG(3) << " numSamples=" << numSamples + // << " channels=" << channels + // << " imgSizeD=" << imgSizeD + // << " imgSizeH=" << imgSizeH + // << " imgSizeW=" << imgSizeW + // << " sizeX=" << sizeX + // << " sizeY=" << sizeY + // << " sizeZ=" << sizeZ + // << " strideD=" << sD + // << " strideH=" << sH + // << " strideW=" << sW + // << " padingD=" << pD + // << " padingH=" << pH + // << " padingW=" << pW; + // + // testMaxPool3DFwdBwd(numSamples, + // channels, + // imgSizeD, + // imgSizeH, + // imgSizeW, + // sizeX, + // sizeY, + // sizeZ, + // sD, + // sH, + // sW, + // pD, + // pH, + // pW); + // testAvgPool3DFwdBwd(numSamples, + // channels, + // imgSizeD, + // imgSizeH, + // imgSizeW, + // sizeX, + // sizeY, + // sizeZ, + // sD, + // sH, + // sW, + // pD, + // pH, + // pW); + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } +} + +void testMatrixCol2Vol(int depth, int height, int width) { + int channel = 3; + int filterX = 3, filterY = 4, filterZ = 5; + int strideX = 2, strideY = 2, strideZ = 2; + int padX = 1, padY = 1, padZ = 1; + + MatrixPtr cpuImage = + std::make_shared(channel, depth * height * width); + MatrixPtr gpuImage = + std::make_shared(channel, depth * height * width); + cpuImage->randomizeUniform(); + gpuImage->copyFrom(*cpuImage); + + int outD = outputSize(depth, filterZ, padZ, strideZ, true); + int outH = outputSize(height, filterY, padY, strideY, true); + int outW = outputSize(width, filterX, padX, strideX, true); + + int colBufHeight = channel * filterZ * filterY * filterX; + int colBufWidth = outD * outH * outW; + MatrixPtr cpuColBuf = std::make_shared(colBufHeight, colBufWidth); + MatrixPtr gpuColBuf = std::make_shared(colBufHeight, colBufWidth); + cpuColBuf->vol2Col(cpuImage->getData(), + channel, + depth, + 
height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX); + gpuColBuf->vol2Col(gpuImage->getData(), + channel, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX); + TensorCheckEqual(*cpuColBuf, *gpuColBuf); + + cpuColBuf->randomizeUniform(); + gpuColBuf->copyFrom(*cpuColBuf); + cpuColBuf->col2Vol(cpuImage->getData(), + channel, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX, + 1.0, + 1.0); + gpuColBuf->col2Vol(gpuImage->getData(), + channel, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + padZ, + padY, + padX, + 1.0, + 1.0); + TensorCheckErr(*cpuImage, *gpuImage); +} + +TEST(Matrix, col2Vol) { + for (auto depth : {9, 16, 64}) { + for (auto height : {9, 11, 128}) { + for (auto width : {9, 32, 128}) { + VLOG(3) << "depth=" << depth << " height=" << height + << " width=" << width; + testMatrixCol2Vol(depth, height, width); + } + } + } +} + +#endif diff --git a/paddle/legacy/math/tests/test_matrixUtil.h b/paddle/legacy/math/tests/test_matrixUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..58c93f746e7ef4e2f2f98d4f410c74909a723812 --- /dev/null +++ b/paddle/legacy/math/tests/test_matrixUtil.h @@ -0,0 +1,233 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/legacy/math/SparseMatrix.h" + +namespace paddle { + +void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + for (size_t r = 0; r < a->getHeight(); ++r) { + for (size_t c = 0; c < a->getWidth(); ++c) { + ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c)); + } + } +} + +void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + ASSERT_EQ(a.getFormat(), b.getFormat()); + ASSERT_EQ(a.getElementCnt(), b.getElementCnt()); + for (size_t r = 0; r < a.getElementCnt(); ++r) { + ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]); + } +} + +void checkSMatrixEqual(const CpuSparseMatrixPtr& a, + const CpuSparseMatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + ASSERT_EQ(a->getFormat(), b->getFormat()); + ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); + } +} + +void checkSMatrixEqual2(const CpuSparseMatrixPtr& a, + const CpuSparseMatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + ASSERT_EQ(a->getFormat(), b->getFormat()); + ASSERT_EQ(a->getValueType(), b->getValueType()); + ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); + if (a->getFormat() == SPARSE_CSR) { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); + } + } + for (size_t r = 0; r <= a->getHeight(); r++) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + } + } else { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); + } + } + for (size_t r = 0; r <= a->getWidth(); r++) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + } + } +} + +void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) { + ASSERT_EQ(a.getWidth(), b.getWidth()); + ASSERT_EQ(a.getHeight(), b.getHeight()); + ASSERT_EQ(a.isTransposed(), b.isTransposed()); + + if (a.getFormat() == SPARSE_CSC) { + int* rows = a.getRows(); + for (size_t i = 0; i < a.getWidth(); i++) { + for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i)); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i)); + } + } + } + } else { + int* cols = a.getCols(); + for (size_t i = 0; i < a.getHeight(); i++) { + for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) { + if (a.getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j])); + } else { + ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j])); + } + } + } + } +} + +void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, + const CpuMatrixPtr& b) { + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + + if (a->getFormat() == 
SPARSE_CSC) { + int* rows = a->getRows(); + for (size_t i = 0; i < a->getWidth(); i++) { + for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) { + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i)); + } else { + ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i)); + } + } + } + } else { + int* cols = a->getCols(); + for (size_t i = 0; i < a->getHeight(); i++) { + for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) { + if (a->getValueType() == FLOAT_VALUE) { + ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j])); + } else { + ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j])); + } + } + } + } +} + +void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + ASSERT_EQ(a->getWidth(), b->getWidth()); + ASSERT_EQ(a->getHeight(), b->getHeight()); + ASSERT_EQ(a->isTransposed(), b->isTransposed()); + ASSERT_EQ(a->getFormat(), b->getFormat()); + ASSERT_EQ(a->getValueType(), b->getValueType()); + ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); + int count = 0; + if (a->getFormat() == SPARSE_CSR) { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + if (a->getValueType() == FLOAT_VALUE) { + real aVal = a->getValue()[r]; + real bVal = b->getValue()[r]; + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { + LOG(INFO) << "a=" << aVal << "\t" + << "b=" << bVal; + count++; + } + } + } + } + for (size_t r = 0; r <= a->getHeight(); r++) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + } + } else { + for (size_t r = 0; r < a->getElementCnt(); ++r) { + ASSERT_EQ(a->getRows()[r], b->getRows()[r]); + if (a->getValueType() == FLOAT_VALUE) { + real aVal = a->getValue()[r]; + real bVal = b->getValue()[r]; + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { + count++; + } + } + } + } + for (size_t r = 0; r <= a->getWidth(); r++) { + ASSERT_EQ(a->getCols()[r], b->getCols()[r]); + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (std::abs(a - b) > err) { + if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) { + count++; + } + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void checkDataEqual(const real* a, const real* b, size_t size) { + for (size_t i = 0; i < size; ++i) { + ASSERT_FLOAT_EQ(a[i], b[i]); + } +} + +} // namespace paddle diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/legacy/math/tests/test_perturbation.cpp similarity index 100% rename from paddle/math/tests/test_perturbation.cpp rename to paddle/legacy/math/tests/test_perturbation.cpp diff --git a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..492aa0a689540dbb2c687326ff8a2919d89d2e6f
--- /dev/null
+++ b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/// This unittest checks that GpuSparseMatrix and CpuSparseMatrix
+/// produce the same results, so it is disabled when
+/// only the CPU version is built.
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Util.h"
+#include "test_matrixUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
+
+void testSpMatrixAddBias(int M, int N, real rate, real scale) {
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_1);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuA->addBias(*cpuB, scale);
+  gpuA->addBias(*gpuB, scale);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuA, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
+                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixAddDense(int M, int N, real rate) {  // add3
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(M, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(M, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuA->add3(cpuB);
+  gpuA->add3(gpuB);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuA, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
+                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixMul(int M, int N, int K, real rate) {
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
+  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
+
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
+  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  gpuC->copyFrom(*cpuC, stream);
+  hl_stream_synchronize(stream);
+
+  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
+  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuC, stream); +
hl_stream_synchronize(stream); + checkSMatrixErr(std::dynamic_pointer_cast(cpuC), + std::dynamic_pointer_cast(outputCheck)); +} + +void testSpMatrixCollectBias(int M, int N, real rate) { + int nnz = M * N * rate; + LOG(INFO) << "nnz=" << nnz; + + MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz)); + MatrixPtr cpuB = std::make_shared(1, N); + + MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); + MatrixPtr gpuB = std::make_shared(1, N); + + cpuA->randomizeUniform(); + cpuB->randomizeUniform(); + + hl_stream_t stream(HPPL_STREAM_3); + gpuA->copyFrom(*cpuA, stream); + gpuB->copyFrom(*cpuB, stream); + hl_stream_synchronize(stream); + + cpuB->collectBias(*cpuA, 1); + gpuB->collectBias(*gpuA, 1); + + MatrixPtr outputCheck = std::make_shared(1, N); + outputCheck->copyFrom(*gpuB, stream); + hl_stream_synchronize(stream); + checkMatrixErr(*cpuB, *outputCheck); +} + +TEST(SMatrix, sMatrixOp) { + for (auto height : {1, 11, 200}) { + for (auto width : {200, 2048, 20480}) { + VLOG(3) << " height=" << height << " width=" << width; + for (auto rate : {0.02, 0.1}) { + testSpMatrixAddDense(height, width, rate); + testSpMatrixAddBias(height, width, rate, 1.0); + } + } + } +} + +TEST(SMatrix, sMatrixMul) { + for (auto M : {1, 40, 128, 200}) { + for (auto N : {100, 2000, 20480}) { + for (auto K : {100, 512, 1024}) { + VLOG(3) << " M=" << M << " N=" << N << " K=" << K; + testSpMatrixMul(M, N, K, 0.05); + } + } + } +} + +TEST(SMatrix, sMatrixCollectBias) { + for (auto height : {1, 128, 200}) { + for (auto width : {100, 2048, 20480}) { + VLOG(3) << " height=" << height << " width=" << width; + testSpMatrixCollectBias(height, width, 0.1); + } + } +} + +#endif diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/legacy/optimizer/CMakeLists.txt similarity index 100% rename from paddle/optimizer/CMakeLists.txt rename to paddle/legacy/optimizer/CMakeLists.txt diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/legacy/optimizer/adadelta_optimizer.cc similarity index 100% rename from paddle/optimizer/adadelta_optimizer.cc rename to paddle/legacy/optimizer/adadelta_optimizer.cc diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/legacy/optimizer/adadelta_optimizer.h similarity index 100% rename from paddle/optimizer/adadelta_optimizer.h rename to paddle/legacy/optimizer/adadelta_optimizer.h diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/legacy/optimizer/adagrad_optimizer.cc similarity index 100% rename from paddle/optimizer/adagrad_optimizer.cc rename to paddle/legacy/optimizer/adagrad_optimizer.cc diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/legacy/optimizer/adagrad_optimizer.h similarity index 100% rename from paddle/optimizer/adagrad_optimizer.h rename to paddle/legacy/optimizer/adagrad_optimizer.h diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/legacy/optimizer/adam_optimizer.cc similarity index 100% rename from paddle/optimizer/adam_optimizer.cc rename to paddle/legacy/optimizer/adam_optimizer.cc diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/legacy/optimizer/adam_optimizer.h similarity index 100% rename from paddle/optimizer/adam_optimizer.h rename to paddle/legacy/optimizer/adam_optimizer.h diff --git a/paddle/optimizer/lr_policy.h b/paddle/legacy/optimizer/lr_policy.h similarity index 100% rename from paddle/optimizer/lr_policy.h rename to paddle/legacy/optimizer/lr_policy.h diff --git a/paddle/optimizer/optimizer.cc b/paddle/legacy/optimizer/optimizer.cc similarity index 100% rename from paddle/optimizer/optimizer.cc rename to 
paddle/legacy/optimizer/optimizer.cc diff --git a/paddle/optimizer/optimizer.h b/paddle/legacy/optimizer/optimizer.h similarity index 100% rename from paddle/optimizer/optimizer.h rename to paddle/legacy/optimizer/optimizer.h diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/legacy/optimizer/parameter_optimizer.cc similarity index 100% rename from paddle/optimizer/parameter_optimizer.cc rename to paddle/legacy/optimizer/parameter_optimizer.cc diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/legacy/optimizer/parameter_optimizer.h similarity index 100% rename from paddle/optimizer/parameter_optimizer.h rename to paddle/legacy/optimizer/parameter_optimizer.h diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/legacy/optimizer/parameter_optimizer_test.cc similarity index 100% rename from paddle/optimizer/parameter_optimizer_test.cc rename to paddle/legacy/optimizer/parameter_optimizer_test.cc diff --git a/paddle/legacy/optimizer/serialization.h b/paddle/legacy/optimizer/serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..2067a8d8cff23bff975d23a4df4d0aa7df20b00f --- /dev/null +++ b/paddle/legacy/optimizer/serialization.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "OptimizerConfig.pb.h" +#include "paddle/legacy/utils/Logging.h" +#include "tensor.h" + +namespace paddle { +namespace optimizer { + +static void TensorToProto(const Tensor& tensor, TensorProto* proto) { + proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32); + std::stringstream os; + for (size_t i = 0; i < tensor.size(); ++i) { + os << tensor[i]; + proto->add_content(os.str()); + os.str(std::string()); + } +} + +static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) { + std::stringstream sin; + for (auto i = 0; i < proto.content_size(); ++i) { + sin << proto.content(i); + sin >> (*tensor)[i]; + sin.str(std::string()); + sin.clear(); + } +} + +} // namespace optimizer +} // namespace paddle diff --git a/paddle/optimizer/serialization_test.cc b/paddle/legacy/optimizer/serialization_test.cc similarity index 100% rename from paddle/optimizer/serialization_test.cc rename to paddle/legacy/optimizer/serialization_test.cc diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/legacy/optimizer/sgd_optimizer.cc similarity index 100% rename from paddle/optimizer/sgd_optimizer.cc rename to paddle/legacy/optimizer/sgd_optimizer.cc diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/legacy/optimizer/sgd_optimizer.h similarity index 100% rename from paddle/optimizer/sgd_optimizer.h rename to paddle/legacy/optimizer/sgd_optimizer.h diff --git a/paddle/legacy/optimizer/tensor.h b/paddle/legacy/optimizer/tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..2e58577d4df7aabd8cd218dc13837461cc681ac6 --- /dev/null +++ b/paddle/legacy/optimizer/tensor.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +/** + * @brief tensor used by optimizer + */ + +#include +#include +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { +namespace optimizer { + +template +class TensorT { + public: + TensorT(size_t size) : height_(1), width_(size) { + // new T[size]() initializes all element to zero value. 
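+    // For example, with T = real (float unless PADDLE_TYPE_DOUBLE is set),
+    // new T[4]() yields {0, 0, 0, 0}, whereas a plain new T[4] would leave
+    // the elements indeterminate.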
+ data_ptr_ = std::shared_ptr(new T[size](), std::default_delete()); + data_ = data_ptr_.get(); + } + + TensorT(T* data, size_t size) + : height_(1), width_(size), data_ptr_(nullptr), data_(data) {} + + TensorT(T* data, size_t h, size_t w) + : height_(h), width_(w), data_ptr_(nullptr), data_(data) {} + + virtual ~TensorT() {} + + T* get_buffer() { return this->data_; } + + T& operator[](const size_t idx) { + CHECK(idx >= 0 && idx < this->width_) << "out of index range"; + return data_[idx]; + } + T& operator[](const size_t idx) const { + CHECK(idx >= 0 && idx < this->width_) << "out of index range"; + return data_[idx]; + } + // TODO: replace with tensorshape + size_t size() const { return this->width_ * this->height_; } + + protected: + size_t height_; + size_t width_; + std::shared_ptr data_ptr_; + T* data_; +}; + +// TODO(zhihong): design problem of dynamic datatype, need to fix it +typedef TensorT Tensor; + +} // namespace optimizer +} // namespace paddle diff --git a/paddle/legacy/parameter/Argument.cpp b/paddle/legacy/parameter/Argument.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f1d599e901110a1c9390d76c45f8b4b1f4cab2a --- /dev/null +++ b/paddle/legacy/parameter/Argument.cpp @@ -0,0 +1,707 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Argument.h" +#include "paddle/legacy/math/SparseMatrix.h" + +#include + +namespace paddle { +static void resizeAndCopy(MatrixPtr& dest, + const MatrixPtr& src, + bool useGpu, + hl_stream_t stream) { + if (src) { + if (!dest) { + dest = src->clone(0, 0, useGpu); + } else { + CHECK_EQ(dest->useGpu(), useGpu); + dest->resize(src->getHeight(), src->getWidth()); + } + dest->copyFrom(*src, stream); + } else { + dest.reset(); + } +} + +static void resizeAndCopy(IVectorPtr& dest, + const IVectorPtr& src, + bool useGpu, + hl_stream_t stream) { + if (src) { + IVector::resizeOrCreate(dest, src->getSize(), useGpu); + dest->copyFrom(*src, stream); + } else { + dest.reset(); + } +} + +static void resizeAndCopy(ICpuGpuVectorPtr& dest, + const ICpuGpuVectorPtr& src, + bool useGpu, + hl_stream_t stream) { + if (src) { + ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu); + dest->copyFrom(*src, stream); + } else { + dest.reset(); + } +} + +static void resizeAndCopy(MatrixPtr& dest, + const MatrixPtr& src, + int32_t startRow, + int32_t copySize, + bool useGpu, + hl_stream_t stream = HPPL_STREAM_DEFAULT) { + if (src) { + CHECK_LE((size_t)startRow + copySize, src->getHeight()); + int height = copySize; + int width = src->getWidth(); + if (!dest) { + dest = src->clone(height, width, useGpu); + } else { + CHECK_EQ(dest->useGpu(), useGpu); + dest->resize(height, width); + } + MatrixPtr submat = src->subMatrix(startRow, copySize); + if (dynamic_cast(dest.get())) { + // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix. + // First copy it to CPU, and then copy it to the GPU. 
+ MatrixPtr tmp = src->clone(height, width, false); + tmp->copyFrom(*submat, stream); + dest->copyFrom(*tmp, stream); + } else { + dest->copyFrom(*submat, stream); + } + } else { + dest.reset(); + } +} + +static void resizeAndCopy(IVectorPtr& dest, + const IVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, + hl_stream_t stream = HPPL_STREAM_DEFAULT) { + if (src) { + CHECK_LE((size_t)startPos + copySize, src->getSize()); + + int height = copySize; + IVector::resizeOrCreate(dest, height, useGpu); + dest->copyFrom(src->getData() + startPos, height, stream); + } else { + dest.reset(); + } +} + +static void resizeAndCopy(ICpuGpuVectorPtr& dest, + const ICpuGpuVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, + hl_stream_t stream = HPPL_STREAM_DEFAULT) { + if (src) { + CHECK_LE((size_t)startPos + copySize, src->getSize()); + + ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu); + dest->copyFrom(*src, startPos, copySize, useGpu, stream); + } else { + dest.reset(); + } +} + +static void resizeAndCopy(SVectorPtr& dest, + const SVectorPtr& src, + bool useGpu, + hl_stream_t stream) { + if (src) { + size_t height = src->size(); + if (!dest) { + dest = std::make_shared>(height); + } else { + dest->resize(height); + } + std::copy_n(src->begin(), height, dest->begin()); + } else { + dest.reset(); + } +} + +static void resizeAndCopy(SVectorPtr& dest, + const SVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, + hl_stream_t stream = HPPL_STREAM_DEFAULT) { + if (src) { + CHECK_LE((size_t)startPos + copySize, src->size()); + size_t height = copySize; + if (!dest) { + dest = std::make_shared>(height); + } else { + dest->resize(height); + } + std::copy_n(src->begin() + startPos, height, dest->begin()); + } else { + dest.reset(); + } +} + +void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { + resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); +} + +void Argument::resizeAndCopyFrom(const Argument& src, + bool useGpu, + hl_stream_t stream) { + dataId = src.dataId; + resizeAndCopy(value, src.value, useGpu, stream); + resizeAndCopy(grad, src.grad, useGpu, stream); + resizeAndCopy(in, src.in, useGpu, stream); + resizeAndCopy(ids, src.ids, useGpu, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + false /* useGpu */, + stream); + if (src.hasSubseq()) { + resizeAndCopy(subSequenceStartPositions, + src.subSequenceStartPositions, + false /* useGpu */, + stream); + } + resizeAndCopy(strs, src.strs, useGpu, stream); + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; + frameDepth = src.frameDepth; +} + +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu) { + int32_t size = + resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return size; +} + +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, + hl_stream_t stream) { + dataId = src.dataId; + frameWidth = src.frameWidth; + frameHeight = src.frameHeight; + frameDepth = src.frameDepth; + + if (!src.sequenceStartPositions) { + // non-sequence input, copy samples directly + int32_t startRow = startSeq; + resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream); + resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream); + resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream); + 
resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream); + resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); + return copySize; + } else { + // sequence input + const int* sequence = src.sequenceStartPositions->getData(false); + int32_t startRow = sequence[startSeq]; // sample start from here + int32_t endRow = sequence[startSeq + copySize]; // sample end + int32_t copyFeatureSize = endRow - startRow; // num of samples + resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + startSeq, + copySize + 1, + false, + stream); + // modify new sequenceStartPositions + int* destSequences = sequenceStartPositions->getMutableData(false); + for (int i = 0; i < copySize + 1; i++) { + destSequences[i] -= startRow; + } + CHECK_EQ(destSequences[0], 0); + CHECK_EQ(destSequences[copySize], copyFeatureSize); + if (src.hasSubseq()) { + // sequence has sub-sequence + int* subSequence = src.subSequenceStartPositions->getMutableData(false); + int32_t subStartSeq = 0; + int32_t subEndSeq = 0; + int numSubSequences = src.getNumSubSequences(); + for (int i = 0; i < numSubSequences + 1; i++) { + if (subSequence[i] == startRow) { + subStartSeq = i; + } else if (subSequence[i] == endRow) { + subEndSeq = i; + break; + } + } + int32_t copySubSize = subEndSeq - subStartSeq; + resizeAndCopy(subSequenceStartPositions, + src.subSequenceStartPositions, + subStartSeq, + copySubSize + 1, + false, + stream); + // modify new subSequenceStartPositions + int* destSubSequences = subSequenceStartPositions->getMutableData(false); + for (int i = 0; i < copySubSize + 1; i++) { + destSubSequences[i] -= startRow; + } + CHECK_EQ(destSubSequences[0], 0); + CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize); + } + resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); + return copyFeatureSize; + } +} + +void Argument::concat(const std::vector& args, + const std::vector& selectRows, + const std::vector& seqStartPos, + const std::vector& copySize, + bool useGpu, + hl_stream_t stream, + PassType passType) { + CHECK(!subSequenceStartPositions) + << "undefined behavior for subsequence positions"; + + size_t batchSize = 0; + for (size_t i = 0; i < copySize.size(); ++i) + batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]); + + auto copyArg = [batchSize, stream](MatrixPtr& dst, + MatrixPtr src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { + if (!src) { + dst.reset(); + return; + } + size_t width = src->getWidth(); + if (!dst) { + dst = src->clone(batchSize, width, useGpu); + } else { + dst->resize(batchSize, width); + } + + MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size); + tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream); + }; + + auto copyIds = [batchSize, stream](IVectorPtr& dst, + const IVectorPtr& src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { + if (!src) { + dst.reset(); + return; + } + IVector::resizeOrCreate(dst, batchSize, useGpu); + dst->subVec(desStartRow, size) + ->copyFrom(*src->subVec(srcStartRow, size), stream); + }; + + auto copyStrs = [batchSize](SVectorPtr& dst, + const SVectorPtr& src, + int desStartRow, + int srcStartRow, + int size, + bool useGpu) { + if (!src) { + dst.reset(); + return; + } + 
if (!dst) { + dst = std::make_shared>(batchSize); + } else { + dst->resize(batchSize); + } + std::copy(src->begin() + srcStartRow, + src->begin() + srcStartRow + size, + dst->begin() + desStartRow); + }; + + dataId = args[0].dataId; + CHECK_NE(seqStartPos.size(), 0UL); + int desStartRow = 0; + for (size_t i = 0; i < copySize.size(); ++i) { + int startPos = seqStartPos[i]; + int endPos = seqStartPos[i + 1]; + CHECK_GE(args.size(), static_cast(endPos - startPos)); + for (int j = startPos; j < endPos; ++j) { + const Argument& arg = args[j - startPos]; + CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have " + << "the same dataId."; + const int srcStartRow = selectRows[j]; + copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu); + copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu); + if (passType != PASS_TEST) { + copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu); + } + copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu); + copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu); + desStartRow += copySize[i]; + } + } + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, seqStartPos.size(), useGpu); + sequenceStartPositions->copyFrom( + seqStartPos.data(), seqStartPos.size(), useGpu); +} + +void Argument::concat(const std::vector& args, + bool useGpu, + hl_stream_t stream, + PassType passType) { + int32_t batchSize = 0; + int64_t numSequences = 0; + int64_t numSubSequences = 0; + for (auto& arg : args) { + batchSize += arg.getBatchSize(); + numSequences += arg.getNumSequences(); + numSubSequences += arg.getNumSubSequences(); + } + + auto copyArg = [batchSize, stream]( + MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) { + if (!src) { + dst.reset(); + return; + } + size_t width = src->getWidth(); + if (!dst) { + dst = src->clone(batchSize, width, useGpu); + } else { + dst->resize(batchSize, width); + } + + MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight()); + tmpMatrix->copyFrom(*src, stream); + }; + + auto copyIds = [batchSize, stream]( + IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) { + if (!src) { + dst.reset(); + return; + } + IVector::resizeOrCreate(dst, batchSize, useGpu); + dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); + }; + + auto copyStrs = [batchSize]( + SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { + if (!src) { + dst.reset(); + return; + } + if (!dst) { + dst = std::make_shared>(batchSize); + } else { + dst->resize(batchSize); + } + std::copy(src->begin(), src->end(), dst->begin() + startRow); + }; + + auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq, + const ICpuGpuVectorPtr& srcSeq, + int dstNumSequences, + int srcNumSequences, + int& startSequences, + int startRow) { + if (srcSeq) { + ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); + const int* src = srcSeq->getData(false); + int* dest = dstSeq->getMutableData(false); + for (int i = 0; i < srcNumSequences + 1; ++i) { + dest[i + startSequences] = src[i] + startRow; + } + startSequences += srcNumSequences; + } else { + dstSeq.reset(); + } + }; + + int startRow = 0; + int startSequences = 0; + int startSubSequences = 0; + dataId = args[0].dataId; + for (auto& arg : args) { + CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" + << " same dataId"; + copyArg(in, arg.in, startRow, useGpu); + copyArg(value, arg.value, startRow, useGpu); + if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu); + 
copyIds(ids, arg.ids, startRow, useGpu); + copySequencePos(sequenceStartPositions, + arg.sequenceStartPositions, + numSequences, + arg.getNumSequences(), + startSequences, + startRow); + copySequencePos(subSequenceStartPositions, + arg.subSequenceStartPositions, + numSubSequences, + arg.getNumSubSequences(), + startSubSequences, + startRow); + copyStrs(strs, arg.strs, startRow, useGpu); + startRow += arg.getBatchSize(); + } +} + +void Argument::splitByDataId(const std::vector& argus, + std::vector>* arguGroups) { + arguGroups->clear(); + int lastDataId = -1; + for (const auto& argu : argus) { + if (argu.dataId == -1) { + // is -1, then create a new group + arguGroups->emplace_back(); + lastDataId = -1; + } else if (argu.dataId != lastDataId) { + // not -1, also not equal to last Argument, then create a new group + arguGroups->emplace_back(); + lastDataId = argu.dataId; + } else { + // not -1, and equal to last Argument, do nothing + } + arguGroups->back().push_back(argu); + } +} + +void Argument::getSeqInfo(std::vector* seqInfo) const { + const int* starts = sequenceStartPositions->getData(false); + const int* subStarts = + hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr; + size_t numSequences = getNumSequences(); + seqInfo->reserve(numSequences); + int subSeqEnd = 0; + for (size_t i = 0; i < numSequences; ++i) { + SeqInfo info; + info.seqStart = starts[i]; + info.subLevelLength = starts[i + 1] - starts[i]; + info.seqId = i; + if (hasSubseq()) { + info.subSeqStart = subSeqEnd; + while (subStarts[subSeqEnd] < starts[i + 1]) { + ++subSeqEnd; + } + info.topLevelLength = subSeqEnd - info.subSeqStart; + } else { + info.topLevelLength = info.subLevelLength; + info.subSeqStart = 0; // not used + } + seqInfo->push_back(info); + } + std::sort( + seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) { + return a.topLevelLength > b.topLevelLength; + }); +} + +void Argument::checkSubset() const { + if (getNumSequences() > getNumSubSequences()) { + LOG(FATAL) << "numSubSequences is less than numSequences (" + << getNumSubSequences() << " vs. 
" << getNumSequences() << ")"; + } + const int* start = sequenceStartPositions->getData(false); + const int* subStart = subSequenceStartPositions->getData(false); + int seqId = 0; + int subSeqId = 0; + while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) { + if (start[seqId] > subStart[subSeqId]) { + ++subSeqId; + } else if (start[seqId] == subStart[subSeqId]) { + ++subSeqId; + ++seqId; + } else { + LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; + } + } + if (seqId < getNumSequences()) { + LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; + } +} + +void Argument::degradeSequence(const Argument& input) { + CHECK_EQ(input.hasSubseq(), 1UL); + size_t numSequences = input.getNumSequences(); + size_t numSubSequences = input.getNumSubSequences(); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + int* tgtBuf = sequenceStartPositions->getMutableData(false); + const int* starts = input.sequenceStartPositions->getData(false); + const int* subStarts = input.subSequenceStartPositions->getData(false); + int seqId = 0; + for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) { + if (subStarts[subSeqId] == starts[seqId]) { + tgtBuf[seqId] = subSeqId; + seqId++; + } + } + tgtBuf[numSequences] = numSubSequences; +} + +void Argument::poolSequenceWithStride(const Argument& input, + size_t stride, + ICpuGpuVectorPtr* stridePostions, + bool reversed) { + // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, + // then sequenceStartPositions = [0, 2, 3, 4, 7]. + // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30]; + // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30] + + CHECK(input.sequenceStartPositions); + CHECK_EQ(input.hasSubseq(), 0UL); + CHECK_GT(stride, 0UL) << "stride must larger than 0"; + size_t numSequences = input.getNumSequences(); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); + const int* starts = input.sequenceStartPositions->getData(false); + int* tgtBuf = sequenceStartPositions->getMutableData(false); + // first index of target sequence and stride positions are both 0 + tgtBuf[0] = 0; + std::vector stridePos; + for (size_t seqId = 0; seqId < numSequences; ++seqId) { + size_t seqLength = starts[seqId + 1] - starts[seqId]; + stridePos.emplace_back(starts[seqId]); + if (seqLength == 0) { + // empty sequence + tgtBuf[seqId + 1] = tgtBuf[seqId]; + } else { + int size = ceil((float)seqLength / stride); + tgtBuf[seqId + 1] = tgtBuf[seqId] + size; + for (int i = 0; i < size - 1; ++i) { + int cur = reversed ? 
starts[seqId + 1] - (size - 1 - i) * stride + : stridePos.back() + stride; + stridePos.emplace_back(cur); + } + } + } + stridePos.emplace_back(starts[numSequences]); + int size = stridePos.size(); + CHECK_EQ(size - 1, tgtBuf[numSequences]); + ICpuGpuVector::resizeOrCreate(*stridePostions, size, false); + (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size); +} + +void Argument::getValueString( + std::unordered_map* out) const { + if (value) { + std::ostringstream os; + value->print(os); + out->insert({"value", os.str()}); + } + if (ids) { + std::ostringstream os; + ids->print(os, ids->getSize()); + out->insert({"ids", os.str()}); + } + if (sequenceStartPositions) { + std::ostringstream os; + sequenceStartPositions->getVector(false)->print( + os, sequenceStartPositions->getSize()); + out->insert({"sequence pos", os.str()}); + } + if (subSequenceStartPositions) { + std::ostringstream os; + subSequenceStartPositions->getVector(false)->print( + os, subSequenceStartPositions->getSize()); + out->insert({"sub-sequence pos", os.str()}); + } +} + +void Argument::printValueString(std::ostream& stream, + const std::string& prefix) const { + std::unordered_map out; + getValueString(&out); + for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) { + auto it = out.find(field); + if (it != out.end()) { + stream << prefix << field << ":\n" << it->second; + } + } +} + +void Argument::subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans, + bool seqFlag, + size_t seqStart, + size_t seqSize) { + if (input.value) { + value = Matrix::create( + input.value->getData() + offset * width, height, width, trans, useGpu); + } + if (input.ids) { + ids = IVector::create(input.ids->getData() + offset, height, useGpu); + } + if (input.grad) { + grad = Matrix::create( + input.grad->getData() + offset * width, height, width, trans, useGpu); + } + if (seqFlag) { + sequenceStartPositions = std::make_shared( + *(input.sequenceStartPositions), seqStart, seqSize); + } +} + +void Argument::reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector>& reorganizedSeqInfo) { + CHECK(seqStartPos); + reorganizedSeqInfo.clear(); + + int seqNum = seqStartPos->getSize() - 1; + int* seqStarts = seqStartPos->getMutableData(false); + + if (subSeqStartPos) { + int* subSeqStarts = subSeqStartPos->getMutableData(false); + reorganizedSeqInfo.resize(seqNum, std::vector()); + int seqIdx = 0; + for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { + seqIdx++; + if (seqIdx == seqNum) return; + reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); + } + } + } else { + reorganizedSeqInfo.resize(1, std::vector(seqNum + 1, 0)); + memcpy(reorganizedSeqInfo[0].data(), + seqStarts, + sizeof(int) * seqStartPos->getSize()); + } +} + +} // namespace paddle diff --git a/paddle/legacy/parameter/Argument.h b/paddle/legacy/parameter/Argument.h new file mode 100644 index 0000000000000000000000000000000000000000..ea8634896c18c7c3516c0d584aec4b475d626e61 --- /dev/null +++ b/paddle/legacy/parameter/Argument.h @@ -0,0 +1,349 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "hl_gpu.h" + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +typedef std::shared_ptr> SVectorPtr; + +struct Argument { + Argument() + : in(nullptr), + value(nullptr), + ids(nullptr), + grad(nullptr), + strs(nullptr), + frameHeight(0), + frameWidth(0), + frameDepth(0), + sequenceStartPositions(nullptr), + subSequenceStartPositions(nullptr), + cpuSequenceDims(nullptr), + deviceId(-1), + allCount(0), + valueCount(0), + gradCount(0), + dataId(0) {} + Argument(const Argument& argument) { + *this = argument; + valueCount = 0; + gradCount = 0; + dataId = argument.dataId; + } + ~Argument() {} + + void operator=(const Argument& argument) { + in = argument.in; + value = argument.value; + ids = argument.ids; + grad = argument.grad; + strs = argument.strs; + sequenceStartPositions = argument.sequenceStartPositions; + subSequenceStartPositions = argument.subSequenceStartPositions; + cpuSequenceDims = argument.cpuSequenceDims; + deviceId = argument.deviceId; + allCount = argument.allCount; + frameHeight = argument.frameHeight; + frameWidth = argument.frameWidth; + frameDepth = argument.frameDepth; + dataId = argument.dataId; + } + + MatrixPtr in; // used if needed + MatrixPtr value; + IVectorPtr ids; // a sequence of ids. Can be use for class id for costLayer + MatrixPtr grad; // If empty, gradient is not needed. + SVectorPtr strs; + + // A dataBatch includes batchSize frames, one frame maybe not only vector + size_t frameHeight; + size_t frameWidth; + size_t frameDepth; + + // If NULL, each position is treated independently. + // Otherwise, its size should be #NumberOfSequences + 1. + // The first position is always 0 and + // the last position should be equal to batchSize. + ICpuGpuVectorPtr sequenceStartPositions; + + // If NULL, each sequence has no subsequence. + // Otherwise, its size should be #NumberOfSubSequences + 1. + // The first position is always 0 and + // the last position should be equal to batchSize. + ICpuGpuVectorPtr subSequenceStartPositions; + + // dimension of sequence, stored only in CPU + IVectorPtr cpuSequenceDims; + + int deviceId; // the GPU device id which the argument in + int allCount; // the number of output layers using this argument + mutable int valueCount; // waiting this member when layer do forward + mutable int gradCount; // waiting this member when layer do backward + mutable LockedCondition valueReadyCond; + mutable LockedCondition gradReadyCond; + + int dataId; // dataProvider id + + /* Increase the reference count of the argument. 
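Every output layer that consumes this argument is expected to call it once, so allCount records how many consumers the value/gradient ready conditions below wait on.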
*/ + void countIncrement() { allCount++; } + + int getAllCount() const { return allCount; } + + void waitValueReady() const { + valueReadyCond.wait([this] { return (valueCount != 0); }); + + std::lock_guard guard(*valueReadyCond.mutex()); + valueCount--; + } + + void notifyValueReady() const { + valueReadyCond.notify_all([this] { valueCount = allCount; }); + } + + void waitGradReady() const { + gradReadyCond.wait([this] { return (gradCount == allCount); }); + gradCount = 0; + } + + void notifyGradReady() const { + gradReadyCond.notify_all([this] { gradCount++; }); + } + + int64_t getBatchSize() const { + if (value) return value->getHeight(); + if (ids) return ids->getSize(); + if (grad) return grad->getHeight(); + if (in) return in->getHeight(); + if (strs) return strs->size(); + return 0; + } + size_t getFrameHeight() const { return frameHeight; } + size_t getFrameWidth() const { return frameWidth; } + size_t getFrameDepth() const { return frameDepth; } + void setFrameHeight(size_t h) { frameHeight = h; } + void setFrameWidth(size_t w) { frameWidth = w; } + void setFrameDepth(size_t d) { frameDepth = d; } + + int64_t getNumSequences() const { + return sequenceStartPositions ? sequenceStartPositions->getSize() - 1 + : getBatchSize(); + } + + int64_t getNumSubSequences() const { + return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1 + : getBatchSize(); + } + + bool hasSeq() const { return sequenceStartPositions != nullptr; } + bool hasSubseq() const { return subSequenceStartPositions != nullptr; } + + const int* getCpuStartPositions() const { + return hasSubseq() ? subSequenceStartPositions->getData(false) + : sequenceStartPositions->getData(false); + } + + static inline real sum(const std::vector& arguments) { + real cost = 0; + for (auto& arg : arguments) { + if (arg.value) { + SetDevice device(arg.deviceId); + cost += arg.value->getSum(); + } + } + return cost; + } + + /** + * @brief (value, ids, grad, sequenceStartPositions) of output are subset of + * input. Note that, output share the same memory of input. 
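+ * For example, output.subArgFrom(input, 2, 3, width, useGpu) makes
+ * output.value a view over rows 2..4 of input.value, so writes through
+ * output are also visible in input; no data is copied.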
+ * + * @param input[in] input + * @param offset[in] offset in terms of rows + * @param height[in] height of output.value + * @param width[in] width of output.value + * @param useGpu[in] + * @param trans[in] whether input.value is transform + * @param seqFlag[in] whether input has sequenceStartPositions + * @param seqStart[in] offset of input.sequenceStartPositions + * @param seqSize[in] lenght of output.sequenceStartPositions + */ + void subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans = false, + bool seqFlag = false, + size_t seqStart = 0, + size_t seqSize = 0); + /* + * for sequence input: + * startSeq: the sequence id of start + * copySize: how many sequences need to copy + * return value: how many samples are copied + * for non-sequence input: + * startSeq: the sample id of start + * copySize: how many samples need to copy + * return value: how many samples are copied + * Note that when specifying the stream explicitly in this case, + * synchronize should also be called somewhere after this function + */ + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, + hl_stream_t stream); + + /* + * same with the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu = FLAGS_use_gpu); + + void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); + + /* + * same with the above function, except that the stream is + * HPPL_STREAM_DEFAULT and synchronize is automatically called + * inside it + */ + void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu); + + /* + @brief Concatenate several arguments into one and put the result into it. + @param args : a vector of argument, each element of which is a frame in a + batch of sequences. + @param selectRows : select several row of args to concatenate + @param seqStartPos : sequence start positions in the final Argument + @param hl_stream_t : cuda stream + @param passTyoe : type of task, training or testing + */ + void concat(const std::vector& args, + const std::vector& selectRows, + const std::vector& seqStartPos, + const std::vector& copySize, + bool useGpu, + hl_stream_t stream, + PassType passType); + + /* + Concatenate several args into one and put the result into this. + */ + void concat(const std::vector& src, + bool useGpu = FLAGS_use_gpu, + hl_stream_t stream = HPPL_STREAM_DEFAULT, + PassType passType = PASS_TEST); + + /* + * split vector to several vectors according to dataId + */ + static void splitByDataId(const std::vector& argus, + std::vector>* arguGroups); + + struct SeqInfo { + // Equal to sequence length for sequence data + // Equal to number of subsequences for subsequence data + int topLevelLength; + + int seqStart; + int seqId; + + // Equal to topLevelLength for sequence data + // Equal to sum of the length of subsequences for subsequence data + int subLevelLength; + + // Only used for subsequence data, start position of this sequence + // is subSequenceStartPositions, i.e. + // subSequenceStartPositions[subSeqStart] == seqStart + int subSeqStart; + }; + /* + Get SeqInfo for each sequence of this argument + Elements in *seqInfo are sorted by topLevelLength in descending order + */ + void getSeqInfo(std::vector* segInfo) const; + + /* + Check Whether sequenceStartPositions is subset of + subSequenceStartPositions. 
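+     In other words, every entry of sequenceStartPositions must also appear in
+     subSequenceStartPositions; if one does not, checkSubset() aborts with
+     LOG(FATAL).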
+ */ + void checkSubset() const; + + /* + sequence has sub-sequence degrades to a sequence. + */ + void degradeSequence(const Argument& input); + + /* + After pooling with stride n (n is smaller than sequence length), + a long sequence will be shorten. + This function is invalid for sequence having sub-sequence. + */ + void poolSequenceWithStride(const Argument& input, + size_t stride, + ICpuGpuVectorPtr* stridePositions, + bool reversed = false); + /** + * @brief getValueString will return the argument's output in string. There + * are several kinds of output. The keys of output dictionary are 'value', + * 'id', 'sequence pos', 'sub-sequence pos'. + * @param out [out]: the return values. + */ + void getValueString(std::unordered_map* out) const; + + /** + * @brief printValueString will print the argument's output in order of + * 'value', 'id', 'sequence pos', 'sub-sequence pos'. + * @param stream: Output stream + * @param prefix: line prefix for printing. + */ + void printValueString(std::ostream& stream, + const std::string& prefix = "") const; + + /** + * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and + * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo. + * + * @param seqStartPos: sequenceStartPositions of an Argument. + * @param subSeqStartPos: subSequenceStartPositions of an Argument. + * @param the reorganized sequence start position information. + * + * Examples: + * seqStartPos: [0, 4, 15, 20, 28] + * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28] + * reorganizedSeqInfo: + * [ + * [0,3,4], + * [4,5,7,10,15], + * [15,20], + * [20,22,23,25,28] + * ] + */ + static void reorganizeSeqInfo( + const ICpuGpuVectorPtr seqStartPos, + const ICpuGpuVectorPtr subSeqStartPos, + std::vector>& reorganizedSeqInfo); +}; + +} // namespace paddle diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/legacy/parameter/AverageOptimizer.cpp similarity index 100% rename from paddle/parameter/AverageOptimizer.cpp rename to paddle/legacy/parameter/AverageOptimizer.cpp diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/legacy/parameter/AverageOptimizer.h similarity index 100% rename from paddle/parameter/AverageOptimizer.h rename to paddle/legacy/parameter/AverageOptimizer.h diff --git a/paddle/parameter/CMakeLists.txt b/paddle/legacy/parameter/CMakeLists.txt similarity index 100% rename from paddle/parameter/CMakeLists.txt rename to paddle/legacy/parameter/CMakeLists.txt diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.cpp b/paddle/legacy/parameter/FirstOrderOptimizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f82a115f7bb467737b53b9891d88d3c4f501faf --- /dev/null +++ b/paddle/legacy/parameter/FirstOrderOptimizer.cpp @@ -0,0 +1,330 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "FirstOrderOptimizer.h" +#include "paddle/legacy/math/TrainingAlgorithmOp.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Util.h" + +#include + +DEFINE_bool(log_clipping, false, "enable log clipping or not"); + +namespace paddle { + +SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer( + const OptimizationConfig& optConfig) + : ParameterOptimizer(optConfig) { + addParameterType(PARAMETER_MOMENTUM); + addParameterType(PARAMETER_MOMENTUM_UT); + addParameterType(PARAMETER_MOMENTUM_VT); + alpha_ = 1; + beta_ = 1; + tau_ = -1; + threshold_ = 1e+06; +} + +void SparseMomentumParameterOptimizer::init(size_t numRows, + const ParameterConfig* config) { + isParameterSparse_ = numRows != 0; + t0Vec_.resize(numRows); + t0Vec_.assign(t0Vec_.size(), 0); + timer_ = 0; + momentum_ = config->momentum(); + decayRate_ = config->decay_rate(); + gamma_ = config->learning_rate(); +} + +void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) { + learningRate_ = calcLearningRate(numSamplesProcessed, pass_); + if (isParameterSparse_) { + tau_ = tau_ + beta_ / alpha_; + alpha_ = alpha_ / momentum_; + beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_); + } +} + +void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + size_t sparseId) const { + if (sparseId != -1LU) { + CHECK_LT(sparseId, t0Vec_.size()); + if (t0Vec_[sparseId] == 0) { + vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); + t0Vec_[sparseId] = 1; + } + vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], + -alpha_ * gamma_ * learningRate_); + vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], + tau_ * alpha_ * gamma_ * learningRate_); + vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], + tau_ / beta_ + 1.0 / alpha_, + *vecs[PARAMETER_MOMENTUM_VT], + 1.0 / beta_); + + } else { + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + learningRate_ * paraConfig.learning_rate(), + paraConfig.momentum(), + applyDecay_ ? paraConfig.decay_rate() : 0); + } +} + +ParameterOptimizer::TraverseCallback +SparseMomentumParameterOptimizer::needSpecialTraversal( + const ParameterConfig& config) const { + if (alpha_ > threshold_ && isParameterSparse_) { + // Restart to avoid large value multiplication + // 1. \alpha = 1, \beta = 1, \tau = 0 + // 2. 
Note that \tau * u_t + v_t = \beta \theta_t, therefore: + // u_t should be rescaled to u_t/alpha_ + // v_t should be reset to \theta_t + return [this](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { + vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_); + vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); + }; + } else { + return nullptr; + } +} + +void SparseMomentumParameterOptimizer::finishBatch() { + timer_++; + if (!isParameterSparse_) return; + if (alpha_ > threshold_) { + alpha_ = 1; + beta_ = 1; + tau_ = -1; + } +} + +void AdagradParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + adagradApply(value, + grad, + mom, + accum_buffer, + accum, + lr, + epsilon, + learningRate, + momentum, + decayRate); +} + +ParameterOptimizer::TraverseCallback +AdagradParameterOptimizer::needSpecialTraversal( + const ParameterConfig& config) const { + if (numUpdates_ % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision + // due to too many sums. + return [](const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) { + vecs[PARAMETER_GRADIENT_SQURESUM]->add( + *vecs[PARAMETER_GRADIENT_SQURESUM1]); + vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem(); + }; + } else { + return nullptr; + } +} + +void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + CHECK(sparseId == -1LU) << "Sparse update is not supported"; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + adadeltaApply(value, + grad, + mom, + accum, + accum_update, + lr, + rou_, + epsilon_, + learningRate, + momentum, + decayRate); +} + +void RMSPropParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real accumulatedRou = rou_; + bool firstTime = timer_ == 0; + if (sparseId != -1LU) { + CHECK_LT(sparseId, t0Vec_.size()); + accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); + firstTime = t0Vec_[sparseId] == 0; + t0Vec_[sparseId] = timer_ + 1; + } + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? 
config.decay_rate() : 0; + + rmspropApply(value, + grad, + mom, + sum, + sum1, + lr, + accumulatedRou, + rou_, + epsilon, + learningRate, + momentum, + decayRate, + firstTime); +} + +void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real accumulatedRou = rou_; + bool firstTime = timer_ == 0; + if (sparseId != -1LU) { + CHECK_LT(sparseId, t0Vec_.size()); + accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); + firstTime = t0Vec_[sparseId] == 0; + t0Vec_[sparseId] = timer_ + 1; + } + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + decayedAdagradApply(value, + grad, + mom, + sum, + lr, + accumulatedRou, + rou_, + epsilon, + learningRate, + momentum, + decayRate, + firstTime); +} + +void AdamParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + CHECK(sparseId == -1UL) << "Sparse update is not supported"; + + real beta1_power = std::pow(beta1_, step_); + real beta2_power = std::pow(beta2_, step_); + real learningRate = config.learning_rate() * learningRate_; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM]; + + adamApply(value, + grad, + mom, + v, + beta1_, + beta2_, + beta1_power, + beta2_power, + epsilon_, + learningRate); +} + +void AdamaxParameterOptimizer::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + CHECK(sparseId == -1UL) << "Sparse update is not supported"; + real learningRate = config.learning_rate() * learningRate_; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM]; + + adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate); +} + +void OptimizerWithGradientClipping::update(const VectorPtr vecs[], + const ParameterConfig& config, + size_t sparseId) const { + real globalThreshold = optConfig_.gradient_clipping_threshold(); + real localThreshold = config.gradient_clipping_threshold(); + + // Use local gradient clipping threshold if it's enabled, + // otherwise using the global one. + real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold; + std::string field = localThreshold > 0.0f ? 
"local" : "global"; + + real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax(); + if (maxAbsGrad > threshold) { + if (FLAGS_log_clipping) { + real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() / + vecs[PARAMETER_GRADIENT]->getSize(); + LOG(INFO) << "parameter=" << config.name() << " need clipping by " + << field << " threshold=" << threshold + << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad; + } + vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold); + } + optimizer_->update(vecs, config, sparseId); +} + +} // namespace paddle diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/legacy/parameter/FirstOrderOptimizer.h similarity index 100% rename from paddle/parameter/FirstOrderOptimizer.h rename to paddle/legacy/parameter/FirstOrderOptimizer.h diff --git a/paddle/legacy/parameter/LearningRateScheduler.cpp b/paddle/legacy/parameter/LearningRateScheduler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..68c44a7ec49f64a1085609d906441c9ed4502888 --- /dev/null +++ b/paddle/legacy/parameter/LearningRateScheduler.cpp @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "LearningRateScheduler.h" +#include "paddle/legacy/utils/StringUtil.h" + +namespace paddle { + +ClassRegistrar + LearningRateScheduler::registrar_; + +LearningRateScheduler* LearningRateScheduler::create( + const OptimizationConfig& config) { + return registrar_.createByType(config.learning_rate_schedule(), config); +} + +// LRS stands for LearningRateScheduler + +class BaseLRS : public LearningRateScheduler { + public: + explicit BaseLRS(const OptimizationConfig& config) + : learningRate_(config.learning_rate()), + a_(config.learning_rate_decay_a()), + b_(config.learning_rate_decay_b()) {} + + protected: + real learningRate_; + real a_; + real b_; +}; + +class ConstLRS : public BaseLRS { + public: + explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + return learningRate_; + } +}; +REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS); + +class PolyLRS : public BaseLRS { + public: + explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_); + } +}; +REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS); + +class CaffePolyLRS : public BaseLRS { + public: + explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + if (numSamplesProcessed > a_) { + LOG_FIRST_N(WARNING, 1) + << "Using caffe_poly learning rate schedule, " + << "learning rate hits ZERO when " + << "numSamplesProcessed > config.learning_rate_decay_b(), " + << "training is over and you can stop it. 
" + << "See common/LearningRateScheduler.cpp for more info."; + return 0; + } else { + return learningRate_ * pow(1.0 - numSamplesProcessed / a_, b_); + } + } +}; +REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS); + +class ExpLRS : public BaseLRS { + public: + explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + double decayRatio = (double)numSamplesProcessed / b_; + return learningRate_ * pow(a_, decayRatio); + } +}; +REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS); + +class DiscreteExpLRS : public BaseLRS { + public: + explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + int numDecays = floor(numSamplesProcessed / b_); + return learningRate_ * pow(a_, numDecays); + } +}; +REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS); + +class LinearLRS : public BaseLRS { + public: + explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + return std::max(learningRate_ - a_ * numSamplesProcessed, b_); + } +}; +REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS); + +/* + specify learning rate through + learning_rate_args = 'seg0:rate0,seg1:rate1,...,segK:rateK' + if seg_{i-1} <= numSamples <= seg_i, + then learning_rate = learning_rate_base * rate_i +*/ +class ManualLRS : public BaseLRS { + public: + explicit ManualLRS(const OptimizationConfig& config) + : BaseLRS(config), currentSegment_(0), lastNum_(0) { + std::vector pieces; + str::split(config.learning_rate_args(), ',', &pieces); + rates_.reserve(pieces.size()); + std::string s1, s2; + + for (auto& piece : pieces) { + auto pos = piece.find(':'); + CHECK(pos != std::string::npos) << "Wrong format for learning_rate_args: " + << config.learning_rate_args(); + segments_.push_back(str::to(piece.substr(0, pos))); + rates_.push_back(str::to(piece.substr(pos + 1))); + } + } + + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + return calc(numSamplesProcessed); + } + + real calc(int64_t num) { + // We assume that num never decreases. + CHECK_LE(lastNum_, num); + lastNum_ = num; + while (currentSegment_ < rates_.size()) { + if (num <= segments_[currentSegment_]) { + return learningRate_ * rates_[currentSegment_]; + } + ++currentSegment_; + if (currentSegment_ < rates_.size()) { + LOG(INFO) << " learning_rate changes to " + << learningRate_ * rates_[currentSegment_]; + } + } + return learningRate_ * rates_.back(); + } + + protected: + std::vector rates_; + std::vector segments_; + size_t currentSegment_; + int64_t lastNum_; +}; + +REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS); + +class PassManualLRS : public ManualLRS { + public: + explicit PassManualLRS(const OptimizationConfig& config) + : ManualLRS(config) {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { + return calc(pass); + } +}; + +REGISTER_LEARNING_RATE_SCHEDULER(pass_manual, PassManualLRS); +} // namespace paddle diff --git a/paddle/legacy/parameter/LearningRateScheduler.h b/paddle/legacy/parameter/LearningRateScheduler.h new file mode 100644 index 0000000000000000000000000000000000000000..fc7e380a6af58577f4ba319d85522535b8f93a45 --- /dev/null +++ b/paddle/legacy/parameter/LearningRateScheduler.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "TrainerConfig.pb.h" +#include "paddle/legacy/utils/ClassRegistrar.h" + +namespace paddle { +// NOLINTNEXTLINES_4 +#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + LearningRateScheduler::registrar_.registerClass<__class_name>( \ + #__type_name); \ + }) + +class LearningRateScheduler { + public: + static LearningRateScheduler* create(const OptimizationConfig& config); + virtual ~LearningRateScheduler() {} + virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0; + + static ClassRegistrar registrar_; +}; + +} // namespace paddle diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/legacy/parameter/OptimizerFunctions.cpp similarity index 100% rename from paddle/parameter/OptimizerFunctions.cpp rename to paddle/legacy/parameter/OptimizerFunctions.cpp diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/legacy/parameter/OptimizerFunctions.h similarity index 100% rename from paddle/parameter/OptimizerFunctions.h rename to paddle/legacy/parameter/OptimizerFunctions.h diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/legacy/parameter/OptimizerWithRegularizer.cpp similarity index 100% rename from paddle/parameter/OptimizerWithRegularizer.cpp rename to paddle/legacy/parameter/OptimizerWithRegularizer.cpp diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/legacy/parameter/OptimizerWithRegularizer.h similarity index 100% rename from paddle/parameter/OptimizerWithRegularizer.h rename to paddle/legacy/parameter/OptimizerWithRegularizer.h diff --git a/paddle/legacy/parameter/Parameter.cpp b/paddle/legacy/parameter/Parameter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..666d808f0c13c5c828c51b2a36ee9d05f7f78c13 --- /dev/null +++ b/paddle/legacy/parameter/Parameter.cpp @@ -0,0 +1,425 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
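The REGISTER_LEARNING_RATE_SCHEDULER macro in LearningRateScheduler.h registers each schedule with the class registrar so that LearningRateScheduler::create() can resolve it by name. As a sketch of how a new schedule would plug in, mirroring ConstLRS above (the HalvingLRS class, its "halving" key, and its reuse of learning_rate_decay_b are all invented for this example):

#include <cmath>

#include "LearningRateScheduler.h"

namespace paddle {

// Hypothetical schedule: halve the base rate every `halveEvery_` samples.
class HalvingLRS : public LearningRateScheduler {
 public:
  explicit HalvingLRS(const OptimizationConfig& config)
      : learningRate_(config.learning_rate()),
        halveEvery_(config.learning_rate_decay_b()) {}

  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
    int64_t numHalvings =
        halveEvery_ > 0 ? numSamplesProcessed / (int64_t)halveEvery_ : 0;
    return learningRate_ * std::pow(0.5, numHalvings);
  }

 protected:
  real learningRate_;
  real halveEvery_;
};
// After this, learning_rate_schedule == "halving" resolves to HalvingLRS
// through the registrar used by LearningRateScheduler::create().
REGISTER_LEARNING_RATE_SCHEDULER(halving, HalvingLRS);

}  // namespace paddle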
*/ + +#include "Parameter.h" +#include +#include +#include "AverageOptimizer.h" +#include "FirstOrderOptimizer.h" +#include "OptimizerFunctions.h" +#include "OptimizerWithRegularizer.h" +#include "ParameterUpdateFunctions.h" +#include "ThreadLocalBuffer.h" +#include "hl_gpu.h" +#include "paddle/legacy/math/CpuSparseMatrix.h" +#include "paddle/legacy/math/MathUtils.h" +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/legacy/utils/Logging.h" + +DEFINE_int32(enable_grad_share, + (100 * 1024 * 1024), + "threshold for enable gradient parameter share for batch " + "multi-cpu training"); +DEFINE_int32( + grad_share_block_num, + 64, + "block number of gradient parameter share for batch multi-cpu training"); + +namespace paddle { + +const std::string Parameter::kMissParameterFail = "fail"; +const std::string Parameter::kMissParameterRand = "rand"; +const std::string Parameter::kMissParameterZero = "zero"; + +Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit) + : config_(config), + useGpu_(useGpu), + deviceId_(-1), + sharedCount_(0), + updateCounter_(0), + updated_(false), + headerFormat_(PARAM_FORMAT_ORIGINAL) { + setID(-1); /* capture uninitialized id */ + if (useGpu_ && FLAGS_parallel_nn) { + /* gpu environment is specified by device property */ + deviceId_ = config_.device(); + if (deviceId_ < 0) { + useGpu_ = false; + } + } + + if (doInit) { + initialize(); + } + + for (int i = 0; i < config.update_hooks_size(); ++i) { + this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i)); + } +} + +void Parameter::initialize() { + SetDevice device(deviceId_); + + bufs_[PARAMETER_VALUE] = + Vector::createParallelVector(config_.size(), useGpu_); + bufs_[PARAMETER_VALUE]->zeroMem(); + + if (config_.is_sparse()) { + enableSparseParameter(); + } + + if (!isStatic()) { + bufs_[PARAMETER_GRADIENT] = + Vector::createParallelVector(config_.size(), useGpu_); + bufs_[PARAMETER_MOMENTUM] = + Vector::createParallelVector(config_.size(), useGpu_); + + bufs_[PARAMETER_GRADIENT]->zeroMem(); + bufs_[PARAMETER_MOMENTUM]->zeroMem(); + } +} + +void Parameter::randomize(const VectorPtr& value, + const ParameterConfig& config) { + if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) { + // initialize the parameter as uniform distribution + real initial_min = config.initial_mean() - config.initial_std(); + real initial_max = config.initial_mean() + config.initial_std(); + value->uniform(initial_min, initial_max); + VLOG(1) << config.name() << ": initial_min=" << initial_min + << ", initial_max=" << initial_max; + } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) { + /* Initialize the parameters randomly */ + value->randnorm(config.initial_mean(), config.initial_std()); + VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean() + << ", initial_std=" << config.initial_std(); + } else { + LOG(FATAL) << "not supported initial_strategy: " + << config.initial_strategy(); + } +} + +void Parameter::randomize() { + if (!bufs_[PARAMETER_VALUE]) return; + SetDevice device(deviceId_); + Parameter::randomize(bufs_[PARAMETER_VALUE], config_); + + if (config_.is_sparse()) { + if (format_ == SPARSE_CSC) { + sparseRand(intBufs_[PARAMETER_COLS]->getData(), + intBufs_[PARAMETER_ROWS]->getData(), + config_.size(), + config_.dims(1) + 1, + config_.dims(0), + useGpu_); + } else { + sparseRand(intBufs_[PARAMETER_ROWS]->getData(), + intBufs_[PARAMETER_COLS]->getData(), + config_.size(), + config_.dims(0) + 1, + config_.dims(1), + useGpu_); + } + } + 
setValueUpdated(); +} + +void Parameter::zeroMem() { + if (!bufs_[PARAMETER_VALUE]) return; + bufs_[PARAMETER_VALUE]->zeroMem(); + setValueUpdated(); + LOG(INFO) << getName() << " set to 0"; +} + +bool Parameter::isGradShared(size_t* blockNum) { + if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 && + !isGradSparseUpdate() && + this->getSize() > (size_t)FLAGS_enable_grad_share) { + if (blockNum) { + *blockNum = (size_t)FLAGS_grad_share_block_num; + } + return true; + } + return false; +} + +bool Parameter::isValueShared() { + return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1; +} + +bool Parameter::isGradSparseUpdate() const { + return !useGpu_ && !isStatic() && + (config_.sparse_update() || config_.sparse_remote_update()); +} + +void Parameter::setMat(ParameterType pType, int matType) { + CHECK(!mats_[pType]); + + if (config_.dims_size() == 0 && matType == MAT_NORMAL) { + return; + } + + CHECK_EQ((size_t)config_.dims_size(), 2LU); + size_t height = config_.dims(0); + size_t width = config_.dims(1); + if (matType == MAT_NORMAL) { + if (!config_.is_sparse()) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + mats_[pType] = + Matrix::create(bufs_[pType]->getMemoryHandle(), height, width); + } else { + size_t size = bufs_[pType]->getSize(); + CHECK_GE(height * width, size); + if (format_ == SPARSE_CSR) { + CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize()); + CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize()); + } else { + CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize()); + CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize()); + } + mats_[pType] = + Matrix::createSparseMatrix(bufs_[pType]->getData(), + intBufs_[PARAMETER_ROWS]->getData(), + intBufs_[PARAMETER_COLS]->getData(), + height, + width, + bufs_[pType]->getSize(), + FLOAT_VALUE, + format_, + false, + useGpu_); + } + } +#ifndef PADDLE_MOBILE_INFERENCE + // NOLINTNEXTLINE + else if (matType == MAT_NORMAL_SHARED) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + size_t blockNum = 0; + CHECK(isGradShared(&blockNum)); + mats_[pType] = std::make_shared( + blockNum, + std::dynamic_pointer_cast( + bufs_[pType]->getMemoryHandle()), + height, + width); + } else if (matType == MAT_VALUE_SHARED) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + mats_[pType] = std::make_shared( + std::dynamic_pointer_cast( + bufs_[pType]->getMemoryHandle()), + height, + width); + } else if (matType == MAT_SPARSE_ROW_IDS) { + CHECK_EQ(height * width, bufs_[pType]->getSize()); + mats_[pType] = std::make_shared( + std::dynamic_pointer_cast( + bufs_[pType]->getMemoryHandle()), + height, + width); + } else if (matType == MAT_SPARSE_ROW) { + auto valueMat = + std::dynamic_pointer_cast(mats_[PARAMETER_VALUE]); + SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr); + if (pType != PARAMETER_VALUE) { + CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set " + << " and its type must be MAT_SPARSE_ROW," + << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW"; + indexDict = valueMat->getIndexDictHandle(); + } + auto mat = + std::make_shared(nullptr, + height, + width, + // grad share index with value + indexDict); + mats_[pType] = mat; + } else if (matType == MAT_CACHE_ROW) { + CHECK(isGradSparseUpdate()); + auto mat = std::make_shared(height, width); + mats_[pType] = mat; + } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || + matType == MAT_SPARSE_ROW_PREFETCH) { + auto mat = std::make_shared( + bufs_[pType] ? 
std::dynamic_pointer_cast( + bufs_[pType]->getMemoryHandle()) + : nullptr, + height, + width, + nullptr, // indexDictHandle + getGlobalSyncThreadPool()); + mats_[pType] = mat; + } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { + CHECK(isGradSparseUpdate()); + mats_[pType] = std::make_shared(height, width); + } +#endif + // NOLINTNEXTLINE + else { + LOG(FATAL) << "Unsupported mat type" << matType; + } +} + +void Parameter::incUpdate(const UpdateCallback& callback) { + // Static parameter is fixed, and does not need to be updated + if (isStatic()) { + return; + } + + ++updateCounter_; + if (isUpdatable()) { + if (callback) callback(this); + clearUpdate(); + } +} + +bool Parameter::save(const std::string& filename) const { + std::ofstream fs(filename, std::ios_base::binary); + CHECK(fs) << "Fail to open " << filename; + return save(fs); +} + +bool Parameter::save(std::ostream& s) const { + CpuVector vec(*bufs_[PARAMETER_VALUE].get()); + Header header; + header.format = headerFormat_; + header.valueSize = sizeof(real); + header.size = getSize(); + + CHECK_EQ(header.size, vec.getSize()); + + CHECK(s.write(reinterpret_cast(&header), sizeof(header))) + << "Fail to write parameter " << getName(); + + CHECK(s.write(reinterpret_cast(vec.getData()), + header.size * sizeof(real))) + << "Fail to write parameter " << getName(); + if (config_.is_sparse()) { + CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); + CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); + CHECK(s.write(reinterpret_cast(rows.getData()), + rows.getSize() * sizeof(int))) + << "Fail to write parameter " << getName(); + CHECK(s.write(reinterpret_cast(cols.getData()), + cols.getSize() * sizeof(int))) + << "Fail to write parameter " << getName(); + } + + return true; +} + +/** + * Load parameter value from a file + */ +bool Parameter::load(const std::string& filename) { + std::ifstream fs(filename, std::ios_base::binary); + if (!fs) { + LOG(INFO) << "missing parameters [" << filename << "] while loading model."; + if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) { + LOG(FATAL) << getName() << " missing, not allowed."; + return false; + } + if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) { + LOG(INFO) << getName() << " missing, set to random."; + randomize(); + return true; + } + if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) { + LOG(INFO) << getName() << " missing, set to zero."; + zeroMem(); + return true; + } + LOG(FATAL) << "unsupported load_missing_parameter_strategy: " + << FLAGS_load_missing_parameter_strategy; + return false; + } + return load(fs); +} + +bool Parameter::load(std::istream& s) { + CpuVector vec(*bufs_[PARAMETER_VALUE].get()); + Header header; + CHECK(s.read(reinterpret_cast(&header), sizeof(header))) + << "Fail to read parameter " << getName(); + CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: " + << header.format; + headerFormat_ = header.format; + CHECK_EQ(header.size, getSize()) + << "The size (" << header.size << ") in the file does not match the size " + << "(" << getSize() << ") of the parameter: " << getName(); + CHECK_EQ(header.valueSize, sizeof(real)) + << "Unsupported valueSize " << header.valueSize << " at: " << getName(); + CHECK(s.read(reinterpret_cast(vec.getData()), + header.size * sizeof(real))); + + auto& tmp = *bufs_[PARAMETER_VALUE].get(); + if (typeid(tmp) == typeid(GpuVector)) { + bufs_[PARAMETER_VALUE]->copyFrom(vec); + } + + if (config_.is_sparse() && config_.need_compact()) { + // load from dense parameter with 
many zero + CHECK_EQ(config_.dims_size(), 2); + auto height = config_.dims(0); + auto width = config_.dims(1); + auto mat = Matrix::create(vec.getData(), height, width); + CpuSparseMatrix sparseMat(height, + width, + 0, + FLOAT_VALUE, + format_, + /*trans*/ false); + sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT); + auto nnz = sparseMat.getElementCnt(); + size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz; + size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1; + + intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize); + intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize); + bufs_[PARAMETER_VALUE]->resize(nnz); // for setMat check + bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz); + config_.set_size(nnz); + LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width)) + << " name=" << config_.name(); + } else if (config_.is_sparse()) { + CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); + CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); + size_t rowSize, colSize; + CHECK_EQ(config_.dims_size(), 2); + if (format_ == SPARSE_CSR) { + rowSize = config_.dims(0) + 1; + colSize = config_.size(); + } else { + rowSize = config_.size(); + colSize = config_.dims(1) + 1; + } + CHECK( + s.read(reinterpret_cast(rows.getData()), rowSize * sizeof(int))); + CHECK( + s.read(reinterpret_cast(cols.getData()), colSize * sizeof(int))); + auto& paramRows = *intBufs_[PARAMETER_ROWS].get(); + if (typeid(paramRows) == typeid(GpuIVector)) { + intBufs_[PARAMETER_ROWS]->copyFrom(rows); + } + auto& paramCols = *intBufs_[PARAMETER_COLS].get(); + if (typeid(paramCols) == typeid(GpuIVector)) { + intBufs_[PARAMETER_COLS]->copyFrom(cols); + } + } + + setValueUpdated(); + + return true; +} + +} // namespace paddle diff --git a/paddle/legacy/parameter/Parameter.h b/paddle/legacy/parameter/Parameter.h new file mode 100644 index 0000000000000000000000000000000000000000..43b567dad045ad786b1b3f2d3614072f58310527 --- /dev/null +++ b/paddle/legacy/parameter/Parameter.h @@ -0,0 +1,380 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include +#include +#include + +#include "ParameterConfig.pb.h" +#include "TrainerConfig.pb.h" + +#include "ParameterUpdaterHook.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/ThreadLocal.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +typedef enum { + /// The paddle original basic format + PARAM_FORMAT_ORIGINAL = 0, + + /// See mkldnn_memory_format_t in + /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h + /// for a detailed description. + /// 2D weights tensor in the format (output channels, input channels). 
+ PARAM_FORMAT_MKLDNN_OI, + + /// The total format items numbers + PARAM_FORMAT_ITEMS, +} PARAM_FORMAT; + +class SparsePrefetchRowCpuMatrix; + +class Parameter; +typedef std::function UpdateCallback; +typedef std::function ParamInitCallback; + +class Parameter; +typedef std::shared_ptr ParameterPtr; + +class Parameter { + public: + Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true); + const std::string& getName() const { return config_.name(); } + + size_t getSize() const { return config_.size(); } + + bool isFullSize() const { + if (bufs_[PARAMETER_VALUE]) { + return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); + } + return false; + } + + inline bool useGpu() const { return useGpu_; } + + int getDeviceId() const { return deviceId_; } + + void setDevice(int deviceId) { deviceId_ = deviceId; } + + /// The id ranges from 0 to the_total_number_of_parameters - 1 + size_t getID() const { return config_.para_id(); } + + /// ID is a implict value created until neural network is built. + void setID(size_t id) { config_.set_para_id(id); } + + bool isStatic() const { return config_.is_static(); } + + enum MatType { + MAT_NORMAL, + /// both value and grad are shared + MAT_NORMAL_SHARED, + + /// Now used in BatchNorm in CPU mode + MAT_VALUE_SHARED, + + /// sparse matrix, which has full size parameter + MAT_SPARSE_ROW_IDS, + /// sparse matrix, parameter size scale by sparse rates. + MAT_SPARSE_ROW_AUTO_GROW, + MAT_CACHE_ROW, + MAT_SPARSE_ROW, + + /// sparse matrix for prefetching parameter from pserver + MAT_SPARSE_ROW_PREFETCH, + /// same as above, but parameter has full size for saving parameter in local + MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, + }; + + void enableSparseParameter() { + if (config_.is_sparse()) { + if (config_.format() == "csr") { + size_t height = config_.dims(0); + size_t nnz = config_.size(); + enableIntType(PARAMETER_ROWS, height + 1); + enableIntType(PARAMETER_COLS, nnz); + format_ = SPARSE_CSR; + } else { + size_t width = config_.dims(1); + size_t nnz = config_.size(); + enableIntType(PARAMETER_COLS, width + 1); + enableIntType(PARAMETER_ROWS, nnz); + format_ = SPARSE_CSC; + } + } + } + + /// allocate buffer for the give type + void enableType(ParameterType type, MatType matType = MAT_NORMAL) { + if (bufs_[type] || mats_[type]) { + return; + } + SetDevice device(deviceId_); + if (config_.dims_size() == 2) { + if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED || + matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || + matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) { + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } else { + CHECK(isGradSparseUpdate()); + } + if (config_.is_sparse() && type == PARAMETER_VALUE) { + enableSparseParameter(); + } + setMat(type, matType); + } else { + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } + } + + void enableBufType(ParameterType type) { + if (bufs_[type]) return; + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } + + void enableIntType(ParameterType type, size_t intStoreSize = 0) { + if (!intBufs_[type]) { + SetDevice device(deviceId_); + size_t size = intStoreSize ? 
intStoreSize : config_.size(); + intBufs_[type] = IVector::create(size, useGpu_); + intBufs_[type]->zeroMem(); + } + } + + void enableSharedType(ParameterType type, + VectorPtr vec, + MatrixPtr mat = nullptr) { + if (!bufs_[type] && !mats_[type]) { + bufs_[type] = vec; + mats_[type] = mat; + } + } + + /// for batchGradientMachine: blockNum is number of partitions of the matrix. + bool isGradShared(size_t* blockNum = NULL); + + bool isValueShared(); + + // for AsgdSparseGradientMachine & SgdSparseGradientMachine: + // and MultiGradientMachine + bool isGradSparseUpdate() const; + + bool isSparseRemoteUpdate() const { + return config_.sparse_remote_update() && !useGpu(); + } + + const ParameterConfig& getConfig() const { return config_; } + + ParameterConfig& getConfig() { return config_; } + + bool hasType(ParameterType pType) const { + return bufs_[pType] || mats_[pType]; + } + + const VectorPtr& getBuf(ParameterType pType) const { + return this->bufs_[pType]; + } + + const VectorPtr* getBufs() const { return bufs_; } + + const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; } + + void setValueUpdated() { updated_ = true; } + + void clearValueUpdated() { updated_ = false; } + + bool isValueUpdated() const { return updated_; } + + /** + * Save parameter value to a file + */ + bool save(const std::string& filename) const; + + /** + * Save parameter to ostream + */ + bool save(std::ostream& s) const; + + /** + * Load parameter value from a file + */ + bool load(const std::string& filename); + + /** + * Load parameter from istream + */ + bool load(std::istream& is); + + void incShared() { sharedCount_++; } + + /** + * After one of the parameter's gradient is merged + * You should call this function to do some additional processing, + */ + void incUpdate(const UpdateCallback& callbacks = NULL); + + void clearGradient() { + auto& mat = getMat(PARAMETER_GRADIENT); + if (mat) { + // zeroMem will also clear rows for SparseRowCpuMatrix + mat->zeroMem(); + } else { + auto& gradBuf = getBuf(PARAMETER_GRADIENT); + if (gradBuf) gradBuf->zeroMem(); + } + } + + void initialize(); + + /** + * Initialize the value according to config_: initial_mean, + * initial_std and initial_strategy. + */ + void randomize(); + static void randomize(const VectorPtr& value, const ParameterConfig& config); + + /// Initialize the value to 0 + void zeroMem(); + + /// file header structure + struct Header { + int32_t format; // = PARAM_FORMAT + uint32_t valueSize; // = sizeof(real) + uint64_t size; // = getSize() + }; + + /** + * @brief Is the header format supported. + */ + static bool isHeaderFormatSupported(int32_t fmt) { + return fmt < PARAM_FORMAT_ITEMS; + } + + /** + * @brief Get the format in header. + */ + int getHeaderFormat() { return headerFormat_; } + + /** + * @brief Set the format in header. + */ + void setHeaderFormat(int32_t fmt) { + CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: " + << fmt; + headerFormat_ = fmt; + } + + /** + * @brief Parameter Update Hook. + * + * The parameter's update hook before ParameterUpdater::updateImpl + * It could modify gradient/momentum/etc here. Such as drop some gradient, + * etc. + */ + void updateHook() { + for (auto& hook : updaterHooks_) { + hook->update(this); + } + } + + /** + * @brief Initialize all updater hook. + * + * This method should be invoked in ParameterUpdater::init() only. + */ + void initHook() { + for (auto& hook : updaterHooks_) { + hook->init(this); + } + } + + protected: + /** + * @brief create matrix to matType. 
+ * + * used by gradient machine which needs specify matrix type, + * instead of creating in weights.cpp. + * + * @note pType should be enabled already. + */ + void setMat(ParameterType pType, int matType); + + bool isUpdatable() { return (updateCounter_ == sharedCount_); } + + void clearUpdate() { updateCounter_ = 0; } + + protected: + ParameterConfig config_; + + bool useGpu_; + + int deviceId_; + + /** + * @brief bufs_ stores parameter value and gradient. + * + * Layer should use bufs_[PARAMETER_VALUE] to form weight matrix for + * calculation and stores gradient to bufs_[PARAMETER_GRADIENT]. + */ + VectorPtr bufs_[NUM_PARAMETER_TYPES]; + + /** + * @brief Weight matrix for bufs_. + * + * It's helpfull when parameter shared by multi-layers. + * Caller should check, if mats exist, do not create it again. + */ + MatrixPtr mats_[NUM_PARAMETER_TYPES]; + + /// Int vectors, used in some User defined parameter types + IVectorPtr intBufs_[NUM_PARAMETER_TYPES]; + + int sharedCount_; + int updateCounter_; + + bool updated_; + SparseFormat format_; + + /// The header format for saving or loading param + int32_t headerFormat_; + + std::vector> updaterHooks_; + + public: + void setSharedCount(int cnt) { sharedCount_ = cnt; } + int getSharedCount() { return sharedCount_; } + + bool isSparse() { return config_.is_sparse(); } + SparseFormat getFormat() { return format_; } + + static const std::string kMissParameterFail; + static const std::string kMissParameterRand; + static const std::string kMissParameterZero; +}; + +typedef std::map ParameterMap; + +} // namespace paddle diff --git a/paddle/legacy/parameter/ParameterOptimizer.cpp b/paddle/legacy/parameter/ParameterOptimizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b9dffa5afb4c99314869c7ed547ea9711d718b6e --- /dev/null +++ b/paddle/legacy/parameter/ParameterOptimizer.cpp @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
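The on-disk layout used by Parameter::save() and Parameter::load() above is simply the Header struct from Parameter.h followed by the raw value buffer (plus row/col index vectors for sparse parameters). A minimal reader sketch for the dense case; it assumes a float build (real == float) and identical struct padding, and the file name is made up:

#include <cstdint>
#include <fstream>
#include <iostream>
#include <vector>

struct ParamHeader {
  int32_t format;      // PARAM_FORMAT_ORIGINAL == 0 for plain weights
  uint32_t valueSize;  // sizeof(real); 4 in a float build
  uint64_t size;       // number of elements
};

int main() {
  std::ifstream fs("fc_layer.w0", std::ios_base::binary);  // hypothetical file
  if (!fs) return 1;

  ParamHeader header;
  fs.read(reinterpret_cast<char*>(&header), sizeof(header));
  if (!fs || header.valueSize != sizeof(float)) return 1;

  std::vector<float> values(header.size);
  fs.read(reinterpret_cast<char*>(values.data()),
          header.size * header.valueSize);

  std::cout << "format=" << header.format << " elements=" << header.size
            << std::endl;
  return 0;
}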
*/ + +#include "paddle/legacy/utils/Logging.h" + +#include + +#include "AverageOptimizer.h" +#include "FirstOrderOptimizer.h" +#include "OptimizerFunctions.h" +#include "OptimizerWithRegularizer.h" +#include "ParameterOptimizer.h" +#include "hl_gpu.h" + +namespace paddle { + +ParameterOptimizer* ParameterOptimizer::create( + const OptimizationConfig& optConfig, bool inPserver) { + if (inPserver && optConfig.num_batches_per_send_parameter() > 1) { + return new AddOptimizer(optConfig); + } + if (optConfig.learning_method() == "momentum") { + return new SgdOptimizer(optConfig); + } + if (optConfig.learning_method() == "torch_momentum") { + return new SgdOptimizer(optConfig); + } + if (optConfig.learning_method() == "adagrad") { + return new AdagradParameterOptimizer(optConfig); + } + if (optConfig.learning_method() == "adadelta") { + return new AdaDeltaParameterOptimizer(optConfig); + } + if (optConfig.learning_method() == "rmsprop") { + return new RMSPropParameterOptimizer(optConfig); + } + if (optConfig.learning_method() == "decayed_adagrad") { + return new DecayedAdagradParameterOptimizer(optConfig); + } + if (optConfig.learning_method() == "adam") { + return new AdamParameterOptimizer(optConfig); + } + if (optConfig.learning_method() == "adamax") { + return new AdamaxParameterOptimizer(optConfig); + } + if (optConfig.learning_method() == "sparse_momentum") { + return new SparseMomentumParameterOptimizer(optConfig); + } + return nullptr; +} + +} // namespace paddle diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/legacy/parameter/ParameterOptimizer.h similarity index 100% rename from paddle/parameter/ParameterOptimizer.h rename to paddle/legacy/parameter/ParameterOptimizer.h diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.cpp b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp new file mode 100644 index 0000000000000000000000000000000000000000..72c9841acf6d3eb1d28d631e1599a1a403175013 --- /dev/null +++ b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp @@ -0,0 +1,300 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/utils/Logging.h" +#ifdef __AVX__ +#include +#include +#endif + +#include "ParameterUpdateFunctions.h" + +namespace paddle { + +void sgdUpdateCpu(real learningRate, + real momentum, + real decayRate, + size_t size, + real* value, + const real* grad, + real* momentumVec) { + decayRate *= learningRate; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < size; ++i) { + momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] - + decayRate * value[i]; + value[i] += momentumVec[i]; + } +} + +void sgdUpdate(real learningRate, + real momentum, + real decayRate, + Vector* value, + Vector* grad, + Vector* momentumVec) { + size_t size = value->getSize(); + real* val = value->getData(); + real* grd = grad->getData(); + real* mom = momentumVec->getData(); + if (typeid(*value) == typeid(CpuVector)) { + sgdUpdateCpu(learningRate, momentum, decayRate, size, val, grd, mom); + } else if (typeid(*value) == typeid(GpuVector)) { + value->sgdUpdate(*grad, *momentumVec, learningRate, momentum, decayRate); + } else { + LOG(FATAL) << "Wrong"; + } +} + +void sgdUpdateAvx(float learningRate, + float momentum, + float decayRate, + size_t size, + float* value, + const float* _grad, + float* momentumVec) { +#ifdef __AVX__ + float* grad = const_cast(_grad); // the gradient is not modified + // but when invoke simd functions + // need non-const pointer. + size_t gradientAlign = 0; + size_t gradientAlignHeader = (size_t)grad % sizeof(__m256); + CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256)) + << "Gradent buffer didn't align with momentum buffer"; + CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256)) + << "Gradent buffer didn't align with value buffer"; + if (0 != gradientAlignHeader) { + gradientAlignHeader = sizeof(__m256) - gradientAlignHeader; + gradientAlign = gradientAlignHeader / sizeof(real); + + // handle the unalign buffer + for (size_t i = 0; i < gradientAlign; i++) { + momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) - + (decayRate * learningRate * value[i]); + value[i] += momentumVec[i]; + } + grad += gradientAlign; + momentumVec += gradientAlign; + value += gradientAlign; + } + + constexpr size_t kParallelNum = 8; + constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum; + size_t cntLoop = (size - gradientAlign) / nStepSize; + size_t cntRem = (size - gradientAlign) % nStepSize; + __m256 gradientTmp[kParallelNum]; + __m256 valueTmp[kParallelNum]; + __m256 lr, mom, dr; + std::function loopFun; + + learningRate *= -1; + lr = _mm256_set_ps(learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate); + + if (0 != momentum) { + mom = _mm256_set_ps(momentum, + momentum, + momentum, + momentum, + momentum, + momentum, + momentum, + momentum); + } + + decayRate *= learningRate; + if (0 != decayRate) { + dr = _mm256_set_ps(decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate); + } + + auto gradMulFun = [&](void) { + gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr); + gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr); + gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr); + gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr); + gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr); + gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr); + 
gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr); + gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr); + }; + + auto valueMulFun = [&](void) { + valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr); + valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr); + valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr); + valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr); + valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr); + valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr); + valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr); + valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr); + }; + + auto momentumMulFun = [&](void) { + *reinterpret_cast<__m256*>(momentumVec) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom); + *reinterpret_cast<__m256*>(momentumVec + 8) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom); + *reinterpret_cast<__m256*>(momentumVec + 16) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom); + *reinterpret_cast<__m256*>(momentumVec + 24) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom); + *reinterpret_cast<__m256*>(momentumVec + 32) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom); + *reinterpret_cast<__m256*>(momentumVec + 40) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom); + *reinterpret_cast<__m256*>(momentumVec + 48) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom); + *reinterpret_cast<__m256*>(momentumVec + 56) = + _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom); + }; + + auto momentumAddGradFun = [&](void) { + *reinterpret_cast<__m256*>(momentumVec) = + _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]); + *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]); + *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]); + *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]); + *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]); + *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]); + *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]); + *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]); + }; + + auto momentumZeroFun = [&](void) { + *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0]; + *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1]; + *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2]; + *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3]; + *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4]; + *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5]; + *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6]; + *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7]; + }; + + auto momentumAddValueFun = [&](void) { + *reinterpret_cast<__m256*>(momentumVec) = + _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]); + 
*reinterpret_cast<__m256*>(momentumVec + 8) = + _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]); + *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]); + *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]); + *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]); + *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]); + *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]); + *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps( + *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]); + }; + + auto valueAddMomentumFun = [&](void) { + *reinterpret_cast<__m256*>(value) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value), + *reinterpret_cast<__m256*>(momentumVec)); + *reinterpret_cast<__m256*>(value + 8) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8), + *reinterpret_cast<__m256*>(momentumVec + 8)); + *reinterpret_cast<__m256*>(value + 16) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16), + *reinterpret_cast<__m256*>(momentumVec + 16)); + *reinterpret_cast<__m256*>(value + 24) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24), + *reinterpret_cast<__m256*>(momentumVec + 24)); + *reinterpret_cast<__m256*>(value + 32) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32), + *reinterpret_cast<__m256*>(momentumVec + 32)); + *reinterpret_cast<__m256*>(value + 40) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40), + *reinterpret_cast<__m256*>(momentumVec + 40)); + *reinterpret_cast<__m256*>(value + 48) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48), + *reinterpret_cast<__m256*>(momentumVec + 48)); + *reinterpret_cast<__m256*>(value + 56) = + _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56), + *reinterpret_cast<__m256*>(momentumVec + 56)); + }; + + if (0 == decayRate && 0 == momentum) { + loopFun = [&](void) { + gradMulFun(); + momentumZeroFun(); + valueAddMomentumFun(); + }; + } else if (0 == decayRate && 0 != momentum) { + loopFun = [&](void) { + gradMulFun(); + momentumMulFun(); + momentumAddGradFun(); + valueAddMomentumFun(); + }; + } else if (0 != decayRate && 0 == momentum) { + loopFun = [&](void) { + gradMulFun(); + valueMulFun(); + momentumZeroFun(); + momentumAddValueFun(); + valueAddMomentumFun(); + }; + } else if (0 != decayRate && 0 != momentum) { + loopFun = [&](void) { + gradMulFun(); + valueMulFun(); + momentumMulFun(); + momentumAddGradFun(); + momentumAddValueFun(); + valueAddMomentumFun(); + }; + } + + for (size_t i = 0; i < cntLoop; i++) { + loopFun(); + grad += nStepSize; + momentumVec += nStepSize; + value += nStepSize; + } + + for (size_t i = 0; i < cntRem; i++) { + momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) + + (decayRate * value[i]); + value[i] += momentumVec[i]; + } +#endif +} + +} // namespace paddle diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.h b/paddle/legacy/parameter/ParameterUpdateFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..a7cc1c4c47b6c8723520221cb0efc2afb53a900c --- /dev/null +++ b/paddle/legacy/parameter/ParameterUpdateFunctions.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Common.h" + +namespace paddle { + +/** + * Performs the following operations. + * + * momentumVec = momentum * momentumVec + * - learningRate * grad + * - learningRate * decayRate * value + * + * value = value + momentumVec + * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary + * computation. + */ +void sgdUpdate(real learningRate, + real momentum, + real decayRate, + Vector* value, + Vector* grad, + Vector* momentumVec); + +void sgdUpdateCpu(real learningRate, + real momentum, + real decayRate, + size_t size, + real* value, + const real* grad, + real* momentumVec); + +void sgdUpdateAvx(float learningRate, + float momentum, + float decayRate, + size_t size, + float* value, + const float* grad, + float* momentumVec); + +} // namespace paddle diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.cpp b/paddle/legacy/parameter/ParameterUpdaterBase.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d9d3fad63160b76d6de0932f39596a8643d0a8e --- /dev/null +++ b/paddle/legacy/parameter/ParameterUpdaterBase.cpp @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ParameterUpdaterBase.h" +#include +#include "hl_gpu.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +void ParameterUpdater::init(const std::vector& parameters) { + parameters_ = parameters; + for (ParameterType type : getParameterTypes()) { + for (auto& para : parameters) { + para->enableType(type); + } + } + for (size_t pid = 0; pid < parameters_.size(); ++pid) { + nonStaticParaIDMap_.insert( + std::pair(parameters_[pid]->getID(), pid)); + } + + for (auto& para : parameters) { + if (!para->isStatic()) { + para->initHook(); + } + } +} + +} // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/legacy/parameter/ParameterUpdaterBase.h similarity index 100% rename from paddle/parameter/ParameterUpdaterBase.h rename to paddle/legacy/parameter/ParameterUpdaterBase.h diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.cpp b/paddle/legacy/parameter/ParameterUpdaterHook.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bfb9769fb67fc71b6f96f09d44b2c108745eafa3 --- /dev/null +++ b/paddle/legacy/parameter/ParameterUpdaterHook.cpp @@ -0,0 +1,155 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
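The update documented for sgdUpdate() above is easy to sanity-check on a single element; this scalar sketch simply mirrors the sgdUpdateCpu loop body with made-up numbers:

#include <cassert>
#include <cmath>

int main() {
  // momentumVec = momentum * momentumVec - lr * grad - lr * decayRate * value
  // value      += momentumVec
  float lr = 0.1f, momentum = 0.9f, decayRate = 0.01f;
  float value = 1.0f, grad = 0.5f, momentumVec = 0.2f;

  momentumVec = momentum * momentumVec - lr * grad - lr * decayRate * value;
  value += momentumVec;

  // 0.9 * 0.2 - 0.1 * 0.5 - 0.1 * 0.01 * 1.0 = 0.129, so value becomes 1.129.
  assert(std::fabs(momentumVec - 0.129f) < 1e-5f);
  assert(std::fabs(value - 1.129f) < 1e-5f);
  return 0;
}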
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ParameterUpdaterHook.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +/** + * The static pruning hook + * Static means user specify a sparsity_ratio before training started, and the + * network will prune the parameters based on the sparsity_ratio. More details + * can be found https://arxiv.org/pdf/1506.02626.pdf. + */ + +class StaticPruningHook : public IParameterUpdaterHook { + public: + explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig) + : initCount_(0) { + sparsityRatio_ = hookConfig.sparsity_ratio(); + } + + static bool sortPairAscend(const std::pair &pair1, + const std::pair &pair2) { + return pair1.first > pair2.first; + } + + void update(Parameter *para) { + updateThreadChecker_.check(); + auto &vec = para->getBuf(PARAMETER_GRADIENT); + if (vec) { + vec->dotMul(*maskVec_); + } + } + + void generateMask(Parameter *para) { + VectorPtr maskTemp = Vector::create(para->getSize(), false); + maskTemp->zeroMem(); + real *maskTempData = maskTemp->getData(); + size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_); + + VectorPtr paraVec = para->getBuf(PARAMETER_VALUE); + VectorPtr paraCpuCopy = Vector::create(para->getSize(), false); + + paraCpuCopy->copyFrom(*paraVec); + std::vector> param; + + for (size_t i = 0; i < para->getSize(); i++) + param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i)); + + std::partial_sort( + param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); + for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0; + + // Currently just use a mask vector for hack. + if (para->useGpu()) { + maskVec_ = Vector::create(para->getSize(), para->useGpu()); + maskVec_->copyFrom(*maskTemp); + } else { + maskVec_ = maskTemp; + } + } + + void init(Parameter *para) { + generateMask(para); + size_t initCount = this->initCount_.fetch_add(1); + CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke " + "in same ParamterUpdater"; + VLOG(3) << "Initialize Parameter " << para; + SetDevice device(para->getDeviceId()); + + auto ¶Vec = para->getBuf(PARAMETER_VALUE); + paraVec->dotMul(*maskVec_); + } + + private: + SameThreadChecker updateThreadChecker_; + std::atomic initCount_; + VectorPtr maskVec_; + real sparsityRatio_; +}; + +IParameterUpdaterHook::IParameterUpdaterHook() {} + +IParameterUpdaterHook::~IParameterUpdaterHook() {} + +/** + * A Hasher used by g_hooks. + * + * Use the independent hasher intendedly. There is a hasher in PServer for hash + * ParameterBlock. But not to use same hasher to reduce dependency. + * + * May be extracted to Util.h to unify the hasher. 
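To make the StaticPruningHook behaviour above concrete: generateMask() keeps the largest (1 - sparsity_ratio) fraction of weights by absolute value and zeroes the rest, and the hook then dot-multiplies that mask into both the value and the gradient so pruned weights stay at zero during training. A standalone sketch on plain std::vector<float> (the helper name is invented):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

std::vector<float> staticPruneMask(const std::vector<float>& weights,
                                   float sparsityRatio) {
  size_t nonZeroNum =
      static_cast<size_t>(weights.size() * (1.0f - sparsityRatio));
  std::vector<std::pair<float, size_t>> byMagnitude;
  byMagnitude.reserve(weights.size());
  for (size_t i = 0; i < weights.size(); ++i)
    byMagnitude.emplace_back(std::fabs(weights[i]), i);

  // Largest magnitudes first, as in sortPairAscend above (which compares
  // with operator>).
  std::partial_sort(byMagnitude.begin(), byMagnitude.begin() + nonZeroNum,
                    byMagnitude.end(),
                    [](const std::pair<float, size_t>& a,
                       const std::pair<float, size_t>& b) {
                      return a.first > b.first;
                    });

  std::vector<float> mask(weights.size(), 0.0f);
  for (size_t i = 0; i < nonZeroNum; ++i) mask[byMagnitude[i].second] = 1.0f;
  return mask;
}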
+ */ +class StringIntPairHasher { + public: + size_t operator()(const std::pair &k) const { + return intHasher_(strHasher_(k.first) + k.second); + } + + private: + std::hash strHasher_; + std::hash intHasher_; +}; + +static WeakKVCache, + IParameterUpdaterHook, + StringIntPairHasher> + g_hookCache_; + +/** + * ParameterUpdaterHook actually factory method. + */ +static IParameterUpdaterHook *createImpl( + const ParameterUpdaterHookConfig &config) { + auto &type = config.type(); + if (type == "pruning") { + return new StaticPruningHook(config); + } + + LOG(FATAL) << "Unknown Hook type: " << type; + return nullptr; +} + +std::shared_ptr IParameterUpdaterHook::create( + const ParameterConfig ¶mConfig, int idx) { + std::pair key = {paramConfig.name(), idx}; + return g_hookCache_.get( + key, [&] { return createImpl(paramConfig.update_hooks(idx)); }); +} + +} // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/legacy/parameter/ParameterUpdaterHook.h similarity index 100% rename from paddle/parameter/ParameterUpdaterHook.h rename to paddle/legacy/parameter/ParameterUpdaterHook.h diff --git a/paddle/legacy/parameter/Regularizer.cpp b/paddle/legacy/parameter/Regularizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c1d5f4fa68403408bb44341e1e28f2ce3beb2e4c --- /dev/null +++ b/paddle/legacy/parameter/Regularizer.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Regularizer.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +Regularizer* Regularizer::get(const std::vector& types, + const ParameterConfig& paraConfig) { + bool useLearningRateVec = + std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) != + types.end(); + if (paraConfig.decay_rate_l1() > 0.0f && + paraConfig.decay_rate() > 0.0f) { // use L1 and L2 + if (useLearningRateVec) { + static L1L2LrRegularizer regularizer_; + return ®ularizer_; + } + static L1L2Regularizer regularizer_; + return ®ularizer_; + } + if (paraConfig.decay_rate_l1() > 0.0f) { // use L1 only + if (useLearningRateVec) { + static L1LrRegularizer regularizer_; + return ®ularizer_; + } + static L1Regularizer regularizer_; + return ®ularizer_; + } + if (paraConfig.decay_rate() > 0.0f) { // use L2 only + if (useLearningRateVec) { + static L2LrRegularizer regularizer_; + return ®ularizer_; + } + static L2Regularizer regularizer_; + return ®ularizer_; + } + return nullptr; +} + +} // namespace paddle diff --git a/paddle/parameter/Regularizer.h b/paddle/legacy/parameter/Regularizer.h similarity index 100% rename from paddle/parameter/Regularizer.h rename to paddle/legacy/parameter/Regularizer.h diff --git a/paddle/parameter/ThreadLocalBuffer.cpp b/paddle/legacy/parameter/ThreadLocalBuffer.cpp similarity index 100% rename from paddle/parameter/ThreadLocalBuffer.cpp rename to paddle/legacy/parameter/ThreadLocalBuffer.cpp diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.h b/paddle/legacy/parameter/ThreadLocalBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..d360feeed6c98ee60e3bdae924434054080576b0 --- /dev/null +++ b/paddle/legacy/parameter/ThreadLocalBuffer.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/legacy/math/Vector.h" + +namespace paddle { +namespace parameter { +extern VectorPtr* getThreadLocalBuffer(); +} // namespace parameter +} // namespace paddle diff --git a/paddle/legacy/parameter/Weight.cpp b/paddle/legacy/parameter/Weight.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d94050a5cd8c3570c286e8e82c2a1470c40e6db --- /dev/null +++ b/paddle/legacy/parameter/Weight.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
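// The regularizer classes returned by Regularizer::get() above are declared
// in Regularizer.h (only renamed in this diff, body not shown). Purely as an
// illustrative sketch, conventional per-element L1/L2 weight decay of the
// kind such regularizers apply looks like the following; this is the
// textbook form, not necessarily the exact Paddle update.
#include <cstddef>

inline void applyDecaySketch(float learningRate, float decayRateL1,
                             float decayRateL2, std::size_t size,
                             float* value) {
  for (std::size_t i = 0; i < size; ++i) {
    float w = value[i];
    // L2 decay shrinks the weight toward zero proportionally to its value;
    // L1 decay subtracts a fixed step in the direction of sign(w).
    float sign = (w > 0.0f) ? 1.0f : ((w < 0.0f) ? -1.0f : 0.0f);
    value[i] = w - learningRate * (decayRateL2 * w + decayRateL1 * sign);
  }
}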
*/ + +#include "Weight.h" +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +Weight::Weight(size_t height, size_t width, ParameterPtr param) { + VectorPtr vPtr = param->getBuf(PARAMETER_VALUE); + VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT); + + // create a new weight + if (param->isSparse()) { + CHECK_LE(param->getSize(), width * height); + } else { + CHECK_EQ(param->getSize(), width * height); + } + + // weight_ + weight_ = param->getMat(PARAMETER_VALUE); + if (!weight_ && vPtr) { + weight_ = Matrix::create(vPtr->getMemoryHandle(), height, width); + } + if (weight_) { + CHECK_EQ(height, weight_->getHeight()); + CHECK_EQ(width, weight_->getWidth()); + } + + // weightGrad + weightGrad_ = param->getMat(PARAMETER_GRADIENT); + if (!weightGrad_ && gPtr) { + weightGrad_ = Matrix::create(gPtr->getMemoryHandle(), height, width); + } + if (weightGrad_) { + CHECK_EQ(height, weightGrad_->getHeight()); + CHECK_EQ(width, weightGrad_->getWidth()); + } + + parameter_ = param; +} + +Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) { + VectorPtr vPtr = param->getBuf(PARAMETER_VALUE); + VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT); + + // create a new weight + CHECK_LE(offset + width * height, param->getSize()); + + // weight_ + if (vPtr) { + weight_ = Matrix::create(vPtr->getData() + offset, + height, + width, + /* trans */ false, + param->useGpu()); + } + + // weightGrad + if (gPtr) { + weightGrad_ = Matrix::create(gPtr->getData() + offset, + height, + width, + /* trans */ false, + param->useGpu()); + } + + parameter_ = param; +} + +const ParameterPtr& Weight::getParameterPtr() { return parameter_; } +void Weight::setParameterPtr(ParameterPtr param) { parameter_ = param; } +} // namespace paddle diff --git a/paddle/legacy/parameter/Weight.h b/paddle/legacy/parameter/Weight.h new file mode 100644 index 0000000000000000000000000000000000000000..241c8d829cd0c7b57964324d3378bdfcf09e6a70 --- /dev/null +++ b/paddle/legacy/parameter/Weight.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
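// A small illustration of the offset-based Weight constructor above: the
// weight is simply a height x width, row-major view over a slice of the
// parameter's flat value buffer starting at `offset`. Names here are
// illustrative; the real code wraps getData() + offset in a Matrix rather
// than indexing by hand.
#include <cassert>
#include <cstddef>

inline float weightAtSketch(const float* paramData, std::size_t paramSize,
                            std::size_t offset, std::size_t height,
                            std::size_t width, std::size_t r, std::size_t c) {
  // Mirrors CHECK_LE(offset + width * height, param->getSize()).
  assert(offset + height * width <= paramSize);
  assert(r < height && c < width);
  return paramData[offset + r * width + c];
}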
*/ + +#pragma once +#include +#include + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/legacy/parameter/Parameter.h" + +namespace paddle { + +class Weight { + private: + MatrixPtr weight_; + MatrixPtr weightGrad_; + ParameterPtr parameter_; + + public: + Weight(size_t height, size_t width, ParameterPtr parameter); + Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset); + + const MatrixPtr& getW() { return weight_; } + const MatrixPtr& getWGrad() { return weightGrad_; } + const ParameterPtr& getParameterPtr(); + + void incUpdate(const UpdateCallback& callback) { + getParameterPtr()->incUpdate(callback); + } + + void setParameterPtr(ParameterPtr param); +}; + +typedef std::vector> WeightList; + +} // namespace paddle diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/legacy/parameter/tests/CMakeLists.txt similarity index 100% rename from paddle/parameter/tests/CMakeLists.txt rename to paddle/legacy/parameter/tests/CMakeLists.txt diff --git a/paddle/legacy/parameter/tests/test_argument.cpp b/paddle/legacy/parameter/tests/test_argument.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c632e0cd10342431dfcada680a18d8f9eabeb9c --- /dev/null +++ b/paddle/legacy/parameter/tests/test_argument.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +using namespace paddle; // NOLINT + +TEST(Argument, poolSequenceWithStride) { + Argument input, output; + ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false); + int* inStart = input.sequenceStartPositions->getMutableData(false); + inStart[0] = 0; + inStart[1] = 9; + inStart[2] = 14; + inStart[3] = 17; + inStart[4] = 30; + + int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30}; + int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; + + for (auto reversed : {false, true}) { + ICpuGpuVectorPtr stridePositions; + output.poolSequenceWithStride( + input, 5 /* stride */, &stridePositions, reversed); + + const int* outStart = output.sequenceStartPositions->getData(false); + CHECK_EQ(outStart[0], 0); + CHECK_EQ(outStart[1], 2); + CHECK_EQ(outStart[2], 3); + CHECK_EQ(outStart[3], 4); + CHECK_EQ(outStart[4], 7); + + CHECK_EQ(stridePositions->getSize(), 8UL); + auto result = reversed ? strideResultReversed : strideResult; + for (int i = 0; i < 8; i++) { + CHECK_EQ(stridePositions->getData(false)[i], result[i]); + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/parameter/tests/test_common.cpp b/paddle/legacy/parameter/tests/test_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8de9d6da983553c0b9e574ac27ae8fca14bea5b7 --- /dev/null +++ b/paddle/legacy/parameter/tests/test_common.cpp @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
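// A standalone sketch of the arithmetic the test above exercises for
// Argument::poolSequenceWithStride (names illustrative). Each input
// sequence [begin, end) is split into ceil((end - begin) / stride) windows;
// with reversed == false the window starts run forward from `begin`
// (0, 5 for [0, 9) with stride 5), with reversed == true they are measured
// back from `end` and clipped at `begin` (0, 4 for [0, 9)). Applied to the
// input starts {0, 9, 14, 17, 30} this reproduces strideResult and
// strideResultReversed from the test.
#include <cstddef>
#include <vector>

inline std::vector<int> strideStartsSketch(const std::vector<int>& seqStarts,
                                           int stride, bool reversed) {
  std::vector<int> out;
  for (std::size_t s = 0; s + 1 < seqStarts.size(); ++s) {
    int begin = seqStarts[s];
    int end = seqStarts[s + 1];
    if (!reversed) {
      for (int pos = begin; pos < end; pos += stride) out.push_back(pos);
    } else {
      std::vector<int> tmp;
      for (int pos = end; pos > begin; pos -= stride) {
        tmp.push_back(pos - stride > begin ? pos - stride : begin);
      }
      out.insert(out.end(), tmp.rbegin(), tmp.rend());
    }
  }
  out.push_back(seqStarts.back());  // trailing terminator, e.g. 30
  return out;
}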
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include +#include +#include + +using namespace paddle; // NOLINT + +class CommonTest : public ::testing::Test { + protected: + CommonTest() : testStat_("test") {} + virtual ~CommonTest() {} + virtual void SetUp() { + const size_t buffSize[] = { + 100, 128, 500, 1024, 4096, 10240, 102400, 1000000}; + sizeVec_.resize(8); + memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t)); + valueUint_.resize(4); + valueUint_[0].first = 0.0; + valueUint_[0].second = 0.0; + valueUint_[1].first = 0.0; + valueUint_[1].second = 1.0; + valueUint_[2].first = 1.0; + valueUint_[2].second = 0.0; + valueUint_[3].first = 1.0; + valueUint_[3].second = 1.0; + learningRate_ = 1.0; + } + + void test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size); + + virtual void TreaDown() { LOG(INFO) << "All Test Finished."; } + + protected: + std::vector> valueUint_; + std::vector sizeVec_; + real learningRate_; + StatSet testStat_; +}; + +void CommonTest::test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size) { +// sgdUpdateAvx has no double version yet +#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE) + real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0; + real* gradTmp = new real[size]; + real* valueTmp = new real[size]; + real* momentumTmp = new real[size]; + memcpy(gradTmp, gradientBuffer, size * sizeof(real)); + memcpy(valueTmp, valueBuffer, size * sizeof(real)); + memcpy(momentumTmp, momentumBuffer, size * sizeof(real)); + for (auto& arg : valueUint_) { + { + { + struct timeval t; + REGISTER_TIMER("gettimeofday", 0, testStat_); + gettimeofday(&t, NULL); + } + REGISTER_TIMER("avxTimer", 0); + sgdUpdateAvx(learningRate_, + arg.first, + arg.second, + size, + valueBuffer, + gradientBuffer, + momentumBuffer); + } + for (size_t i = 0; i < size; i++) { + valueSum1 += valueBuffer[i]; + momSum1 += momentumBuffer[i]; + // std::cout << "[" + // << valueBuffer[i] + // << "," << momentumBuffer[i] + // << "," << gradientBuffer[i] << "],"; + } + { + REGISTER_TIMER("cpuTimer", 0); + sgdUpdateCpu(learningRate_, + arg.first, + arg.second, + size, + valueTmp, + gradTmp, + momentumTmp); + } + for (size_t i = 0; i < size; i++) { + valueSum2 += valueTmp[i]; + momSum2 += momentumTmp[i]; + // std::cout << "[" + // << valueTmp[i] + // << "," << momentumTmp[i] + // << "," << gradTmp[i] << "],"; + } + + VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2; + VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2; + ASSERT_EQ(valueSum1, valueSum2); + ASSERT_EQ(momSum1, momSum2); + } + delete[] gradTmp; + delete[] valueTmp; + delete[] momentumTmp; +#endif +} + +TEST_F(CommonTest, sgdUpdate) { + const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; + for (auto& size : sizeVec_) { + real *gradientBuffer, *valueBuffer, *momentumBuffer; + CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), + 0); + 
CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); + CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), + 0); + + for (size_t i = 0; i < size; i++) { + gradientBuffer[i] = 1.0; + valueBuffer[i] = 2.0; + momentumBuffer[i] = 3.0; + } + for (int i = 0; i < 6; i++) { + LOG(INFO) << "----------------------" << size << ":" << alignHeader[i] + << "-------------------------"; + test_sgdUpadate(&gradientBuffer[alignHeader[i]], + &valueBuffer[alignHeader[i]], + &momentumBuffer[alignHeader[i]], + size - alignHeader[i]); + } + free(gradientBuffer); + free(valueBuffer); + free(momentumBuffer); + } + globalStat.printAllStatus(); + testStat_.printAllStatus(); +} + +TEST_F(CommonTest, syncThreadPool) { + SyncThreadPool pool(10); + + std::vector nums; + nums.resize(10); + + pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; }); + for (size_t i = 0; i < nums.size(); ++i) { + EXPECT_EQ((int)i, nums[i]); + } + + pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; }); + for (size_t i = 0; i < nums.size(); ++i) { + EXPECT_EQ((int)0, nums[i]); + } +} diff --git a/paddle/legacy/pserver/BaseClient.cpp b/paddle/legacy/pserver/BaseClient.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13bb8a1cc58580a8e0af31c23b420836c7422ad8 --- /dev/null +++ b/paddle/legacy/pserver/BaseClient.cpp @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "BaseClient.h" +#include +#include +#include +#include "paddle/legacy/utils/Stat.h" + +DECLARE_string(pservers); + +namespace paddle { + +BaseClient::BaseClient(bool separate, int numPorts) + : stopping_(false), numPorts_(numPorts), separateSendAndRecv_(separate) { + CHECK_GT(numPorts, 0); +} + +BaseClient::~BaseClient() {} + +void BaseClient::recvData() { recvSyncBarrier_->wait(); } + +void BaseClient::synchronize(SyncObject syncObjectId) { + SynchronizeRequest request; + request.set_sync_object_id(syncObjectId); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void BaseClient::startThreads() { + if (!separateSendAndRecv_) { + return; + } + recvSyncBarrier_.reset(new ThreadBarrier(threadNum_ + 1)); + + sendThreads_.resize(threadNum_); + recvThreads_.resize(threadNum_); + sendJobQueue_.resize(threadNum_); + recvJobQueue_.resize(threadNum_); + + for (int i = 0; i < threadNum_; ++i) { + sendJobQueue_[i].reset(new SendQueue()); + recvJobQueue_[i].reset(new SendQueue()); + + sendThreads_[i].reset( + new std::thread([this](int id) { this->send(id); }, i)); + + recvThreads_[i].reset( + new std::thread([this](int id) { this->recv(id); }, i)); + } +} + +void BaseClient::finishThreads() { + if (!separateSendAndRecv_) { + return; + } + stopping_ = true; + for (int i = 0; i < threadNum_; i++) { + sendJobQueue_[i]->enqueue(nullptr); + } + for (auto& thread : sendThreads_) { + thread->join(); + } + for (auto& thread : recvThreads_) { + thread->join(); + } + stopping_ = false; +} +} // namespace paddle diff --git a/paddle/legacy/pserver/BaseClient.h b/paddle/legacy/pserver/BaseClient.h new file mode 100644 index 0000000000000000000000000000000000000000..66e8f39cd60998122bb8958b12b23ee7142be94d --- /dev/null +++ b/paddle/legacy/pserver/BaseClient.h @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ParameterService.pb.h" +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/pserver/ProtoServer.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Queue.h" + +namespace paddle { + +/** + * it manages all connections to pservers. + * it exists two modes to manage connections to all pservers. Firstly, one + * connection owns two threads that separately manage to send and receive + * data. Secondly, each thread uses one connection for all activation in it. + * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/ + * recvJobQueue_. the second solution use some shared thread pool to manage + * connections. 
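// The shutdown handshake used by startThreads()/finishThreads() above in
// miniature: each worker blocks on a job queue, and a null job acts as the
// stop sentinel after which the thread can be joined. This sketch uses only
// std:: primitives and illustrative names, not the Paddle Queue class.
#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>

struct JobSketch {
  int payload;
};

class SentinelQueueSketch {
 public:
  void enqueue(std::shared_ptr<JobSketch> job) {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push(std::move(job));
    }
    cond_.notify_one();
  }

  std::shared_ptr<JobSketch> dequeue() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return !queue_.empty(); });
    auto job = queue_.front();
    queue_.pop();
    return job;
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  std::queue<std::shared_ptr<JobSketch>> queue_;
};

// Worker loop: a nullptr job means "finish", mirroring the nullptr that
// finishThreads() pushes into every sendJobQueue_ before joining.
inline void workerLoopSketch(SentinelQueueSketch& jobs) {
  while (auto job = jobs.dequeue()) {
    (void)job->payload;  // handle one job
  }
}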
+ */ +class BaseClient { + protected: + typedef std::unique_ptr ThreadPtr; + typedef std::vector> InputIovs; + typedef std::vector SendRequest; + typedef std::vector SendDataRequestVec; + + // TODO(yanfei): + // refine data structure to unify parameter and features communication + struct SendJob { + /// store parameters related blocks data + InputIovs parallelInputIovs; + /// store protobuf request + SendRequest parallelRequests; + /// store data, such as features for metric learning + SendDataRequestVec parallelDataRequests; + }; + + public: + explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num); + + virtual ~BaseClient(); + + typedef std::shared_ptr SendJobPtr; + typedef Queue SendQueue; + + /// send data to server, support only synchronize + template + void putData(int clientId, + SendDataType type, + DataType* datas, + size_t size, + DataUpdateMode mode) { + synchronize(SYNC_DATA); + sendData(clientId, type, mode, datas, size); + recvData(); + synchronize(SYNC_DATA); + } + + template + void putOwnData(int clientId, + SendDataType type, + DataType* datas, + size_t size) { + putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN); + } + + template + void getAllData(int clientId, + SendDataType type, + DataType* datas, + size_t size) { + sendData(clientId, + type, + DATA_UPDATE_MODE_GET_ALL, + reinterpret_cast(NULL), + 0); + recvData(); + size_t dataOffset = 0; + for (auto& recvMem : recvDataMems_) { + CHECK_LE(dataOffset, size); + size_t memSize = std::min(recvMem.get()->getSize(), + sizeof(DataType) * (size - dataOffset)); + CHECK_EQ(memSize % sizeof(DataType), size_t(0)); + memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize); + dataOffset += memSize / sizeof(DataType); + } + CHECK_EQ(dataOffset, size); + } + + /** + * Reduces values on all clients. + * This reduce just support SUM. + * The results are saved in recvBuf of rootId client + */ + template + void reduce(DataType* sendBuf, + DataType* recvBuf, + size_t size, + int clientId, + int rootId) { + putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size); + if (rootId == clientId) { + getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size); + } + } + + /** + * return trans data type according to the input type + */ + virtual TransDataType getTransDtype(const std::type_info& info) { + TransDataType dataType; + if (typeid(int*) == info) { // NOLINT + dataType = TRANS_INT32; + } else if (typeid(uint32_t*) == info) { // NOLINT + dataType = TRANS_UINT32_T; + } else if (typeid(int64_t*) == info) { // NOLINT + dataType = TRANS_INT64_T; + } else if (typeid(uint64_t*) == info) { // NOLINT + dataType = TRANS_UINT64_T; + } else if (typeid(float*) == info) { // NOLINT + dataType = TRANS_FLOAT; + } else if (typeid(double*) == info) { // NOLINT + dataType = TRANS_DOUBLE; + } else { + LOG(FATAL) << "not supported"; + } + return dataType; + } + + protected: + /// for a > 0, b > 0: + /// return the smallest x s.t. 
b*x >= a + static int divup(int a, int b) { return (a + b - 1) / b; } + + int calcClientId(int i, int serviceNum) { + return (i + FLAGS_trainer_id * numPorts_) % serviceNum; + } + + /// start threads in sendThreads_ and recvThreads_ + void startThreads(); + + /// finish threads in sendThreads_ and recvThreads_ + void finishThreads(); + + template + void prepareData(int clientId, + SendDataType type, + DataUpdateMode updateMode, + DataType* datas, + size_t size, + SendJob* sendJob) { + sendJob->parallelDataRequests.resize(serviceNum_); + sendJob->parallelInputIovs.resize(serviceNum_); + for (int i = 0; i < serviceNum_; ++i) { + auto& request = sendJob->parallelDataRequests[i]; + request.set_update_mode(updateMode); + request.set_type(type); + request.set_client_id(clientId); + request.set_server_id(i); + } + + /// split datas which need send to Server into serviceNum_ pieces + if (!datas) { + CHECK(!size) << "ownSize should be zero since datas is nullptr"; + } + size_t baseSize = size / serviceNum_; + size_t dataOffset = 0; + for (int i = 0; i < serviceNum_; ++i) { + auto& request = sendJob->parallelDataRequests[i]; + DataBlock* block = request.add_blocks(); + size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize; + size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0; + block->set_total_size(realSize * sizeof(DataType)); + block->set_data_size(sizeof(DataType)); + // TODO(yuyang18): The getTransDtype can be rewritten as template method + // to reduce runtime overhead. + block->set_data_type(getTransDtype(typeid(DataType*))); // NOLINT + if (datas) { + sendJob->parallelInputIovs[i].push_back( + {datas + dataOffset, realSize * sizeof(DataType)}); + } + dataOffset += ownSize; + } + CHECK_EQ(dataOffset, size); + } + + /** + * @brief send data to all data servers + * + * @note each trainer sends all its data to all data servers + * it's for broadcast data synchronization, such as features + * synchronization in metric learning. + */ + template + void sendData(int clientId, + SendDataType type, + DataUpdateMode updateMode, + DataType* datas, + size_t size) { + SendJobPtr sendJob = std::make_shared(); + prepareData(clientId, type, updateMode, datas, size, sendJob.get()); + for (int i = 0; i < threadNum_; ++i) { + sendJobQueue_[i]->enqueue(sendJob); + } + } + + /** + * @brief recv data from all data servers + * + * @note synchronize all recv threads + */ + void recvData(); + + /// send request, and recv responses + template + void multiCall(const char* funcName, + const ProtoIn& request, + std::vector* responses) { + responses->resize(clients_.size()); + size_t numClients = clients_.size(); + for (size_t i = 0; i < numClients; ++i) { + clients_[i].send(funcName, request); + } + for (size_t i = 0; i < numClients; ++i) { + clients_[i].recv(&(*responses)[i]); + } + } + + /** + * @brief synchronize all trainers and pservers + * + * @note used to ensure that data of all trainers have been received + */ + void synchronize(SyncObject syncObjectId = SYNC_DEFAULT); + + /** + * @brief use multithread to separately send data + * + * @note each thread should read its own JobQueue to handle requests + * each thread should calcClientId() to retrieve connections + * managed by himself. + * send and recv are implemented in child class. 
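// The sharding arithmetic used by prepareData() above as a standalone
// sketch (illustrative names): `size` elements are split across
// `serviceNum` servers, with the first `size % serviceNum` servers taking
// one extra element so the shards sum back to `size`; divup() above is the
// matching ceiling division used when spreading clients over send/recv
// threads.
#include <cstddef>
#include <vector>

inline std::vector<std::size_t> shardSizesSketch(std::size_t size,
                                                 std::size_t serviceNum) {
  std::vector<std::size_t> ownSizes(serviceNum);
  const std::size_t baseSize = size / serviceNum;
  for (std::size_t i = 0; i < serviceNum; ++i) {
    ownSizes[i] = i < size % serviceNum ? baseSize + 1 : baseSize;
  }
  return ownSizes;  // e.g. size = 10, serviceNum = 4 -> {3, 3, 2, 2}
}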
+ */ + virtual void send(int threadId) = 0; + + /** + * @brief use multithread to separately receive data + * + * @note almost same as send() + */ + virtual void recv(int threadId) = 0; + + protected: + bool stopping_; + /// nodes * ports that means the number of real pservers + int serviceNum_; + /** + * threads num for managing all services. Normally the + * number of pservers are relatively less than several + * hundreds so that using thread-based parallelization + * can benifit traffic performance and pserver's sgd + * optimization performance. + */ + int threadNum_; + /// the connection manager at client end + std::vector clients_; + /// send threads for parallelization + std::vector sendThreads_; + /// recv threads for parallelization + std::vector recvThreads_; + std::unique_ptr recvSyncBarrier_; + + // TODO(yanfei): + // current pserver's will return value until all parameters' + // optimization are finished so that recv are not overlapped + // in reality. More robust implimentation should be to pipeline + // all send/recv action based on parameter unit level, and + // it will benifits deep and larger model training in future, + // especially local node compution power surpasses inter-connection + // such as GPU cluster, even with BOX GPU cluster. + // queue for buffering send request + /** + * send/recv queue cooperates with each other to accomplish + * overlapping communication with forwardBackward action. + */ + std::vector> sendJobQueue_; + /// queue for buffering recv request + std::vector> recvJobQueue_; + /// specific for dserver + SendJob sendJob_; + /// port num for each node + int numPorts_; + /// if set, overlapped optimization is disabled + bool separateSendAndRecv_; + std::vector recvDataMems_; +}; +} // namespace paddle diff --git a/paddle/pserver/CMakeLists.txt b/paddle/legacy/pserver/CMakeLists.txt similarity index 100% rename from paddle/pserver/CMakeLists.txt rename to paddle/legacy/pserver/CMakeLists.txt diff --git a/paddle/legacy/pserver/LightNetwork.cpp b/paddle/legacy/pserver/LightNetwork.cpp new file mode 100644 index 0000000000000000000000000000000000000000..469c95853ecdc02a6028417ca37b0020406eea09 --- /dev/null +++ b/paddle/legacy/pserver/LightNetwork.cpp @@ -0,0 +1,459 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "LightNetwork.h" +#include "RDMANetwork.h" +#include "paddle/legacy/utils/StringUtil.h" +#include "paddle/legacy/utils/Util.h" + +/// quick ack can reduce the latency of small message +DEFINE_bool(small_messages, + false, + "if message size is small, recommend set it True to enable quick " + "ack and no delay"); + +/// reasonable sock_send_buf_size can control the traffic injected into switch +/// network. Injecting too many data into traffic could cause packets loss which +/// cause long latency and degrade the efficiency of communication. 
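/// (Usage note, illustrative only: small_messages above and the socket
/// buffer / listen-queue sizes defined just below are ordinary gflags, so a
/// deployment can tune them on the command line, e.g.
///   --small_messages=true --sock_send_buf_size=4194304
/// without rebuilding; the particular values shown here are just examples.)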
+DEFINE_int32(sock_send_buf_size, + 1024 * 1024 * 40, + "restrict sock send buff size, can reduce network congestion if " + "set carefully"); + +/// reasonable size can hold bursted packets and reduce packets loss +DEFINE_int32(sock_recv_buf_size, + 1024 * 1024 * 40, + "restrict sock recv buff size"); + +/// reasonable sock_listen_queue_size can control maximum pending connections. +DEFINE_int32(sock_listen_queue_size, + 1024, + "listen queue size when pserver listen a TCP port"); + +namespace paddle { + +/** + * @brief get ip address from interface name + * + * @param[in] device device interface name + */ +std::string getIpAddr(std::string &device) { + int sock; + struct sockaddr_in sin; + struct ifreq ifr; + + sock = socket(AF_INET, SOCK_DGRAM, 0); + CHECK(sock >= 0) << "Create socket error."; + + strncpy(ifr.ifr_name, device.c_str(), IFNAMSIZ); + ifr.ifr_name[IFNAMSIZ - 1] = 0; + + CHECK_GE(ioctl(sock, SIOCGIFADDR, &ifr), 0); + memcpy(&sin, &ifr.ifr_addr, sizeof(sin)); + close(sock); + return std::string(inet_ntoa(sin.sin_addr)); +} + +/** + * @brief set sock option + * + * @param[in] sockfd sock file descriptor + * + * @note adjust some default sock option for better performance + */ +void setOption(int sockfd) { +#if !defined(__APPLE__) && !defined(__OSX__) + int sendSize = FLAGS_sock_send_buf_size; + int recvSize = FLAGS_sock_recv_buf_size; + CHECK_GE( + setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &recvSize, sizeof(recvSize)), + 0); + CHECK_GE( + setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)), + 0); +#endif + + if (FLAGS_small_messages) { + int optval = 1; + CHECK_GE( + setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)), + 0); +#ifdef TCP_QUICKACK + optval = 1; + CHECK_GE( + setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)), + 0); +#endif + } + int reuse = 1; + CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)), + 0); +} + +/** + * @brief class constructor for SocketServer + * @param[in] addr sock bind address + * @param[in] port sock bind port + * @param[in] rdmaCpu rdma sock bind cpu core + * + * @note start one socket server which hosts parameter server process. + * rdmaCpu is passed to rdma deamon for better performance, and + * start tcp socket instead of rdma socket if rdmaCpu is equal + * to -1. Each trainer process starts one connection to one socket + * server, and use --ports_num to build more connections to harness + * fat communication channel if necessary. + * each connection is controlled by single thread with blocking + * read and write. + */ +SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu) + : port_(port), addr_(addr), stopping_(false) { + if (rdmaCpu == -1) { + tcpRdma_ = F_TCP; + socket_ = 0; + maxPendingConnections_ = FLAGS_sock_listen_queue_size; + } else { + tcpRdma_ = F_RDMA; + rdmaCpu_ = rdmaCpu; + rdmaSocket_ = 0; + + std::stringstream ss; + ss << port; + rdmaUri_ = "rdma://" + addr + ":" + ss.str(); + } + + /// trigger to initialize RDMA lib + CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; +} + +SocketServer::~SocketServer() { + stopping_ = true; + /// trigger accept thread to stop + { + SocketClient trigger(addr_.empty() ? "127.0.0.1" : addr_, port_, tcpRdma_); + } + this->join(); +} + +/** + * @brief start one tcp server which hosts parameter server + * + * @note do tcp socket bind and listen. 
it will spawn one thread + * for each connection + */ +void SocketServer::tcpServer() { + int newsockfd; + socklen_t clilen; + struct sockaddr_in serv_addr, cli_addr; + struct hostent *server; + + /// First call to socket() function + socket_ = socket(AF_INET, SOCK_STREAM, 0); + CHECK(socket_ >= 0) << "ERROR opening socket"; + + /// Initialize socket structure + bzero((char *)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(port_); + if (!addr_.empty()) { + server = gethostbyname(addr_.c_str()); + CHECK(server) << "ERROR, no such host: " << addr_; + bcopy((char *)server->h_addr, + (char *)&serv_addr.sin_addr.s_addr, + server->h_length); + } else { + serv_addr.sin_addr.s_addr = INADDR_ANY; + } + + setOption(socket_); + + /// Now bind the host address using bind() call. + CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) + << "ERROR on binding " << addr_; + + /// Now start listening for the clients, here process will + /// go in sleep mode and will wait for the incoming connection + listen(socket_, maxPendingConnections_); + clilen = sizeof(cli_addr); + + while (true) { + /// Accept actual connection from the client + newsockfd = accept(socket_, (struct sockaddr *)&cli_addr, &clilen); + if (stopping_) { + break; + } + CHECK(newsockfd >= 0) << "ERROR on accept"; + constexpr int kPeerNameLen = 128; + char peerName[kPeerNameLen]; + CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen)); + + SocketWorker *worker = + new SocketWorker(createChannel(newsockfd, std::string(peerName)), this); + worker->start(); + worker->detach(); + } + close(socket_); + LOG(INFO) << "pserver accept thread finish, addr=" << addr_ + << " port=" << port_; +} + +/** + * @brief start one rdma server which hosts parameter server + * + * @note do rdma bind and listen, which calling self-defined socket + * like rdma library. 
it will spawn one thread for each connection + */ +void SocketServer::rdmaServer() { + struct sxi_sock *newsock; + + /// First call to socket() function + rdmaSocket_ = rdma::ssocket(rdmaCpu_); + CHECK(rdmaSocket_) << "ERROR opening RDMA socket"; + + CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) + << "ERROR bind RDMA socket"; + + /// Now start listening for the clients, here process will + /// go in sleep mode and will wait for the incoming connection + CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; + + while (true) { + /// Accept actual connection from the client + newsock = rdma::accept(rdmaSocket_); + if (stopping_) { + break; + } + CHECK(newsock) << "ERROR on accept"; + + constexpr int kPeerNameLen = 128; + char peerName[kPeerNameLen]; + + struct sockaddr_in *saddr = rdma::getSourceAddress(newsock); + CHECK(inet_ntop(AF_INET, &saddr->sin_addr, peerName, kPeerNameLen)); + + SocketWorker *worker = + new SocketWorker(createChannel(newsock, std::string(peerName)), this); + worker->start(); + worker->detach(); + } + rdma::close(rdmaSocket_); + LOG(INFO) << "pserver accept thread finish, rdma uri=" << rdmaUri_; +} + +/** + * @brief start a socket server + * + * @note framework for starting socket server + */ +void SocketServer::run() { + if (tcpRdma_ == F_TCP) { + LOG(INFO) << "tcp server start "; + tcpServer(); + } else if (tcpRdma_ == F_RDMA) { + LOG(INFO) << "rdma server start "; + rdmaServer(); + } +} + +/** + * @brief class constructor for rdma client deamons + * + * @note automatically start several client deamons for better performance + */ +std::unique_ptr RdmaClientDaemons::daemons_ = nullptr; +std::once_flag RdmaClientDaemons::initDataFlag_; + +RdmaClientDaemons::RdmaClientDaemons() { + if (FLAGS_rdma_tcp == "rdma") { + rdma::init(); + + struct sxi_socket *socket; + onlineCpus_ = rdma::numCpus(); + for (auto i = 0; i < onlineCpus_; i++) { + socket = rdma::csocket(i); + CHECK(socket) << "ERROR open client socket daemon"; + + rdmaClientSocket_.push_back(socket); + } + LOG(INFO) << "RDMA client daemons started, onlineCpus_:" << onlineCpus_; + /// round robin scheduler for new connection + curCpu_ = 0; + /// wait daemons to start completely. 
+ sleep(2); + } +} + +RdmaClientDaemons::~RdmaClientDaemons() { + if (FLAGS_rdma_tcp == "rdma") { + for (auto i = 0; i < onlineCpus_; i++) { + rdma::close(rdmaClientSocket_[i]); + } + LOG(INFO) << "RDMA client daemons is destoryed, onlineCpus_ " + << onlineCpus_; + } +} + +/** + * @brief worker thread main context + * + * @note each connection from client(trainer) is controlled by single worker + * thread, which is for handling all parameter server requests + */ +void SocketWorker::run() { + LOG(INFO) << "worker started, peer = " << channel_->getPeerName(); + + std::vector inputIovs; + + while (true) { + std::unique_ptr msgReader = channel_->readMessage(); + if (!msgReader) { + break; + } + + auto callback = [this](const std::vector &outputIovs) { + channel_->writeMessage(outputIovs); + }; + + server_->handleRequest(std::move(msgReader), callback); + } + + LOG(INFO) << "worker begin to finish, peer = " << channel_->getPeerName(); + delete this; +} + +/** + * @brief start one tcp connection to tcp server + * @param[in] serverAddr tcp server ip + * @param[in] serverPort tcp server port + * + * @note each object contains one channel which accept byte stream + */ +void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { + struct sockaddr_in serv_addr; + struct hostent *server; + + int errRet; // temp for gethostbyname_r + + /// Create a socket point + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + CHECK(sockfd >= 0) << "ERROR opening socket"; + +#if defined(__OSX__) || defined(__APPLE__) + server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); + CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr + << " ret = " << errRet; + CHECK(server) << "getipnodebyname error!"; +#else + struct hostent hostinfo; + char buf[1024]; // temp for gethostbyname_r + CHECK_EQ( + 0, + gethostbyname_r( + serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet)) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "gethostbyname_r error!"; +#endif + + bzero((char *)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + bcopy((char *)server->h_addr, + (char *)&serv_addr.sin_addr.s_addr, + server->h_length); + serv_addr.sin_port = htons(serverPort); + + setOption(sockfd); + + /// Now connect to the server + int retry_count = 0; + do { + if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) { + break; + } + + if (errno == ECONNREFUSED) { + LOG(WARNING) << "connection refused by pserver, try again!"; + if (retry_count++ >= 7) { + LOG(FATAL) << "connection refused by pserver, maybe pserver failed!"; + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } else { + CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" + << serverPort << "errorno: " << errno; + } + } while (errno == ECONNREFUSED); + + channel_.reset(new SocketChannel(sockfd, serverAddr)); + tcpRdma_ = F_TCP; +} + +/** + * @brief start one RDMA connection to rdma server + * @param[in] serverAddr rdma server ip + * @param[in] serverPort rdma server port + * + * @note each object contains one channel which accept byte stream + * for rdma, low level sock also provide byte stream api. 
+ */ +void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { + struct sxi_sock *sock; + + std::stringstream ss; + ss << serverPort; + + std::string rdmaUri = "rdma://" + serverAddr + ":" + ss.str(); + + RdmaClientDaemons *daemons = RdmaClientDaemons::daemons_->get(); + socketDaemon_ = daemons->selectDaemon(); + + /// connect to server with socket daemon + sock = rdma::connect(socketDaemon_, rdmaUri.c_str()); + CHECK(sock) << "ERROR connect to server" << rdmaUri; + + std::vector seg; + str::split(rdmaUri, '/', &seg); + std::string server = seg.at(seg.size() - 1); + channel_.reset(new SocketChannel(sock, server)); + tcpRdma_ = F_RDMA; +} + +/** + * @brief class constructor + * @param[in] serverAddr pserver ip address + * @param[in] serverPort pserver port + * @param[in] ChannelType F_TCP or F_RDMA + * + * @note responsible for building one connection to specified pserver port + */ +SocketClient::SocketClient(const std::string &serverAddr, + int serverPort, + enum ChannelType channelType) { + if (channelType == F_RDMA) + RdmaClient(serverAddr, serverPort); + else + TcpClient(serverAddr, serverPort); +} + +} // namespace paddle diff --git a/paddle/legacy/pserver/LightNetwork.h b/paddle/legacy/pserver/LightNetwork.h new file mode 100644 index 0000000000000000000000000000000000000000..380f86832f5894fdf29588dde9a77068c624e066 --- /dev/null +++ b/paddle/legacy/pserver/LightNetwork.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "SocketChannel.h" + +#include +#include +#include +#include + +#include "paddle/legacy/utils/Thread.h" + +struct sxi_socket; + +namespace paddle { + +class SocketWorker; + +/** + * @brief class for holding all parameters processing for current port + * + * @note each parameter server inherits from one socket server, each + * server contains serveral woker threads which are to parallelize + * the processing of computation, but share some common datas stored + * in child class of socketserver. + */ +class SocketServer : public Thread { + // rdmaCpu controls the cpu affinity of RDMA server daemon, + // which could benifit performance. rdmaCpu = -1 means TCP + // is used instead of RDMA transport. + public: + SocketServer(const std::string& addr, int port, int rdmaCpu); + ~SocketServer(); + + virtual void run(); + + typedef std::function& outputIovs)> + ResponseCallback; + + protected: + // + // The derived class needs to implement this function + // to handle the request received by SocketWorker + // The request is encapsulated by MsgReader, which contains + // a set of blocks. 
+ virtual void handleRequest(std::unique_ptr msgReader, + ResponseCallback callback) = 0; + + std::unique_ptr createChannel(int sock, + const std::string& peerName) { + return std::unique_ptr(new SocketChannel(sock, peerName)); + } + std::unique_ptr createChannel(struct sxi_sock* sock, + const std::string& peerName) { + return std::unique_ptr(new SocketChannel(sock, peerName)); + } + + friend class SocketWorker; + + private: + void rdmaServer(); + void tcpServer(); + + void detach() {} // detach accept thread is forbidden + + protected: + enum ChannelType tcpRdma_; + // for rdma + int rdmaCpu_; + std::string rdmaUri_; + sxi_socket* rdmaSocket_; + // for tcp + int port_; + std::string addr_; + int socket_; + int maxPendingConnections_; + bool stopping_; +}; + +/** + * @brief class for holding one connection from one trainer + * + * @note all parameter processing will run in the context of this worker + */ +class SocketWorker : public Thread { + public: + SocketWorker(std::unique_ptr&& channel, SocketServer* server) + : channel_(std::move(channel)), server_(server) {} + + virtual ~SocketWorker() {} + + virtual void run(); + + protected: + std::unique_ptr channel_; + SocketServer* server_; + enum ChannelType tcpRdma_; +}; + +/** + * @brief class for providing rdma client deamon thread + * + * @note the deamons are required by sock like rdam library. Here + * use singleton model for daemons. Each deamon hosts in + * single cpu core for better load balance performance + */ +class RdmaClientDaemons { + private: + RdmaClientDaemons(); + + static std::unique_ptr daemons_; + + public: + static RdmaClientDaemons* get() { + std::call_once(RdmaClientDaemons::initDataFlag_, + &RdmaClientDaemons::getInstance); + + return daemons_.get(); + } + + struct sxi_socket* selectDaemon() { + int cpu = curCpu_; + curCpu_ = (curCpu_ + 1) % onlineCpus_; + + LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_; + return rdmaClientSocket_[cpu]; + } + + ~RdmaClientDaemons(); + + public: + friend class SocketClient; + + private: + static std::once_flag initDataFlag_; + static void getInstance() { + if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons()); + } + + std::vector rdmaClientSocket_; + std::atomic curCpu_; + int onlineCpus_; +}; + +/** + * @brief management for client connection which are from trainers + * + * @note it contains one channel descriptor which used to write and + * read data + */ +class SocketClient { + public: + SocketClient(const std::string& serverAddr, + int serverPort, + enum ChannelType channelType); + + SocketChannel* getChannel() { return channel_.get(); } + + protected: + std::unique_ptr channel_; + struct sxi_socket* socketDaemon_; + enum ChannelType tcpRdma_; + + private: + void RdmaClient(const std::string& serverAddr, int serverPort); + void TcpClient(const std::string& serverAddr, int serverPort); +}; + +std::string getIpAddr(std::string& device); +void setOption(int sockfd); + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterClient2.cpp b/paddle/legacy/pserver/ParameterClient2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4c544ddc28517f50e7deb23d4fa7a82b34d42677 --- /dev/null +++ b/paddle/legacy/pserver/ParameterClient2.cpp @@ -0,0 +1,781 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "ParameterClient2.h" +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/StringUtil.h" + +DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers"); +DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); + +namespace paddle { + +template +void copyToRepeatedField(google::protobuf::RepeatedField* dest, + const T2* src, + size_t size) { + dest->Clear(); + dest->Reserve(size); + for (size_t i = 0; i < size; ++i) { + dest->AddAlreadyReserved(src[i]); + } +} + +ParameterClient2::ParameterClient2(bool separate, int port, int numPorts) + : BaseClient(separate, numPorts), port_(port) { +#ifndef PADDLE_DISABLE_TIMER + forwardbackwordTime_ = 0; +#endif +} + +int ParameterClient2::calcParameterBlockSize( + const std::vector& parameters, size_t serviceNum) { + size_t totalSize = 0; + for (auto& para : parameters) { + totalSize += para->getSize(); + } + size_t perServerSize = totalSize / serviceNum; + + int sizeBits = 64 - __builtin_clzl(perServerSize); + + /// 2^10 is min block size + /// 2^7 will be max number of blocks in one pserver + int blockSizeBits = std::max((sizeBits - 7), 10); + return 1 << blockSizeBits; +} + +void ParameterClient2::initThreads() { + threadNum_ = serviceNum_; + if (FLAGS_parallel_thread_num > 1) { + LOG(INFO) << "parallel_thread_num dosent need to set"; + } + syncThreadPool_.reset(new SyncThreadPool(threadNum_)); + startThreads(); +} + +bool ParameterClient2::init(const std::vector& parameters) { + destroy(); + + std::vector hosts; + str::split(FLAGS_pservers, ',', &hosts); + serviceNum_ = hosts.size() * numPorts_; + uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_); + + /// setup prefetch matrix if exists + for (auto& para : parameters) { + /// set block size for each parameter + para->getConfig().set_parameter_block_size( + para->getConfig().sparse_remote_update() ? 
para->getConfig().dims(1) + : denseBlockSize); + } + + for (auto& para : parameters) { + CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized"; + parameterMap_[para->getID()] = para; + } + + allSegments_.reserve(parameters.size()); + + for (auto& para : parameters) { + ParameterSegments segments; + segments.name = para->getName(); + segments.id = para->getID(); + allSegments_.push_back(segments); + if (para->getConfig().sparse_remote_update()) { + CHECK_EQ(para->getConfig().parameter_block_size(), + para->getConfig().dims(1)) + << "For sparse remote update parameter," + << " block size is the width of each row."; + } + } + + /// init clients + clients_.reserve(serviceNum_); + recvDataMems_.resize(serviceNum_); + + for (size_t i = 0; i < hosts.size(); ++i) { + for (int j = 0; j < numPorts_; ++j) { + LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":" + << port_ + j; + if (FLAGS_rdma_tcp == "rdma") { + clients_.emplace_back(hosts[i], port_ + j, F_RDMA); + } else { + clients_.emplace_back(hosts[i], port_ + j, F_TCP); + } + } + } + + sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_)); + + sleep(2); + + initThreads(); + + return true; +} + +ParameterClient2::~ParameterClient2() { destroy(); } + +void ParameterClient2::destroy() { + if (clients_.empty()) { + /// this means not initialized. + return; + } + finishThreads(); + + parameterMap_.clear(); + allSegments_.clear(); + clients_.clear(); +} + +void ParameterClient2::sendParallel(int tid, + size_t numThreads, + ParameterType recvParameterType) { + int numMyClients = divup(serviceNum_ - tid, numThreads); + + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_sendAndRecv_send"); + int i = numThreads * j + tid; + /// Try to make different clients to send data to different pservers + /// at the same time so that they will not flood data to the same + /// pserver. + i = calcClientId(i, serviceNum_); + clients_[i].send("sendParameter", + sendJob_.parallelRequests[i], + sendJob_.parallelInputIovs[i]); + + /// clear large structure + sendJob_.parallelRequests[i].Clear(); + sendJob_.parallelInputIovs[i].clear(); + } + + std::vector bufs; + SendParameterResponse response; + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_sendAndRecv_recv"); + int i = numThreads * j + tid; + i = calcClientId(i, serviceNum_); + auto msgReader = clients_[i].recv(&response); + CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size()); + bufs.clear(); + bufs.reserve(response.blocks_size()); + for (auto& block : response.blocks()) { + auto it = parameterMap_.find(block.para_id()); + CHECK(it != parameterMap_.end()); + Parameter* parameter = it->second.get(); + real* buf = nullptr; + if (parameter->getBuf(recvParameterType)) { + buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos()); + } else { + auto recvMat = dynamic_cast( + parameter->getMat(recvParameterType).get()); + CHECK(recvMat); + size_t width = parameter->getConfig().dims(1); + // TODO(wuyi): need add lock here? may also cause resize. + buf = recvMat->getLocalRow(block.begin_pos() / width); + } + /// sparse_id is not useful while receiving data since sparse data + /// storage is continuous, do commit recieved data as that of dense. 
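        // Illustrative aside on block sizing (see calcParameterBlockSize()
        // above): with, say, 10M parameters shared across 4 pservers,
        // perServerSize = 2.5M, sizeBits = 64 - __builtin_clzl(2500000) = 22,
        // blockSizeBits = max(22 - 7, 10) = 15, so dense blocks hold
        // 2^15 = 32768 elements and each pserver ends up with about 76 of
        // them, within the 2^7 bound mentioned in that function.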
+ bufs.push_back(buf); + } + msgReader->readBlocks(bufs); + } +} + +void ParameterClient2::prepareSendData( + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + BatchStatus batchStatus, + SendJob* sendJob) { + sendJob->parallelRequests.resize(serviceNum_); + sendJob->parallelInputIovs.resize(serviceNum_); + + for (auto& request : sendJob->parallelRequests) { +#ifndef PADDLE_DISABLE_TIMER + if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) { + request.set_forwardbackward_time(forwardbackwordTime_); + } +#endif + request.set_trainer_id(trainerId_); + request.set_update_mode(updateMode); + request.set_send_back_parameter(sendBackParameter); + request.set_send_back_parameter_type(sendBackParameterType); + request.set_num_samples(numSamples); + request.set_cost(cost); + request.set_batch_status(batchStatus); + CHECK_EQ(request.blocks_size(), 0); + VLOG(10) << "request: trainer_id: " << request.trainer_id() + << " update_mode" << request.update_mode() + << " send_back_parameter: " << request.send_back_parameter() + << " send_back_parameter_type: " + << request.send_back_parameter_type() + << " num_samples: " << request.num_samples() + << " cost: " << request.cost() + << " batch_status: " << request.batch_status(); + } + for (const auto& segments : parameterSegments) { + const auto it = parameterMap_.find(segments.id); + CHECK(it != parameterMap_.end()); + Parameter* parameter = it->second.get(); + CHECK(parameter != nullptr) << "parameter is nullptr"; + int64_t nameHash = std::hash()(segments.name); + bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM || + updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE || + updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO); + bool sparseUpdate = parameter->getConfig().sparse_remote_update() && + (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT || + updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD || + updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE); + + const auto blockSize = parameter->getConfig().parameter_block_size(); + CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize; + const auto paraSize = parameter->getSize(); + if (sparseUpdate) { + auto prefetchMat = std::dynamic_pointer_cast( + parameter->getMat(PARAMETER_VALUE)); + CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr"; + auto sendMat = dynamic_cast( + parameter->getMat(parameterType).get()); + CHECK(sendMat != nullptr) << "sendMat is nullptr"; + + syncThreadPool_->exec([&](int tid, size_t numThreads) { + std::lock_guard guard(sparseAutoGrowthMutex_); + const auto& localIndices = prefetchMat->getLocalIndices(); + /// num of sparse rows + size_t nLocalBlocks = localIndices.size(); + uint64_t beginDim = 0; + uint64_t endDim = 0; + + // HACK(typhoonzero): let it resize first + prefetchMat->getLocalRow(nLocalBlocks); + sendMat->getLocalRow(nLocalBlocks); + + for (size_t row = 0; row < nLocalBlocks; ++row) { + int64_t blockId = localIndices[row]; // local row -> sparse row + int serverId = std::abs((blockId + nameHash) % serviceNum_); + if (serverId % numThreads != (size_t)tid) { + continue; + } + + beginDim = blockId * blockSize; + endDim = std::min(beginDim + blockSize, paraSize); + + auto& request = sendJob->parallelRequests[serverId]; + ParameterBlock* block = request.add_blocks(); + block->set_para_id(segments.id); + /// global sparse row id + block->set_block_id(blockId); + /// local row offset + 
block->set_begin_pos(row * blockSize); + /// block len + block->set_block_size(endDim - beginDim); + if (sendingPara) { + sendJob->parallelInputIovs[serverId].push_back( + {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); + /// detect sparse parameter distribution + sparseDistribution_->probeDistribution(serverId, + sizeof(real) * blockSize); + } + } + }); + + } else { /// parameter set for dense and sparse + real* buf = + sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr; + uint64_t endDim = 0; + for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) { + endDim = std::min(beginDim + blockSize, paraSize); + int64_t blockId = beginDim / blockSize; + int serverId = std::abs((blockId + nameHash) % serviceNum_); + + auto& request = sendJob->parallelRequests[serverId]; + ParameterBlock* block = request.add_blocks(); + block->set_para_id(segments.id); + block->set_block_id(blockId); + block->set_begin_pos(beginDim); + block->set_block_size(endDim - beginDim); + if (buf) { + sendJob->parallelInputIovs[serverId].push_back( + {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))}); + } + } + } + } // parameterSegments + + sparseDistribution_->checkAndResetDistribution(); +} + +void ParameterClient2::sendAndReceiveParameter( + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + ParameterType recvParameterType) { + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, + /*batchStatus = */ BATCH_START_AND_FINISH, + &sendJob_); + + syncThreadPool_->exec([&](int tid, size_t numThreads) { + this->sendParallel(tid, numThreads, recvParameterType); + }); +} + +void ParameterClient2::sendParameter( + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { + SendJobPtr sendJob = std::make_shared(); + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + PARAMETER_VALUE, + batchStatus, + sendJob.get()); + + for (int i = 0; i < threadNum_; i++) { + sendJobQueue_[i]->enqueue(sendJob); + } +} + +void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); } + +void ParameterClient2::send(int threadId) { + int index = threadId; + LOG(INFO) << "send thread " << threadId << " started"; + int numMyClients = divup(serviceNum_ - index, threadNum_); + while (true) { + SendJobPtr recvJob = sendJobQueue_[index]->dequeue(); + if (stopping_) { + recvJobQueue_[index]->enqueue(recvJob); + break; + } + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_send"); + int i = threadNum_ * j + index; + /// Try to make different clients to send data to different pservers + /// at the same time so that they will not flood data to the same + /// pserver. 
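      // Illustrative note: calcClientId(i, serviceNum_) staggers the start
      // point per trainer, returning
      //   (i + FLAGS_trainer_id * numPorts_) % serviceNum_.
      // With 4 pservers, numPorts_ = 1 and a single send thread, trainer 0
      // visits them in the order 0,1,2,3 while trainer 1 visits 1,2,3,0, so
      // concurrent trainers do not all hit pserver 0 first.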
+ i = calcClientId(i, serviceNum_); + if (recvJob->parallelRequests.size()) { + clients_[i].send("sendParameter", + recvJob->parallelRequests[i], + recvJob->parallelInputIovs[i]); + } else { + clients_[i].send("sendData", + recvJob->parallelDataRequests[i], + recvJob->parallelInputIovs[i]); + } + } + recvJobQueue_[index]->enqueue(recvJob); + } +} + +void ParameterClient2::recv(int threadId) { + LOG(INFO) << "recv thread " << threadId << " started"; + int index = threadId; + int numMyClients = divup(serviceNum_ - index, threadNum_); + while (true) { + std::vector bufs; + SendParameterResponse response; + SendDataResponse dataResponse; + SendJobPtr recvJob = recvJobQueue_[index]->dequeue(); + if (stopping_) break; + for (int j = 0; j < numMyClients; ++j) { + REGISTER_TIMER("client_recv"); + int i = threadNum_ * j + index; + i = calcClientId(i, serviceNum_); + if (recvJob->parallelRequests.size()) { + auto msgReader = clients_[i].recv(&response); + CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size()); + bufs.clear(); + bufs.reserve(response.blocks_size()); + for (auto& block : response.blocks()) { + auto it = parameterMap_.find(block.para_id()); + CHECK(it != parameterMap_.end()); + Parameter* parameter = it->second.get(); + real* buf = + parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos()); + CHECK_EQ(msgReader->getBlockLength(bufs.size()), + sizeof(real) * (block.block_size())); + bufs.push_back(buf); + } + msgReader->readBlocks(bufs); + } else { + auto msgReader = clients_[i].recv(&dataResponse); + CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size()); + size_t totalLen = msgReader->getTotalLength(); + if (0 == totalLen) { + continue; + } + auto& recvMem = recvDataMems_[dataResponse.server_id()]; + CHECK_EQ(dataResponse.blocks_size(), 1) + << "Only one block currently support now!"; + auto& block = dataResponse.blocks(0); + CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); + recvMem = std::make_shared(totalLen); + msgReader->readNextBlock(recvMem.get()->getBuf()); + } + } + recvSyncBarrier_->wait(); + } +} + +void ParameterClient2::waitPassStart() { + WaitPassStartRequest request; + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::waitPassFinish() { + WaitPassFinishRequest request; + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::synchronize(SyncObject syncObjectId) { + SynchronizeRequest request; + request.set_sync_object_id(syncObjectId); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) { + SynchronizeRequest request; + request.set_sync_object_id(syncObjectId); + request.set_trainer_id(trainerId_); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::setConfig(const OptimizationConfig& optConfig, + const std::string& saveDir, + bool isSparseServer) { + SetConfigRequest request; + std::vector responses; + + for (auto& nameAndPara : parameterMap_) { + *request.add_param_configs() = nameAndPara.second->getConfig(); + } + + *request.mutable_opt_config() = optConfig; + request.set_save_dir(saveDir); + request.set_is_sparse_server(isSparseServer); + + std::vector requests; + requests.resize(clients_.size()); + for (size_t i = 0; i < requests.size(); ++i) { + requests[i].CopyFrom(request); + requests[i].set_server_id(i); + } + + responses.resize(clients_.size()); + size_t numClients = clients_.size(); + for (size_t i = 0; 
i < numClients; ++i) { + clients_[i].send(__func__, requests[i]); + } + for (size_t i = 0; i < numClients; ++i) { + clients_[i].recv(&responses[i]); + } +} + +bool ParameterClient2::inStatus(PServerStatus status) { + GetStatusRequest request; + std::vector responses; + + bool ok = true; + multiCall("getStatus", request, &responses); + for (auto& response : responses) { + if (response.status() != status) { + ok = false; + } + } + + return ok; +} + +void ParameterClient2::setStatus(PServerStatus status) { + SetStatusRequest request; + request.set_status(status); + std::vector responses; + multiCall(__func__, request, &responses); +} + +void ParameterClient2::waitForStatus(PServerStatus status) { + while (!inStatus(status)) { + sleep(1); + } +} + +template +static void validateResponses(const std::vector& responses) { + for (auto& response : responses) { + CHECK(response.return_message().empty()) + << "client" << &response - &responses[0] + << " error:" << response.return_message(); + } +} + +PServerVector ParameterClient2::createVector() { + CreateVectorRequest request; + std::vector responses; + int64_t handle = -1; + + multiCall(__func__, request, &responses); + validateResponses(responses); + + for (auto& response : responses) { + if (handle == -1) { + handle = response.handle(); + } else { + CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client" + << &response - &responses[0] << " " + << handle << " " << response.handle(); + } + } + return PServerVector{handle}; +} + +void ParameterClient2::releaseVector(PServerVector handle) { + ReleaseVectorRequest request; + std::vector responses; + + request.set_handle(handle.handle); + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +PServerMatrix ParameterClient2::createMatrix(int32_t numCols) { + CreateMatrixRequest request; + std::vector responses; + int64_t handle = -1; + + request.set_num_cols(numCols); + multiCall(__func__, request, &responses); + validateResponses(responses); + + for (auto& response : responses) { + if (handle == -1) { + handle = response.handle(); + } else { + CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client" + << &response - &responses[0] << " " + << handle << " " << response.handle(); + } + } + return PServerMatrix{handle}; +} + +void ParameterClient2::releaseMatrix(PServerMatrix handle) { + ReleaseMatrixRequest request; + std::vector responses; + + request.set_handle(handle.handle); + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) { + ProtoVector& pvec = *op->add_vectors(); + size_t dim = vec->getSize(); + pvec.set_dim(dim); + copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize()); +} + +void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) { + ProtoMatrix& pmat = *op->add_matrices(); + pmat.set_num_cols(mat->getWidth()); + pmat.set_num_rows(mat->getHeight()); + copyToRepeatedField( + pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows()); +} + +static inline real addTwo(real a, double b) { return a + b; } + +void ParameterClient2::doOperation(PreparedOperations& ops, + bool waitForGradient, + bool sendBackGradient, + bool releasePass) { + std::vector responses; + ops.request_.set_wait_for_gradient(waitForGradient); + ops.request_.set_send_back_parameter(sendBackGradient); + ops.request_.set_release_pass(releasePass); + multiCall(__func__, ops.request_, &responses); + 
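// [Illustrative sketch, standard library only; a simplified stand-in for the
// aggregation that follows.] doOperation() zero-initializes each local result
// on the first response and then element-wise accumulates the matching entry
// from every pserver; the same reduction over plain vectors looks like this.
#include <algorithm>
#include <functional>
#include <vector>
// Assumes every per-server result vector has the same length.
std::vector<double> reduceAcrossServers(
    const std::vector<std::vector<double>>& perServer) {
  if (perServer.empty()) return {};
  std::vector<double> acc(perServer.front().size(), 0.0);  // init to zero once
  for (const auto& v : perServer) {
    std::transform(acc.begin(), acc.end(), v.begin(), acc.begin(),
                   std::plus<double>());  // acc[i] += v[i]
  }
  return acc;
}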
validateResponses(responses); + size_t numPassFinishServers = 0; + + size_t numOps = ops.request_.operations_size(); + for (auto& response : responses) { + numPassFinishServers += response.pass_finish(); + CHECK_EQ(numOps, (size_t)response.results_size()); + for (size_t opId = 0; opId < numOps; ++opId) { + const OperationResult& result = response.results(opId); + std::vector& resultScalars = ops.localResults_[opId].resultScalars; + std::vector& resultVectors = + ops.localResults_[opId].resultVectors; + std::vector& resultMatrices = + ops.localResults_[opId].resultMatrices; + + if (&response == &responses[0]) { + /// Initialize results to zero + + resultScalars.resize(result.scalars_size()); + for (auto p : resultScalars) { + if (!p) continue; + *p = 0; + } + size_t numVectors = result.vectors_size(); + resultVectors.resize(numVectors); + for (size_t i = 0; i < numVectors; ++i) { + if (!resultVectors[i]) continue; + resultVectors[i]->resize(result.vectors(i).dim()); + resultVectors[i]->zeroMem(); + } + size_t numMatrices = result.matrices_size(); + resultMatrices.resize(numMatrices); + for (size_t i = 0; i < numMatrices; ++i) { + if (!resultMatrices[i]) continue; + resultMatrices[i]->resize(result.matrices(i).num_rows(), + result.matrices(i).num_cols()); + resultMatrices[i]->zeroMem(); + } + } + + // aggregate results from each pserver to results + + CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size()); + for (ssize_t i = 0; i < result.scalars_size(); ++i) { + real* rscalar = resultScalars[i]; + if (!rscalar) continue; + *rscalar += result.scalars(i); + } + + CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size()); + for (auto& vec : result.vectors()) { + int i = &vec - &result.vectors(0); + CpuVectorPtr rvec = resultVectors[i]; + if (!rvec) continue; + CHECK_EQ(rvec->getSize(), (size_t)vec.dim()); + std::transform(rvec->getData(), + rvec->getData() + rvec->getSize(), + vec.values().data(), + rvec->getData(), + addTwo); + } + + CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size()); + for (auto& mat : result.matrices()) { + int i = &mat - &result.matrices(0); + CpuMatrixPtr rmat = resultMatrices[i]; + if (!rmat) continue; + CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows()); + CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols()); + + std::transform(rmat->getData(), + rmat->getData() + rmat->getElementCnt(), + mat.values().data(), + rmat->getData(), + addTwo); + } + } + } + passFinish_ = numPassFinishServers == clients_.size(); +} + +real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) { + real result = 0.0; + PreparedOperations ops; + ops.addOperation(PSERVER_OP_utv, u, v)(&result); + doOperation(ops, false, false); + return result; +} + +void ParameterClient2::vectorScale(PServerVector u, real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au, u, a); + doOperation(ops, false, false); +} + +void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_COPY, src, dst); + doOperation(ops, false, false); +} + +void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1); + doOperation(ops, false, false); +} + +void ParameterClient2::vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, + real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0); + doOperation(ops, false, false); +} + +void 
ParameterClient2::vectorScaleInto(PServerVector u, + PServerVector v, + real a) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0); + doOperation(ops, false, false); +} + +void ParameterClient2::loadValueVector(const std::string& dirName) { + LoadValueRequest request; + request.set_dir_name(dirName); + std::vector responses; + + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +void ParameterClient2::saveValueVector(const std::string& dirName) { + SaveValueRequest request; + request.set_dir_name(dirName); + std::vector responses; + + multiCall(__func__, request, &responses); + validateResponses(responses); +} + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterClient2.h b/paddle/legacy/pserver/ParameterClient2.h new file mode 100644 index 0000000000000000000000000000000000000000..9320e19c4df6c5439266f89e5599b9496f145172 --- /dev/null +++ b/paddle/legacy/pserver/ParameterClient2.h @@ -0,0 +1,602 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/pserver/BaseClient.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Queue.h" +#include "paddle/legacy/utils/Util.h" + +#include "ParameterService.pb.h" + +#include "ProtoServer.h" +#include "SparseParameterDistribution.h" + +DECLARE_int32(parallel_thread_num); + +namespace paddle { + +struct PServerMatrix { + int64_t handle; +}; + +struct PServerVector { + int64_t handle; +}; + +/** + * @brief A class to help to prepare server-side operations. + */ +class PreparedOperations { + protected: + class ResultsAdder; + struct LocalOperationResult; + + public: + /** + * Offers an easy way to prepare operations that will be performed on + * server-side. + * + * Usage: + * @code + * addOperation(optype, arguments...)(results...) + * @endcode + * + * Examples: + * 1. set pserver vector to 1: + * @code + * PServerVector u = parameterClient.createVector(); + * addOperation(PSERVER_OP_RESET, u, (real)1); + * @endcode + * + * 2. Compute inner product of to pserver vectors. + * @code + * PServerVector u = parameterClient.createVector(); + * PServerVector v = parameterClient.createVector(); + * real result; + * addOperation(PSERVER_OP_utv, u, v)(&result) + * @endcode + * + * @param[in] operation The operation that pserver will perform. + * @param[in] args Argument list of the operation + * @return A ResultsAdder object initialized with the last element of + * localResults_. + */ + template + ResultsAdder addOperation(MatrixVectorOperation operation, Args... 
args) { + Operation* op = request_.add_operations(); + op->set_operation(operation); + localResults_.emplace_back(); + addOperationHelper(op, args...); + return ResultsAdder(&localResults_.back()); + } + + protected: + void addOperationHelper(Operation* op) {} + + /** + * @brief Helper function to add an new operation that takes a PServerVector + * as an operand. + */ + void addOperationHelper(Operation* op, PServerVector arg) { + op->add_pvectors(arg.handle); + } + + /** + * @brief Helper function to add an new operation that takes a PServerMatrix + * as an operand. + */ + void addOperationHelper(Operation* op, PServerMatrix arg) { + op->add_pmatrices(arg.handle); + } + + /** + * @brief Helper function to add an new operation that takes a real valued + * scalar as an operand. + */ + void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); } + + /** + * @brief Helper function to add an new operation that takes a CpuVectorPtr + * as an operand. + * @note The array of CpuVectors that arg points to will be copied to + * op's vectors field. + */ + void addOperationHelper(Operation* op, CpuVectorPtr arg); + + /** + * @brief Helper function to add an new operation that takes a CpuMatrixPtr + * as an operand. + * @note The array of CpuMatrixs that arg points to will be copied to + * op's matrices field. + */ + void addOperationHelper(Operation* op, CpuMatrixPtr arg); + + /** + * @brief Helper function to add an new operation and prepare the operands. + * + * @tparam Arg An operand of the operation. + * @tparam Args A list of rest operands of the operation. + * @param op Pointer to an Operation object. + */ + template + void addOperationHelper(Operation* op, Arg arg, Args... args) { + addOperationHelper(op, arg); + addOperationHelper(op, args...); + } + + /** + * @brief ResultsAdder offers easy ways to quickly store operation results. + */ + class ResultsAdder { + public: + explicit ResultsAdder(LocalOperationResult* localResult) + : localResult_(localResult) {} + template + void operator()(Args... args) { + addResult(args...); + } + void addResult() {} + void addResult(real* arg) { localResult_->resultScalars.push_back(arg); } + void AddResult(CpuVectorPtr arg) { + localResult_->resultVectors.push_back(arg); + } + void AddResult(CpuMatrixPtr arg) { + localResult_->resultMatrices.push_back(arg); + } + template + void addResult(Arg arg, Args... args) { + addResult(arg); + addResult(args...); + } + + protected: + LocalOperationResult* localResult_; + }; + + protected: + DoOperationRequest request_; + std::vector inputIovs_; + struct LocalOperationResult { + std::vector resultScalars; + std::vector resultVectors; + std::vector resultMatrices; + }; + std::vector localResults_; + friend class ParameterClient2; +}; + +struct ParameterSegments { + std::string name; // name of the parameter + size_t id; // id of the parameter +}; + +/** + * The client interface for parameter server. ParameterClient2 supports 2 modes + * for managing connections to parameter servers, in the 1st mode one connection + * is shared by 2 threads that are separately responsible for sending and + * recieving activities, in the 2nd mode one connection is owned by only one + * thread, and all the sending and recieving activities run in that single + * thread. + * TODO(yanfei): + * Additional core idea to further optimizate pserver performance is + * to do sync-sgd based parameter level instead of pserver level. 
+ * full-parallelization based parameter level for sync-sgd also can + * sense forwardbackward computation layer-by-layer for more deeper layer + * model. + * Firstly, pserver can do full-parallelization on all computation based + * parameter level instead of waiting for all gradients are finished and + * start to send back parameters value immediately if parameter is ready + * instead of waiting for all parameters value are ready + * Secondly, parameter client can write back parameters to GPU instead of + * waiting until all parameters are received to CPU host end. + */ +class ParameterClient2 : public BaseClient { + public: + /** Constructor. + * @param separate True if sending and recieving activities are separated + * into 2 threads, otherwise false. + * @param port Port number that parameter client runs on. + * @param numPorts Number of ports parameter clients occupies, + * numPorts * pserver number is the total number of + * connections the parameter client maintains. + */ + ParameterClient2(bool separate = false, + int port = FLAGS_port, + int numPorts = FLAGS_ports_num); + + ~ParameterClient2(); + + static int calcParameterBlockSize(const std::vector& parameters, + size_t serviceNum); + + public: + bool init(const std::vector& parameters); + + /// service functions + + /** + * @brief Sends the segments in parameter to parameter servers, then receives + * the response from the servers. + * @param[in] updateMode Indicates how parameters should be updated on the + * server side. + * @param[in] parameterType Type of parameter that will be sent. + * @param[in] segments Segments in the parameter that will be sent. + * @param[in] numSamples Number of samples this update is based on. + * @param[in] cost Cost of the batch, will be used to calculate global object + * value. + * @param[in] sendBackParameter True if the updated parameters should be sent + * back, otherwise false. + * @param[in] sendBackParameterType Send back parameter type on pserver, + * PARAMETER_VALUE by default + * @param[in] recvParameterType pserver[sendBackParameterType] will be copy to + * client[recvParameterType] + * @note Only parameterType will be sent. + */ + void sendAndReceiveParameter(ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& segments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + ParameterType recvParameterType); + + /** + * @brief Sends all parameters to parameter servers, and receives the response + * from the servers. + */ + void sendAndReceiveParameter( + ParameterUpdateMode updateMode, + ParameterType parameterType, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType = PARAMETER_VALUE, + ParameterType recvParameterType = PARAMETER_VALUE) { + sendAndReceiveParameter(updateMode, + parameterType, + allSegments_, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, + recvParameterType); + } + + /** + * @brief Sends the segments in parameter to parameter servers. Each + * sendParameter() must be paired with a recvParameter() in the future. + * Only parameterType will be sent. + * + * @param[in] updateMode Indicates how parameters should be updated on the + * server side. + * @param[in] parameterType Type of parameter that will be sent. + * @param[in] segments Segments in the parameter that will be sent. + * @param[in] numSamples Number of samples this update is based on. 
+ * @param[in] cost Cost of the batch, will be used to calculate global object + * value. + * @param[in] sendBackParameter True if the updated parameters should be sent + * back, otherwise false. + * @param[in] batchStatus Status of the batch. + * @note This function is non-blocking. This means that parameter should + * not change between this call and recvParameter() + */ + void sendParameter(ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& segments, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus); + + void recvParameter(); + + /** + * Sends all parameters to parameter servers, recvParameter() have to be + * invoked + * afterwards. + * + * @note This function is non-blocking. This means that if parameter should + * not changes between this call and recvParameter() + */ + void sendParameter(ParameterUpdateMode updateMode, + ParameterType parameterType, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { + sendParameter(updateMode, + parameterType, + allSegments_, + numSamples, + cost, + sendBackParameter, + batchStatus); + } + + /// Get all parameters from parameter servers + void getParameter(ParameterType recvParameterType = PARAMETER_VALUE, + ParameterType sendBackParameterType = PARAMETER_VALUE) { + sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true, // sendBackParameter = true + sendBackParameterType, + recvParameterType); + } + + /// Get parameters by sparse row ids from parameter servers + void getParameterSparse( + ParameterType recvParameterType = PARAMETER_VALUE, + ParameterType sendBackParameterType = PARAMETER_VALUE) { + sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true, // sendBackParameter = true + sendBackParameterType, + recvParameterType); + } + + /// Set all parameters on parameter servers using the local parameters + void setParameter() { + sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + false); // sendBackParameter = false + } + /** + * Set all parameters on parameter servers, values will be zero + * means do not sending local parameters + */ + void setParameterZero() { + sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + false); // sendBackParameter = false + } + + /** + * @brief Wait until all gradient servers start one pass. + * + * @note This is now only used by the gradient servers for "sgd" + * algorithm. Calling this function means that the calling gradient + * server is ready to start a new pass. + */ + void waitPassStart(); + + /** + * @brief Wait until all gradient servers finish one pass. + * + * @note This is now only used by the gradient servers for "sgd" algorithm. + * Calling this function means that the calling gradient server + * finishes one pass. + */ + void waitPassFinish(); + + /// Wait until all gradient servers call this function. + void synchronize(SyncObject syncObjectId = SYNC_DEFAULT); + + /// Called when async-sgd finish pass. + void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT); + + void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) { + return synchronize(syncObjectId); + } + + /** + * @brief Execute the prepared operations on pservers, fetch the results and + * aggregate results from different pservers. 
+ * @param[in] ops Prepared operations that will be executed on pservers. + * @param[in] waitForGradient If true, wait for gradient to be ready before + * starting the operations. + * @param[in] sendBackParameter If true, send back the parameter to clients + * after the operations are finished. + * @param[in] If true, and if all clients call waitPassFinish, signal all + * clients finish the pass. + */ + void doOperation(PreparedOperations& ops, + bool waitForGradient, + bool sendBackParameter, + bool releasePass = true); + + /** + * Set the configuration of pserver, including parameter config and + * optimization config + */ + void setConfig(const OptimizationConfig& optConfig, + const std::string& saveDir = "", + bool isSparseServer = false); + + /// Return true if all pservers are in the given status + bool inStatus(PServerStatus status); + bool isPassFinish() { return passFinish_; } + + /// Set pserver status + void setStatus(PServerStatus status); + + /** + * @brief Wait until all pservers are at status + * @note This function is not suitable for frequent use, + * because it sleeps 1 second each time when condition is satisfied. + */ + void waitForStatus(PServerStatus status); + + /// Create a column vector. The size is the dimension of parameter. + PServerVector createVector(); + + /// Release the PServerVector given handle. + void releaseVector(PServerVector handle); + + /** + * Create a column major matrix. The number of rows is the dimension of + * parameter. The number of columns is specifed by numCols. + */ + PServerMatrix createMatrix(int32_t numCols); + + /// Release the PServerMatrix given handle. + void releaseMatrix(PServerMatrix handle); + + // Some basic algebra functions + /// Calculate the dot product of u and v + real vectorDotProduct(PServerVector u, PServerVector v); + + /// Scale u by a + void vectorScale(PServerVector u, real a); + + /// Copy from src to dest + void vectorCopy(PServerVector src, PServerVector dst); + + /// u += v * a + void vectorAddMult(PServerVector u, PServerVector v, real a); + + /// u = v + w * a + void vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, + real a); + /// u = v * a + void vectorScaleInto(PServerVector u, PServerVector v, real a); + + /// Return pserver parameter value. + PServerVector getPServerParameterValue() { + PServerVector vec; + vec.handle = PARAMETER_VALUE; + return vec; + } + + /// Return pserver parameter gradient. + PServerVector getPServerParameterGradient() { + PServerVector vec; + vec.handle = PARAMETER_GRADIENT; + return vec; + } + + /** + * Tell pservers to load value vector from file. + * + * @param[in] dirName The directory that contains the value vector file. + */ + void loadValueVector(const std::string& dirName); + + /// Tell pservers to save value vector to file. 
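/**
 * [Illustrative usage sketch, not part of the original patch.] Typical use of
 * the pserver vector algebra API declared above, assuming an init()-ed and
 * connected client:
 * @code
 * PServerVector u = client.createVector();
 * PServerVector v = client.createVector();
 * client.vectorAddMult(u, v, (real)0.5);     // u += 0.5 * v on the pservers
 * real dot = client.vectorDotProduct(u, v);  // aggregated over all pservers
 * client.releaseVector(v);
 * client.releaseVector(u);
 * @endcode
 */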
+ void saveValueVector(const std::string& dirName); + + void setTrainerId(int trainerId) { trainerId_ = trainerId; } + +#ifndef PADDLE_DISABLE_TIMER + void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; } +#endif + + protected: + template + void multiCall(const char* funcName, + const ProtoIn& request, + std::vector* responses) { + responses->resize(clients_.size()); + size_t numClients = clients_.size(); + for (size_t i = 0; i < numClients; ++i) { + clients_[i].send(funcName, request); + } + for (size_t i = 0; i < numClients; ++i) { + clients_[i].recv(&(*responses)[i]); + } + } + + private: + void destroy(); + + /** + * @brief management function for parallelizing send/recv all connections + * to all pservers. it is called under one SyncThreadPool. it + * supports to use N thread to control M connections. the receiving + * actions can be started until all sending action to all connections + * owned by current thread are finished. Different connections + * controlled + * by different threads can transfer data asynchronously. + */ + void sendParallel(int tid, + size_t numThreads, + ParameterType recvParameterType); + /// sending thread routine for asynchronously send data + void send(int threadId); + /// receiving thread routing for asynchronously receive data + void recv(int threadId); + + /** + * @brief main routine to build data for pserver + * + * @note it can prepare different kinds of parameter type data. it can + * be regarded as layer for bridging real parameters data and + * protobuf data for communication. + * TODO(yanfei): + * can abstract additional layer to encode and decode data to/from + * protobuf data. + */ + void prepareSendData( + ParameterUpdateMode updateMode, + ParameterType parameterType, // client send type + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, // send back type in pserver + BatchStatus batchStatus, + SendJob* sendJob); + + /// start necessary threads for threadPool + void initThreads(); + + protected: + /// start port number of pserver + /// it deduce all ports for dense and sparse with some rules + int port_; + /// identify the trainer id using this client + int trainerId_; + +#ifndef PADDLE_DISABLE_TIMER + uint64_t forwardbackwordTime_; +#endif + std::mutex sparseAutoGrowthMutex_; + + /// map id to parameter used for decoding protobuf data + std::unordered_map parameterMap_; + /// segments for all parameters that needed to sync + std::vector allSegments_; + + /// module for sensing sparse parameters distribution on all pservers + std::unique_ptr sparseDistribution_; + + /// thread pool for parallelizing all connections to pservers + std::unique_ptr syncThreadPool_; + + bool passFinish_; +}; + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterServer2.cpp b/paddle/legacy/pserver/ParameterServer2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8533a322d92d292ee613d44795cf60462082a11b --- /dev/null +++ b/paddle/legacy/pserver/ParameterServer2.cpp @@ -0,0 +1,1401 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ParameterServer2.h" + +#include +#include + +#include "paddle/legacy/math/SIMDFunctions.h" +#include "paddle/legacy/parameter/AverageOptimizer.h" +#include "paddle/legacy/parameter/FirstOrderOptimizer.h" +#include "paddle/legacy/parameter/OptimizerFunctions.h" +#include "paddle/legacy/parameter/OptimizerWithRegularizer.h" +#include "paddle/legacy/parameter/ParameterOptimizer.h" +#include "paddle/legacy/parameter/ParameterUpdateFunctions.h" +#include "paddle/legacy/parameter/Regularizer.h" +#include "paddle/legacy/parameter/ThreadLocalBuffer.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/StringUtil.h" + +DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); +DEFINE_double(async_lagged_ratio_min, + 1.0, + "control config_.async_lagged_grad_discard_ratio() min value"); +DEFINE_double( + async_lagged_ratio_default, + 1.5, + "if async_lagged_grad_discard_ratio is not set in trainer_config.conf" + "use it as defalut value"); + +namespace paddle { + +const std::string ParameterServer2::kRetMsgInvalidMatrixHandle = + "Invalid matrix handle"; +const std::string ParameterServer2::kRetMsgInvalidVectorHandle = + "Invalid vector handle"; +const std::string ParameterServer2::kRetMsgUnknownOperation = + "Unknown operation"; + +ParameterServer2::ParameterServer2(const std::string& addr, + int port, + int rdmaCpu) + : ProtoServer(addr, port, rdmaCpu), + dataSize_(0), + size_(0), + gradientReadyBarrier_(FLAGS_num_gradient_servers + 1), + parameterReadyBarrier_(FLAGS_num_gradient_servers + 1), + passBarrier_(FLAGS_num_gradient_servers + 1), + numPassFinishClients_(0), + allClientPassFinish_(false), + serverId_(-1), + batchId_(-1) { + /** + * register function for remote client calling, these functions + * will be mapped to a data structure for quick looking up. each + * request from trainer can contains one function name to indicate + * remote action. this architecture looks like rpc style for pserver. 
+ */ + REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter); + REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData); + REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig); + REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus); + REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus); + REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation); + REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector); + REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector); + REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix); + REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix); + REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart); + REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish); + REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize); + REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass); + REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector); + REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector); + + /// thread pool for parallelizing some computations + if (FLAGS_pserver_num_threads > 1) { + syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false)); + } +} + +bool ParameterServer2::init() { + vectors_.resize(NUM_PARAMETER_TYPES); + configMap_.clear(); + + numSamplesProcessed_ = 0; + cost_ = 0; + char* mpienv = getenv("OMPI_COMM_WORLD_SIZE"); + if (mpienv != NULL) { + mpiSize_ = atoi(mpienv); + } else { + mpiSize_ = 1; + } + status_ = PSERVER_STATUS_NOT_SET; + dataMems_.resize(FLAGS_num_gradient_servers); + synchronizeBarriers_.resize(SyncObject_ARRAYSIZE); + for (auto& barrier : synchronizeBarriers_) { + barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers)); + } + + // initialization for dicarding lagging gradient + asyncUpdateSteps_ = 0; + asyncTrainerSteps_.resize(FLAGS_num_gradient_servers); + asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0); + asyncLaggedGradientsNum_ = 0; + asyncUpdateStat_.resize(static_cast(FLAGS_num_gradient_servers * + FLAGS_async_lagged_ratio_default)); + asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0); + asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers); + asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0); + asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers); + asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0); + + return true; +} + +void ParameterServer2::getStatus(const GetStatusRequest& request, + ProtoResponseCallback callback) { + (void)request; + GetStatusResponse response; + response.set_status(status_); + callback(response); +} + +void ParameterServer2::setStatus(const SetStatusRequest& request, + ProtoResponseCallback callback) { + status_ = request.status(); + SetStatusResponse response; + callback(response); +} + +void ParameterServer2::setConfig(const SetConfigRequest& request, + ProtoResponseCallback callback) { + { + std::lock_guard guard(parameterMutex_); + + serverId_ = request.server_id(); + isSparseServer_ = request.is_sparse_server(); + + if (!request.save_dir().empty()) { + mkDir(request.save_dir().c_str()); + } + + for (const auto& config : request.param_configs()) { + CHECK(!configMap_.count(config.para_id())) + << "Duplicated parameter name: " << config.name(); + configMap_[config.para_id()] = config; + CHECK_EQ(config.sparse_remote_update(), isSparseServer_); + } + + config_ = request.opt_config(); + if (config_.algorithm() == TrainAlgorithm::AsyncSGD) { + auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio(); + if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) 
{ + LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small" + << "reset to default, async_lagged_grad_discard_ratio = " + << FLAGS_async_lagged_ratio_default; + asyncLaggedRatio = FLAGS_async_lagged_ratio_default; + } + asyncLaggedThreshold_ = + static_cast(FLAGS_num_gradient_servers * asyncLaggedRatio); + LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio + << " asyncLaggedhreshold: " << asyncLaggedThreshold_; + } + if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) { + /// sparse server must NOT use local update mode + config_.set_num_batches_per_send_parameter(1); + } + + if (config_.num_batches_per_send_parameter() > 1 && + config_.center_parameter_update_method() == "average") { + /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer + /// if parameter regularization in pserver + for (auto& pair : configMap_) { + ParameterConfig& config = pair.second; + if (config_.num_batches_per_send_parameter() == + config.num_batches_regularization()) { + real scale = + config_.delta_add_rate() * config.num_batches_regularization(); + if (config_.algorithm() == "sgd") { + scale *= FLAGS_num_gradient_servers; + } + config.set_decay_rate(config.decay_rate() * scale); + if (config.decay_rate() > 0.1f) { + LOG(FATAL) << "L2 decay=" << config.decay_rate() + << " for parameter:" << config.name() + << " is too large after scale in pserver!"; + } + config.set_decay_rate_l1(config.decay_rate_l1() * scale); + if (config.decay_rate_l1() > 0.1f) { + LOG(FATAL) << "L1 decay=" << config.decay_rate_l1() + << " for parameter:" << config.name() + << " is too large after scale in pserver!"; + } + + LOG(INFO) << "parameter:" << config.name() + << " decay apply in pserver," + << " L1 decay=" << config.decay_rate_l1() + << " L2 decay=" << config.decay_rate(); + } + } + } + } + + SetConfigResponse response; + callback(response); +} + +real bufferSum(const std::vector& buffers) { + real sum = 0; + for (const auto buffer : buffers) { + for (size_t i = 0; i < buffer.size; ++i) { + sum += buffer.base[i]; + } + } + return sum; +} + +void ParameterServer2::mergeSegments(BlockSegments* segments) { + if (segments->empty()) { + return; + } + std::sort(segments->begin(), segments->end()); + auto curr = segments->begin(); + for (auto it = segments->begin(); it != segments->end(); ++it) { + if (it->first <= curr->second) { + curr->second = std::max(curr->second, it->second); + } else { + ++curr; + *curr = *it; + } + } + ++curr; + segments->erase(curr, segments->end()); +} + +void ParameterServer2::setParameter(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)response; + (void)outputBuffers; + LOG(INFO) << "pserver: setParameter"; + std::lock_guard guard(parameterMutex_); + + int64_t numBlocks = blockIdMap_.size(); + CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size()); + /// total bytes for all the added blocks + int64_t totalSize = size_; + std::vector offsets; + offsets.reserve(request.blocks_size()); + std::vector blockIds; + blockIds.reserve(request.blocks_size()); + int bufferIndex = 0; + + if (!request.blocks().size()) { + LOG(WARNING) + << "--ports_num or --ports_num_for_sparse might be too large, " + << "or total dense parameter size or sparse parameters size " + << "might be too small, this psever doesn't store any parameter."; + return; + } + + for (const auto& block : request.blocks()) { + /// block size for parameter(e.g. 
128 for sparse row, 1K for dense) + uint64_t blockSize = getParameterConfig(block).parameter_block_size(); + BlockKey key(block.para_id(), block.block_id()); + if (inputBuffers.size()) { // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO + Buffer buffer = inputBuffers[bufferIndex]; + ++bufferIndex; + CHECK_EQ(buffer.size, block.block_size()) + << "data size is too big:" + << " block_size=" << block.block_size() + << " data_size=" << buffer.size; + } + + /// add a new block + if (blockIdMap_.count(key) == 0) { + blockOffsetMap_[key] = totalSize; + blockIdMap_[key] = numBlocks; + ++numBlocks; + totalSize += blockSize; + } + offsets.push_back(blockOffsetMap_[key]); + blockIds.push_back(blockIdMap_[key]); + } + + size_ = totalSize; + LOG(INFO) << "pserver: new cpuvector: size=" << size_; + if (!vectors_[PARAMETER_VALUE]) { + /// vectors_ + const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/); + for (const auto type : types) { + vectors_[type].reset(new CpuVector(size_)); + vectors_[type]->zeroMem(); + } + + blockInfos_.resize(numBlocks); + for (auto& info : blockInfos_) { + info.lock.reset(new std::mutex()); + } + } else { + CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize()) + << "Currently adding new blocks is not supported. " + << "All blocks must be added in one setParameter call"; + } + + VectorPtr buf = vectors_[PARAMETER_VALUE]; + usedSegments_.reserve(offsets.size()); + /// if offsets is empty, means parameter_block_size is too big or too many + /// nodes. + if (offsets.empty()) { + LOG(WARNING) << "in setParameter: offsets is empty"; + } + for (size_t i = 0; i < offsets.size(); ++i) { + size_t blockId = blockIds[i]; + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(request.blocks(i)); + info.config = &config; + info.offset = offsets[i]; + info.optimizer.reset(sgdOptimizerCreate( + config_, config, config.sparse_remote_update(), true /*inPserver*/)); + if (config.sparse_remote_update()) { + size_t width = config.dims(1); + CHECK_EQ(config.parameter_block_size(), width) + << "block size: " << config.parameter_block_size() + << "width : " << width; + } + info.optimizer->init(1, info.config); + usedSegments_.push_back(std::make_pair( + offsets[i], offsets[i] + request.blocks(i).block_size())); + } + mergeSegments(&usedSegments_); + + if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) { + /// copy param from trainer + for (size_t i = 0; i < offsets.size(); ++i) { + Buffer buffer = inputBuffers[i]; + real* start = buf->getPoint(offsets[i]); + CHECK_LE(offsets[i] + buffer.size, buf->getSize()); + memcpy(start, buffer.base, sizeof(real) * buffer.size); + } + } else { + CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO); + /// nothing to do, value vector zero mem already + } +} + +void ParameterServer2::addGradient(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + VLOG(1) << "pserver: addGradient"; + + { + ReadLockGuard guard(parameterMutex_); + int bufferIndex = 0; + for (const auto& block : request.blocks()) { + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + int64_t blockId = getBlockId(block); + CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + Buffer buffer = inputBuffers[bufferIndex]; + 
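// [Illustrative sketch, standard library only.] What the statements that
// follow do per block in addGradient(): add the received gradient buffer into
// the server-side gradient sum while holding that block's lock; simd::addTo
// is replaced here by a plain loop.
#include <cstddef>
#include <mutex>
inline void accumulateBlock(float* gradientSum, const float* received,
                            std::size_t n, std::mutex& blockLock) {
  std::lock_guard<std::mutex> guard(blockLock);
  for (std::size_t i = 0; i < n; ++i) {
    gradientSum[i] += received[i];  // element-wise sum across trainers
  }
}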
++bufferIndex; + + const real* gradientBuffer = buffer.base; + real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset); + + size_t size = buffer.size; + + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + if (config.sparse_remote_update()) { + CHECK_EQ(size, config.parameter_block_size()); + } else { // dense + CHECK_LE(size, config.parameter_block_size()); + } + std::lock_guard guard(*info.lock); + simd::addTo(gradientSumBuffer, gradientBuffer, size); + } + } + if (request.batch_status() == BATCH_FINISH || + request.batch_status() == BATCH_START_AND_FINISH) { + numSamplesProcessed_ += request.num_samples(); + cost_ += request.cost(); + VLOG(1) << "num samples: " << numSamplesProcessed_ + << ", new cost:" << cost_; + + /// notify doOperation gradient ready + gradientReadyBarrier_.wait(); + + /// wait doOperation finish + parameterReadyBarrier_.wait(); + VLOG(1) << "start send back"; + } +} + +bool ParameterServer2::asyncGrdientCommitCheckAndStat( + const SendParameterRequest& request) { + const auto trainerId = request.trainer_id(); + int64_t trainerSteps = asyncTrainerSteps_[trainerId]; + CHECK_GE(asyncUpdateSteps_, trainerSteps) + << " async update steps overflows " + << " trainer id: " << trainerId + << " async update steps in pserver: " << asyncUpdateSteps_ + << " async update steps in request: " << trainerSteps; + + asyncUpdateSteps_++; + bool commitGradient = true; + + int64_t delta = asyncUpdateSteps_ - trainerSteps; + if (delta >= asyncLaggedThreshold_) { + VLOG(1) << "discard Async Update: " + << " trainer id: " << trainerId + << " pserver steps: " << asyncUpdateSteps_ + << " request steps: " << trainerSteps; + asyncLaggedGradientsNum_++; + commitGradient = false; + } + /// stat on lagged steps, to get total discard distribution + if (static_cast(delta) < asyncUpdateStat_.size()) { + asyncUpdateStat_[delta]++; + } else { + asyncUpdateStat_[asyncUpdateStat_.size() - 1]++; + } + /// stat on trainerId and discard, to get trainer condition + if (commitGradient) { + asyncTrainerCommitStat_[trainerId]++; + } else { + asyncTrainerDiscardStat_[trainerId]++; + } + + return commitGradient; +} + +static ThreadLocal> localBlockBitset_; + +void ParameterServer2::asyncSGD(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + int64_t numBlocks = blockIdMap_.size(); + auto& localBlockBitset = *localBlockBitset_; + + if (isSparseServer_) { + if (localBlockBitset.empty()) { + localBlockBitset.resize(numBlocks); + } + localBlockBitset.assign(numBlocks, false); + } + + ReadLockGuard guard(parameterMutex_); + + if (request.send_back_parameter()) { + outputBuffers->reserve(request.blocks_size()); + } + + bool commitGradient = asyncGrdientCommitCheckAndStat(request); + + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + size_t bufferIndex = 0; + for (const auto& block : request.blocks()) { + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + int64_t blockId = getBlockId(block); + CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + Buffer buffer = inputBuffers[bufferIndex]; + ++bufferIndex; + + size_t size = buffer.size; + + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + + 
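// [Illustrative sketch, standard library only.] The commit/discard rule applied
// by asyncGrdientCommitCheckAndStat() above: a gradient is dropped once the
// pserver has advanced asyncLaggedThreshold or more update steps past the step
// the trainer computed it from.
#include <cstdint>
inline bool shouldCommitGradient(std::int64_t pserverSteps,
                                 std::int64_t trainerSteps,
                                 std::int64_t laggedThreshold) {
  std::int64_t staleness = pserverSteps - trainerSteps;  // how lagged it is
  return staleness < laggedThreshold;  // >= threshold => discard as too lagged
}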
std::lock_guard guard(*info.lock); + /// gradients are too obsolete, will be discarded + if (commitGradient) { + info.optimizer->startBatch(numSamplesProcessed_); + + for (const auto type : info.optimizer->getParameterTypes()) { + vecs[type]->subVecFrom(*vectors_[type], offset, size); + } + vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size); + info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1); + + if (auto callback = info.optimizer->needSpecialTraversal(config)) { + blockTraverse(info, config, offset, size, vecs, callback); + } + info.optimizer->finishBatch(); + } + + if (commitGradient && isSparseServer_) { + localBlockBitset[blockId] = true; + } + + if (!isSparseServer_ && request.send_back_parameter()) { // dense + int type = request.send_back_parameter_type(); + sendBackParameter(block, type, response, &buffer, outputBuffers); + } + } /// foreach block + + asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_; + + if (commitGradient && isSparseServer_) { + /// find blocks that trainer do not request update + for (int64_t blockId = 0; blockId < numBlocks; ++blockId) { + if (localBlockBitset[blockId]) { + continue; + } + + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = *info.config; + size_t size = config.parameter_block_size(); + + std::lock_guard guard(*info.lock); + info.optimizer->startBatch(numSamplesProcessed_); + if (auto callback = info.optimizer->needSpecialTraversal(config)) { + blockTraverse(info, config, info.offset, size, vecs, callback); + } + info.optimizer->finishBatch(); + } + } + + if (commitGradient && (request.batch_status() == BATCH_FINISH || + request.batch_status() == BATCH_START_AND_FINISH)) { + numSamplesProcessed_ += request.num_samples(); + } + + /// show some performance log if needed + if (request.trainer_id() == 0) { + /// batchId_ is approximately equal to "real batchId_" + batchId_++; + } +} + +void ParameterServer2::getParameter(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + LOG(INFO) << "pserver: getParameter"; + ReadLockGuard guard(parameterMutex_); + for (const auto& block : request.blocks()) { + int type = request.send_back_parameter_type(); + sendBackParameter(block, type, response, outputBuffers); + } +} + +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} + +void ParameterServer2::sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + std::vector* outputBuffers) { + ParameterBlock* returnBlock = response->add_blocks(); + returnBlock->set_para_id(block.para_id()); + returnBlock->set_block_id(block.block_id()); + returnBlock->set_begin_pos(block.begin_pos()); + 
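// [Illustrative sketch, standard library only; a minimal stand-in for
// getBlockOffset().] sendBackParameter() locates each block through the
// (para_id, block_id) -> offset map filled in setParameter(); returning -1
// here for an unknown block mirrors the CHECK_GE(offset, 0) guards.
#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>
using IllustrativeBlockKey = std::pair<std::size_t, std::uint64_t>;
inline std::int64_t lookupBlockOffset(
    const std::map<IllustrativeBlockKey, std::int64_t>& blockOffsetMap,
    std::size_t paraId, std::uint64_t blockId) {
  auto it = blockOffsetMap.find({paraId, blockId});
  return it == blockOffsetMap.end() ? -1 : it->second;
}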
returnBlock->set_block_size(block.block_size()); + + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + real* valueBuffer = vectors_[parameterType]->getPoint(offset); + outputBuffers->push_back({valueBuffer, (size_t)block.block_size()}); +} + +void ParameterServer2::sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + std::vector* outputBuffers) { + ParameterBlock* returnBlock = response->add_blocks(); + returnBlock->set_para_id(block.para_id()); + returnBlock->set_block_id(block.block_id()); + returnBlock->set_begin_pos(block.begin_pos()); + returnBlock->set_block_size(block.block_size()); + + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + size_t size = buffer->size; + real* valueBuffer = vectors_[parameterType]->getPoint(offset); + /// copy to second buffer to avoid to be polluted by other request + memcpy(buffer->base, valueBuffer, sizeof(real) * size); + outputBuffers->push_back({buffer->base, size}); +} + +void ParameterServer2::sendBackParameterSparse( + const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + size_t width, + std::vector* outputBuffers) { + ParameterBlock* returnBlock = response->add_blocks(); + returnBlock->set_para_id(block.para_id()); + returnBlock->set_block_id(block.block_id()); + returnBlock->set_begin_pos(block.begin_pos()); + returnBlock->set_block_size(block.block_size()); + int64_t offset = getBlockOffset(block); + CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " + << " id=" << block.para_id() + << " block id=" << block.block_id(); + + real* valueBuffer = vectors_[parameterType]->getPoint(offset); + CHECK_EQ(buffer->size, width); + memcpy(buffer->base, valueBuffer, width * sizeof(real)); + outputBuffers->push_back(*buffer); +} + +void ParameterServer2::readAllBlocks( + MsgReader* msgReader, std::vector* buffers) { + auto& buffer = *readWriteBuffer_; + size_t numBlocks = msgReader->getNumBlocks(); + buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real), + numBlocks); + std::vector bufs(numBlocks); + buffers->clear(); + buffers->reserve(numBlocks); + buffer.resetAlignAlloc(); + for (size_t i = 0; i < numBlocks; ++i) { + size_t len = msgReader->getBlockLength(i); + CHECK_EQ(len % sizeof(real), (size_t)0); + size_t size = len / sizeof(real); + bufs[i] = buffer.nextBlock(size); + buffers->push_back({(real*)bufs[i], size}); + } + msgReader->readBlocks(bufs); +} + +void ParameterServer2::sendParameter(const SendParameterRequest& request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback) { + SendParameterResponse response; + std::vector inputBuffers; + std::vector outputBuffers; + readAllBlocks(msgReader.get(), &inputBuffers); + msgReader.reset(); + + switch (request.update_mode()) { + case PSERVER_UPDATE_MODE_SET_PARAM: + case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: + setParameter(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_GET_PARAM: + getParameter(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: + getParameterSparse(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_ASYNC_SGD: + asyncSGD(request, 
inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_ADD_GRADIENT: + addGradient(request, inputBuffers, &response, &outputBuffers); + break; + case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: + break; + } + switch (request.update_mode()) { + case PSERVER_UPDATE_MODE_ADD_GRADIENT: + (*requestVec_).push_back(request); + (*callbackVec_).push_back(callback); + if (request.batch_status() == BATCH_FINISH || + request.batch_status() == BATCH_START_AND_FINISH) { + for (size_t i = 0; i < (*requestVec_).size(); i++) { + ReadLockGuard guard(parameterMutex_); + SendParameterRequest& request = (*requestVec_)[i]; + SendParameterResponse responseTemp; + + std::vector outputIovs; + if (request.send_back_parameter()) { + CHECK(!isSparseServer_); + std::vector outputBuffersTemp; + for (const auto& block : request.blocks()) { + int type = request.send_back_parameter_type(); + sendBackParameter(block, type, &responseTemp, &outputBuffersTemp); + } + outputIovs.reserve(outputBuffersTemp.size()); + for (auto buffer : outputBuffersTemp) { + outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); + } + } + + ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i]; + callbackTemp(responseTemp, outputIovs); + } + (*requestVec_).clear(); + (*callbackVec_).clear(); + } + break; + case PSERVER_UPDATE_MODE_SET_PARAM: + case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: + case PSERVER_UPDATE_MODE_GET_PARAM: + case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: + case PSERVER_UPDATE_MODE_ASYNC_SGD: + case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: + std::vector outputIovs; + outputIovs.reserve(outputBuffers.size()); + for (auto buffer : outputBuffers) { + outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); + } + callback(response, outputIovs); + break; + } +} + +template +void ParameterServer2::reduceAndSendData(const SendDataRequest& request, + std::unique_ptr& msgReader, + ProtoResponseCallbackEx& callback) { + SendDataResponse response; + response.set_type(request.type()); + response.set_server_id(serverId_); + + auto sendData = reinterpret_cast(dataMems_[0].get()->getBuf()); + size_t rawMemSize = dataMems_[0].get()->getSize(); + CHECK_EQ(rawMemSize % sizeof(Dtype), 0U); + size_t dataMemSize = rawMemSize / sizeof(Dtype); + for (size_t i = 1; i < dataMems_.size(); ++i) { + CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize); + auto data = reinterpret_cast(dataMems_[i].get()->getBuf()); + for (size_t j = 0; j < dataMemSize; ++j) { + sendData[j] += data[j]; + } + } + std::vector outputIovs; + auto block = response.add_blocks(); + outputIovs.push_back({sendData, rawMemSize}); + block->set_total_size(rawMemSize); + block->set_data_size(sizeof(Dtype)); + callback(response, outputIovs); +} + +void ParameterServer2::templateReduceSum(const SendDataRequest& request, + std::unique_ptr& msgReader, + ProtoResponseCallbackEx& callback) { + const auto& block = request.blocks(0); + switch (block.data_type()) { + case TRANS_FLOAT: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_DOUBLE: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_INT32: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_UINT32_T: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_INT64_T: + reduceAndSendData(request, msgReader, callback); + break; + case TRANS_UINT64_T: + reduceAndSendData(request, msgReader, callback); + break; + default: + LOG(FATAL) << "not supported"; + break; + } +} + +void ParameterServer2::sendData(const SendDataRequest& 
request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback) { + SendDataResponse response; + response.set_type(request.type()); + response.set_server_id(serverId_); + + switch (request.update_mode()) { + case DATA_UPDATE_MODE_SET_OWN: { + CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size())); + size_t totalLen = msgReader->getTotalLength(); + if (totalLen > 0) { + CHECK_EQ(msgReader->getNumBlocks(), 1U) + << "Only one block currently support now!"; + const auto& block = request.blocks(0); + if (0 == dataSize_) { + dataSize_ = block.data_size(); + } else { + CHECK_EQ(dataSize_, block.data_size()); + } + int64_t serverId = request.server_id(); + if (serverId_ < 0) { + serverId_ = serverId; + } else { + CHECK_EQ(serverId_, serverId); + } + int64_t clientId = request.client_id(); + dataMems_[clientId] = std::make_shared(totalLen); + CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); + msgReader->readNextBlock(dataMems_[clientId].get()->getBuf()); + } + msgReader.reset(); + std::vector outputIovs; + callback(response, outputIovs); + break; + } + case DATA_UPDATE_MODE_GET_ALL: { + /// Currently only support DATA_REDUCE_SUM + /// And their Operations are just add + CHECK(DATA_REDUCE_SUM == request.type()); + templateReduceSum(request, msgReader, callback); + break; + } + default: { LOG(FATAL) << "not supported"; } + } +} + +void ParameterServer2::clearUnusedSegments(CpuVector* vec) { + real* data = vec->getData(); + if (usedSegments_.empty()) { + return; + } + memset(data, 0, sizeof(real) * usedSegments_[0].first); + memset(data + usedSegments_.back().second, + 0, + sizeof(real) * (size_ - usedSegments_.back().second)); + size_t n = size_ - usedSegments_.back().second; + + for (size_t i = 1; i < usedSegments_.size(); ++i) { + memset( + data + usedSegments_[i - 1].second, + 0, + sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second)); + n += usedSegments_[i].first - usedSegments_[i - 1].second; + } +} + +void ParameterServer2::parallelExecForEachBlock(ExecFunc func) { + SyncThreadPool::execHelper( + syncThreadPool_.get(), [&](int tid, size_t numThreads) { + int64_t numBlocks = blockIdMap_.size(); + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + for (int64_t blockId = tid; blockId < numBlocks; + blockId += numThreads) { + func(blockId, vecs); + } + }); +} + +void ParameterServer2::blockTraverse( + BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, + const VectorPtr vecs[], + const ParameterOptimizer::TraverseCallback& callback) { + /// setup sub bufs + for (const auto type : info.optimizer->getParameterTypes()) { + vecs[type]->subVecFrom(*vectors_[type], offset, size); + } + callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU); +} + +void ParameterServer2::op_SGD(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + if (allClientPassFinish_) { + /// when all clients signal pass finished, the update + /// is empty. + return; + } + + { + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + int64_t offset = info.offset; + size_t size = config.parameter_block_size(); + + info.optimizer->startBatch(numSamplesProcessed_); + + for (const auto type : info.optimizer->getParameterTypes()) { + vecs[type]->subVecFrom(*vectors_[type], offset, size); + } + info.optimizer->update( + vecs, config, config.sparse_remote_update() ? 
0 : -1LU); + vecs[PARAMETER_GRADIENT]->zeroMem(); + + if (auto callback = info.optimizer->needSpecialTraversal(config)) { + blockTraverse(info, config, offset, size, vecs, callback); + } + info.optimizer->finishBatch(); + }); + } + + batchId_++; +} + +void ParameterServer2::op_start_pass(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + info.optimizer->startPass(); + }); +} + +void ParameterServer2::op_finish_pass(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + size_t size = config.parameter_block_size(); + + /// catch up with + if (auto callback = info.optimizer->startCatchUpWith()) { + blockTraverse(info, config, info.offset, size, vecs, callback); + info.optimizer->finishCatchUpWith(); + } + + /// finish pass + info.optimizer->finishPass(); + }); + batchId_ = 0; +} + +void ParameterServer2::op_apply(const Operation& operation, + OperationResult* result) { + (void)operation; + (void)result; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + int64_t offset = info.offset; + size_t size = config.parameter_block_size(); + + // catch up with + if (auto callback = info.optimizer->startCatchUpWith()) { + blockTraverse(info, config, offset, size, vecs, callback); + info.optimizer->finishCatchUpWith(); + } + + // apply to PARAMETER_APPLY + if (auto callback = info.optimizer->apply()) { + blockTraverse(info, config, offset, size, vecs, callback); + } + }); +} + +void ParameterServer2::op_randomize(const Operation& operation, + OperationResult* result) { + LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_; + + CpuVector& valueVec = *vectors_[PARAMETER_VALUE]; + + parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { + BlockInfo& info = blockInfos_[blockId]; + const ParameterConfig& config = getParameterConfig(blockId); + size_t size = config.parameter_block_size(); + + vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size); + Parameter::randomize(vecs[PARAMETER_VALUE], config); + }); +} + +void ParameterServer2::loadValueVector(const LoadValueRequest& request, + ProtoResponseCallback callback) { + LoadValueResponse response; + LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_; + + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); + std::string filename = request.dir_name() + buf; + + std::ifstream fs(filename, std::ios_base::binary); + CHECK(fs) << "Fail to open " << filename; + + CpuVector& vec = *vectors_[PARAMETER_VALUE]; + Parameter::Header header; + CHECK(fs.read(reinterpret_cast(&header), sizeof(header))) + << "Fail to read parameters in pserver"; + CHECK(Parameter::isHeaderFormatSupported(header.format)) + << "Incorrect format version: " << header.format; + CHECK_EQ(header.size, (size_t)size_) + << "The size (" << header.size << ") in the file does not match the size " + << "(" << size_ << ") of the pserver: " << serverId_; + CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize " + << header.valueSize; + 
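+  /// the on-disk layout (written by saveValueVector()) is a Parameter::Header
+  /// followed by header.size raw values of type real.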
CHECK(fs.read(reinterpret_cast(vec.getData()), + header.size * sizeof(real))); + + callback(response); +} + +void ParameterServer2::saveValueVector(const SaveValueRequest& request, + ProtoResponseCallback callback) { + SaveValueResponse response; + LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_; + + mkDir(request.dir_name().c_str()); + + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); + std::string filename = request.dir_name() + buf; + + std::ofstream fs(filename, std::ios_base::binary); + CHECK(fs) << "Fail to open " << filename; + + CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY] + : *vectors_[PARAMETER_VALUE]; + Parameter::Header header; + // TODO(TJ): save param headerFormat_ + header.format = PARAM_FORMAT_ORIGINAL; + header.valueSize = sizeof(real); + header.size = size_; + + CHECK_EQ(header.size, vec.getSize()); + + CHECK(fs.write(reinterpret_cast(&header), sizeof(header))) + << "Fail to write parameter in pserver: " << serverId_; + + CHECK(fs.write(reinterpret_cast(vec.getData()), + header.size * sizeof(real))) + << "Fail to write parameter in pserver: " << serverId_; + + callback(response); +} + +void ParameterServer2::op_RESET(const Operation& operation, + OperationResult* result) { + (void)result; + CpuVector* u = vectors_[operation.pvectors(0)].get(); + u->reset(operation.scalars(0)); + clearUnusedSegments(u); +} + +void ParameterServer2::op_utv(const Operation& operation, + OperationResult* result) { + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + double sum = 0; + for (int64_t i = 0; i < size; ++i) { + sum += (double)u[i] * (double)v[i]; + } + result->add_scalars(sum); +} + +void ParameterServer2::op_au_bv(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + real a = operation.scalars(0); + real b = operation.scalars(1); + for (int64_t i = 0; i < size; ++i) { + v[i] = a * u[i] + b * v[i]; + } +} + +void ParameterServer2::op_COPY(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + for (int64_t i = 0; i < size; ++i) { + v[i] = u[i]; + } +} + +void ParameterServer2::op_au(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + int64_t size = size_; + real a = operation.scalars(0); + for (int64_t i = 0; i < size; ++i) { + u[i] *= a; + } +} + +void ParameterServer2::op_au_bv_cw(const Operation& operation, + OperationResult* result) { + (void)result; + real* u = vectors_[operation.pvectors(0)]->getData(); + real* v = vectors_[operation.pvectors(1)]->getData(); + real* w = vectors_[operation.pvectors(2)]->getData(); + int64_t size = size_; + real a = operation.scalars(0); + real b = operation.scalars(1); + real c = operation.scalars(2); + for (int64_t i = 0; i < size; ++i) { + w[i] = a * u[i] + b * v[i] + c * w[i]; + } +} + +void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation, + OperationResult* result) { + (void)result; + real* dir = vectors_[operation.pvectors(0)]->getData(); + real* grad = vectors_[operation.pvectors(1)]->getData(); + real* x = 
vectors_[operation.pvectors(2)]->getData(); + int64_t size = size_; + real l1weight = operation.scalars(0); + for (int64_t i = 0; i < size; ++i) { + if (x[i] < 0) { + dir[i] = -grad[i] + l1weight; + } else if (x[i] > 0) { + dir[i] = -grad[i] - l1weight; + } else { + if (grad[i] < -l1weight) { + dir[i] = -grad[i] - l1weight; + } else if (grad[i] > l1weight) { + dir[i] = -grad[i] + l1weight; + } else { + dir[i] = 0; + } + } + } +} + +void ParameterServer2::op_fix_dir_signs(const Operation& operation, + OperationResult* result) { + (void)result; + real* dir = vectors_[operation.pvectors(0)]->getData(); + real* steepestDescDir = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + for (int64_t i = 0; i < size; ++i) { + if (dir[i] * steepestDescDir[i] <= 0) { + dir[i] = 0; + } + } +} + +void ParameterServer2::op_fix_omega_signs(const Operation& operation, + OperationResult* result) { + (void)result; + real* x = vectors_[operation.pvectors(0)]->getData(); + real* newx = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + for (int64_t i = 0; i < size; ++i) { + if (x[i] * newx[i] < 0) { + newx[i] = 0; + } + } +} + +void ParameterServer2::op_dir_deriv(const Operation& operation, + OperationResult* result) { + real* dir = vectors_[operation.pvectors(0)]->getData(); + real* grad = vectors_[operation.pvectors(1)]->getData(); + real* x = vectors_[operation.pvectors(2)]->getData(); + int64_t size = size_; + real l1weight = operation.scalars(0); + double sum = 0; + for (int64_t i = 0; i < size; ++i) { + if (dir[i] != 0) { + if (x[i] < 0) { + sum += dir[i] * (grad[i] - l1weight); + } else if (x[i] > 0) { + sum += dir[i] * (grad[i] + l1weight); + } else if (dir[i] < 0) { + sum += dir[i] * (grad[i] - l1weight); + } else if (dir[i] > 0) { + sum += dir[i] * (grad[i] + l1weight); + } + } + } + result->add_scalars(sum); +} + +void ParameterServer2::op_cost(const Operation& operation, + OperationResult* result) { + real* x = vectors_[operation.pvectors(0)]->getData(); + real* newgrad = vectors_[operation.pvectors(1)]->getData(); + int64_t size = size_; + real l1weight = operation.scalars(0); + real l2weight = operation.scalars(1); + double cost_real = cost_ / mpiSize_; + double sum_weight_l1 = 0; + double sum_weight_l2 = 0; + for (int64_t i = 0; i < size; ++i) { + sum_weight_l1 += std::abs(x[i]); + sum_weight_l2 += x[i] * x[i]; + newgrad[i] += 2.0 * l2weight * x[i]; + } + cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2; + result->add_scalars(cost_real); +} + +ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = { + nullptr, // PSERVER_OP_utu = 0; + &ParameterServer2::op_utv, // PSERVER_OP_utv = 1; + &ParameterServer2::op_au, // PSERVER_OP_au = 2; + &ParameterServer2::op_au_bv, // PSERVER_OP_au_bv = 3; + nullptr, // PSERVER_OP_aAx_bu = 4; + &ParameterServer2::op_SGD, // PSERVER_OP_SGD = 5; + &ParameterServer2::op_RESET, // PSERVER_OP_RESET = 6; + &ParameterServer2::op_COPY, // PSERVER_OP_COPY = 7; + &ParameterServer2::op_au_bv_cw, // PSERVER_OP_au_bv_cw = 8; + &ParameterServer2::op_make_steepest_desc_dir, + /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9; + &ParameterServer2::op_fix_dir_signs, // PSERVER_OP_FIX_SIGNS = 10; + &ParameterServer2::op_dir_deriv, // PSERVER_OP_DIR_DERIV = 11; + &ParameterServer2::op_fix_omega_signs, // PSERVER_OP_FIX_OMEGA_SIGNS = 12; + &ParameterServer2::op_cost, // PSERVER_OP_COST = 13 + &ParameterServer2::op_start_pass, // PSERVER_OP_START_PASS = 14 + &ParameterServer2::op_finish_pass, // PSERVER_OP_FINISH_PASS = 15 + 
&ParameterServer2::op_randomize, // PSERVER_OP_RANDOMIZE = 16 + &ParameterServer2::op_apply, // PSERVER_OP_APPLY = 17 +}; + +void ParameterServer2::doOperation(const DoOperationRequest& request, + ProtoResponseCallback callback) { + if (request.wait_for_gradient()) { + /// wait gradient update + gradientReadyBarrier_.wait(); + allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers; + } + + DoOperationResponse response; + response.set_pass_finish(allClientPassFinish_); + + for (const auto& op : request.operations()) { + OperationResult* opResult = response.add_results(); + if (op.operation() >= ARRAYSIZE(opFuncs)) { + LOG(ERROR) << "Unknown operation " << op.operation(); + response.set_return_message(kRetMsgUnknownOperation); + } + OperatorFunction opFunc = opFuncs[op.operation()]; + if (!opFunc) { + LOG(ERROR) << "Operation not implemented: " << op.operation(); + response.set_return_message(kRetMsgUnknownOperation); + } + (this->*opFunc)(op, opResult); + } + + if (request.send_back_parameter()) { + /// clean current cost + cost_ = 0; + + if (allClientPassFinish_ && request.release_pass()) { + /// This signals that all clients finish one pass, so waitPassFinish() + /// will stop waiting. + numPassFinishClients_ = 0; + } + + /// notify addGradient() to send back parameter + parameterReadyBarrier_.wait(); + } + callback(response); +} + +void ParameterServer2::waitPassStart(const WaitPassStartRequest& request, + ProtoResponseCallback callback) { + passBarrier_.wait(); + callback(WaitPassStartResponse()); +} + +void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request, + ProtoResponseCallback callback) { + numPassFinishClients_ += 1; + + while (numPassFinishClients_ != 0) { + /// notify doOperation gradient ready + gradientReadyBarrier_.wait(); + /// wait doOperation finish + parameterReadyBarrier_.wait(); + } + + callback(WaitPassFinishResponse()); +} + +void ParameterServer2::synchronize(const SynchronizeRequest& request, + ProtoResponseCallback callback) { + synchronizeBarriers_[request.sync_object_id()]->wait(); + dataSize_ = 0; + callback(SynchronizeResponse()); +} + +void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, + ProtoResponseCallback callback) { + synchronizeBarriers_[request.sync_object_id()]->wait(); + callback(SynchronizeResponse()); + + if (request.trainer_id() == 0) { + batchId_ = 0; + } +} + +void ParameterServer2::createVector(const CreateVectorRequest& request, + ProtoResponseCallback callback) { + (void)request; + CreateVectorResponse response; + LOG(INFO) << "ParameterServer2::createVector: size=" << size_; + CpuVectorPtr vec = std::make_shared(size_); + int64_t handle = -1; + { + std::lock_guard guard(parameterMutex_); + handle = vectors_.size(); + vectors_.push_back(vec); + } + response.set_handle(handle); + callback(response); +} + +void ParameterServer2::releaseVector(const ReleaseVectorRequest& request, + ProtoResponseCallback callback) { + ReleaseVectorResponse response; + CpuVectorPtr vec; + { + std::lock_guard guard(parameterMutex_); + vec.swap(vectors_[request.handle()]); + } + callback(response); +} + +void ParameterServer2::createMatrix(const CreateMatrixRequest& request, + ProtoResponseCallback callback) { + CreateMatrixResponse response; + /// We need to create column major matrix of size_ * num_cols + /// Matrix is row majoar. Need to tranpose when use it. 
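+  /// Allocating a row-major (num_cols x size_) CpuMatrix keeps each
+  /// conceptual column of length size_ contiguous as one row.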
+ CpuMatrixPtr mat = std::make_shared(request.num_cols(), size_); + int64_t handle = -1; + { + std::lock_guard guard(parameterMutex_); + handle = matrices_.size(); + matrices_.push_back(mat); + } + response.set_handle(handle); + callback(response); +} + +void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, + ProtoResponseCallback callback) { + ReleaseMatrixResponse response; + CpuMatrixPtr mat; + { + std::lock_guard guard(parameterMutex_); + mat.swap(matrices_[request.handle()]); + } + callback(response); +} + +} // namespace paddle diff --git a/paddle/legacy/pserver/ParameterServer2.h b/paddle/legacy/pserver/ParameterServer2.h new file mode 100644 index 0000000000000000000000000000000000000000..069e730ea4ea4b253518d70142f0f242145cd326 --- /dev/null +++ b/paddle/legacy/pserver/ParameterServer2.h @@ -0,0 +1,696 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "paddle/legacy/math/Matrix.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/ParameterOptimizer.h" +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/ThreadLocal.h" + +#include "ParameterService.pb.h" + +#include "ProtoServer.h" + +DECLARE_int32(port); + +namespace paddle { + +// @TODO(yanfei): +// if armed with high density computation resource per node, pserver could also +// utilize GPU to reduce overhead. if this mechanism is used, it could pipeline +// network receiving and GPU computation to reduce the network overhead even +// further. the pipeline could help to accelerate BIG model training. +// @TODO:(yanfei) +// for cpu and less/low gpu machine, the time exhausted by forward and backward +// could be larger than optimization at pserver. However, if armed with lots of +// gpus per node and if the model size is so large enough that limited cpu +// computation causes big optmization latency, the GPU may be required by +// pserver. + +/** + * Client interface for the parameter server + * + * it implements several rpc API for remote parameter client usage. + * for sync-sgd, client needs one controller thread to build connections + * to all pservers, these controller connections do barriers + * synchronization with these connections used for transfering data. + * each data connection uses block based fine grained synchronization + * to gain better scalability. Merging gradients from different trainers + * are concurrently executed with block units, so that some network + * overhead will be hidden in merging gradient. + * for async-sgd, the difference is that pserver will do optimization + * immediately if the gradients are ready, so that pserver needs to + * prepare separate buffer to store value for sending back to trainer + * to prevent from being polluted. 
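+ * in short, the sync-sgd send-back path hands the socket layer pointers
+ * into vectors_[parameterType] directly, while the async-sgd path first
+ * copies each block into a scratch Buffer (see the sendBackParameter()
+ * overload that takes a Buffer*).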
+ */ +class ParameterServer2 : public ProtoServer { + protected: + /// parameter_ mutex. + RWLock parameterMutex_; + + typedef std::pair BlockKey; + struct BlockKeyHash { + size_t operator()(const BlockKey& key) const { + return std::hash()(key.first) + key.second; + } + }; + + // TODO(yanfei): + // if index data structure is based on parameters instead of blocks, the + // lookup performance could be better. In addition, the block memory + // access almost exhibits good locality, so index data structure and + // block data structure can be refined further, especially if gpu is used + // for pserver. + /** + * all parameters are stored in CpuVector with a blockMap_ data structure + * to index block data required by requests. + */ + typedef std::unordered_map BlockMap; + /// <(para, block), global offset(byte) in all parameters> + BlockMap blockOffsetMap_; + /// <(para, block), global idx [0, nBlocksInAllParameters]> + BlockMap blockIdMap_; + + std::vector vectors_; + std::vector matrices_; + std::vector dataMems_; + + // TODO(yanfei): + // if storing sparse_remote_update() flag in request instead of + // reading configMap_, and storing config within new block wise + // overview data structure, the config mapping, block mapping + // can be unified in single clean data structure. Use para_id + // to index parameters, use offset to index block within parameter + // and keep two index into single one. + /** + * mapping between parameter and config + * different parameter allows different config, such as decay_rate. + * for each request, it need to read config for adding gradient + * and optmization. + */ + std::unordered_map configMap_; + + /** + * to parallelize the multi-thread and multi-connnection + * computation at pserver, it use block unit to reduce + * the contention for computation, even further use block + * level optimizater control for each block for some special + * reason annotated below. + */ + struct BlockInfo { + const ParameterConfig* config; + std::unique_ptr lock; + /// global offset for all parameters + uint64_t offset; + /** + * + * Async sgd in pserver is very different from sync sgd. + * Each trainer follows startBatch, update*, finishBatch as in + * sync sgd, but all these actions are almost executed by + * multi-core and multi-thread simutaneously, so that async + * sgd optimization is based on block level in reality, then + * per block optimization is necessary indeed. In addition, + * per block optimization is also perfered for performance + * with multithreads. + */ + std::unique_ptr optimizer; + }; + std::vector blockInfos_; + + typedef std::vector> BlockSegments; + /// Because some blocks might not be fully used. We keep a + /// record of which segments are used. + BlockSegments usedSegments_; + + /// record pserver status, all status defined in ParameterService.pb + PServerStatus status_; + /// record all samples processed which could be used by optimizater + std::atomic numSamplesProcessed_; + double cost_; + int mpiSize_; + int dataSize_; + /// configuration for current parameter optimizer + OptimizationConfig config_; + + /** + * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse + * compute. And add some helper method to allocate memory aligned blocks. + * + * @param T type of element. + * @param AlignBytes the memory aligned bytes for allocated blocks. 
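+   *
+   * Typical use (variable names illustrative), mirroring readAllBlocks():
+   * @code
+   * auto& buf = *readWriteBuffer_;
+   * buf.resizeWithAlignHints(totalElements, numBlocks);
+   * buf.resetAlignAlloc();
+   * real* block = buf.nextBlock(blockElements);  // starts at an AlignBytes boundary
+   * @endcode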
+ */ + template + class ReadWriteBuffer + : public std::vector> { + public: + static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0, + "Type T must be able to aligned."); + + /** + * @brief IsTLargerThanAlign compiled time calculated constant for is type + * T larger than alignments. + */ + constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes; + + static_assert(std::is_pod::value, "T must be POD type."); + + /** + * @brief if AlignBytes > sizeof(T), then will calcuate how many elements + * can be stored in AlignBytes. + */ + constexpr static size_t AlignElementCount = AlignBytes / sizeof(T); + + static_assert(AlignElementCount == + (AlignElementCount & -AlignElementCount) || + AlignBytes > sizeof(T), + "AlignElementCount should be exp of 2"); + + /** + * @brief Resize Buffer, with block count that will be allocated. Each block + * will be memory aligned in AlignBytes. + * @param size The element count in all blocks. + * @param alignBlockCount The block count that will be allocated. + */ + void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) { + if (IsTLargerThanAlign) { //! So, each elements is memory aligned. + this->resize(size); + } else { + //! at most, we need such elements in buffer to make sure each block is + //! aligned. + this->resize(size + alignBlockCount * (AlignElementCount - 1)); + } + } + + /** + * @brief reset aligned allocate blocks. + */ + void resetAlignAlloc() { this->curOffset_ = 0; } + + /** + * @brief get next aligned block address. + * @param blockSize is the element count in each block. + * @return Aligned block address. + */ + T* nextBlock(size_t blockSize) { + T* r = &this->operator[](curOffset_); + curOffset_ += blockSize; + + if (!IsTLargerThanAlign) { + curOffset_ = + (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1); + } + return r; + } + + private: + size_t curOffset_; + }; + + /// to buffer the data from network for further processing to + /// reduce redundant memory allocation. + ThreadLocal> readWriteBuffer_; + + /// size of the parameter + int64_t size_; + + /// for synchronized training, check details in addGradient() + /// and doOperation() + ThreadBarrier gradientReadyBarrier_; + ThreadBarrier parameterReadyBarrier_; + ThreadBarrier passBarrier_; + ThreadLocal> requestVec_; + ThreadLocal> callbackVec_; + + std::atomic numPassFinishClients_; + bool allClientPassFinish_; + + std::vector> synchronizeBarriers_; + std::atomic serverId_; + + /** + * + * for lagged async gradient gradient commit control in Async Sgd. + * discard lagged gradients from too slow nodes, whose gradients + * exhibits bad quality. + * Algorithm: + * pserver: + * 1. initial asyncUpdaterSteps = 0, asyncTrainerSteps_[N] = 0. + * syncUpdaterSteps means + * the version of parameter value. + * 2. when pull arrives, record asyncUpdateSteps_ into + * syncTrainerSteps_[trainer_id] + * 3. when push arrives, compare asyncUpdateSteps_ with + * syncTrainerSteps_[trainer_id] + * if delta > threshold, discard current gradient, else commit + * gradient. + * 4. reset asyncUpdaterSteps_ and asyncTrainerSteps_[N] when pass + * finished + * Note: + * it can not discard all lag-gradient strictly in some special + * condition. part of gradients could be discarded if + * ConcurrentRemoteParameterUpdater is sed. 
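+   * in other words, a pushed gradient is committed only when
+   * asyncUpdateSteps_ - asyncTrainerSteps_[trainer_id] does not exceed
+   * asyncLaggedThreshold_; otherwise it is discarded as a lagged gradient.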
+ * this algorithm is implemented in asynSGD() + */ + int64_t asyncLaggedThreshold_; + std::atomic asyncUpdateSteps_; + std::vector asyncTrainerSteps_; + size_t asyncLaggedGradientsNum_; + /// stat all async update + std::vector asyncUpdateStat_; + /// stat per trainer_id + std::vector asyncTrainerDiscardStat_; + /// stat per trainer_id + std::vector asyncTrainerCommitStat_; + + /// only used by controller and other control cmd from trainer number 0 + std::unique_ptr syncThreadPool_; + + /// pserver for sparse remote update parameters + bool isSparseServer_; + + /// barrier performance tuning sync-sgd required + std::atomic batchId_; + + public: + struct Buffer { + real* base; + size_t size; + }; + + protected: + /// async gradient commit control + bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); + + public: + /// disable default parameter for overloading + /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N) + /// -1 means using TCP transport instead of RDMA + ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1); + + ~ParameterServer2() {} + + static const std::string kRetMsgInvalidMatrixHandle; + static const std::string kRetMsgInvalidVectorHandle; + static const std::string kRetMsgUnknownOperation; + + /// service functions + template + void reduceAndSendData(const SendDataRequest& request, + std::unique_ptr& msgReader, + ProtoResponseCallbackEx& callback); + + void templateReduceSum(const SendDataRequest& request, + std::unique_ptr& msgReader, + ProtoResponseCallbackEx& callback); + + /** + * @brief framework for sending parameters + * + * @note different parameter data type can be sent to pserver. + * in most case, the api is used to send gradients from + * trainer to pserver. + * it also can be used to retrieve parameters from pserver + */ + void sendParameter(const SendParameterRequest& request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback); + + void sendData(const SendDataRequest& request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback); + + /** + * @brief send config to pserver + * + * @note it can help pserver to understand the configuration for + * optimization, + * logging control, duplicated initialization, etc. + */ + void setConfig(const SetConfigRequest& request, + ProtoResponseCallback callback); + + /** + * @brief get status for pserver + * + * @note used to check if parameters are ready at pserver + */ + void getStatus(const GetStatusRequest& request, + ProtoResponseCallback callback); + + /** + * @brief set status for pserver + * + * @note used to check if parameters are ready at pserver, since parameters + * at pserver are initialized by trainer + */ + void setStatus(const SetStatusRequest& request, + ProtoResponseCallback callback); + + /** + * @brief framework for doing some operation at pserver end + * + * @note if sync-sgd is used, controller will calling op_SGD action + * for gradient optimization. + * check avaiable operations in opFuncs[] + */ + void doOperation(const DoOperationRequest& request, + ProtoResponseCallback callback); + + /// Create a column vector. The size is the dimension of parameter + void createVector(const CreateVectorRequest& request, + ProtoResponseCallback callback); + + void releaseVector(const ReleaseVectorRequest& request, + ProtoResponseCallback callback); + + /// Create a column major matrix. The number of rows is the dimension of + /// parameter. The number of columns is specifed by num_cols. 
+ void createMatrix(const CreateMatrixRequest& request, + ProtoResponseCallback callback); + + void releaseMatrix(const ReleaseMatrixRequest& request, + ProtoResponseCallback callback); + /** + * @brief stateful control for indicationg sync pass start + * + * @note it is valuable for logging and state control, + * especially for sync-sgd control + */ + void waitPassStart(const WaitPassStartRequest& request, + ProtoResponseCallback callback); + + /** + * @brief stateful control for indicationg sync pass end + * + * @note it is valuable for logging and state control, + * especially for sync-sgd control + */ + void waitPassFinish(const WaitPassFinishRequest& request, + ProtoResponseCallback callback); + + /** + * @brief synchronize all distributed trainers + * + * @note it's general api for synchronizing trainer and pserver + */ + void synchronize(const SynchronizeRequest& request, + ProtoResponseCallback callback); + + /** + * @brief stateful control for indicating async pass is finished + * + * @note it is valuable for logging control, state reset, etc. + */ + void asyncFinishPass(const SynchronizeRequest& request, + ProtoResponseCallback callback); + + void loadValueVector(const LoadValueRequest& request, + ProtoResponseCallback callback); + + void saveValueVector(const SaveValueRequest& request, + ProtoResponseCallback callback); + + public: + /** + * @brief initialize parameter server + */ + bool init(); + + /** + * @brief set parameters at pserver + * + * @note do parameter initialization if neccessy. + */ + void setParameter(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers); + + /** + * @brief receive gradients and do optimization for async-sgd + * + * @note this api asynchronizately receives all data from all + * trainers, and immediately do optimization and return + * optimizated value for trainer. + * this above routine are block based atomic updating, + * which means different block could based different stale + * gradient. + * it will discard some lagged gradients by default for + * better convergence. + */ + void asyncSGD(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers); + + /** + * @brief merge gradients from all trainer + * + * @note this api use block based parallelization as fine grained + * parallelization which benifits lock contention and latency + * hidden for communication, also can harness multi-core + * efficiently. + * it also implements the synchronization for sync-sgd + */ + void addGradient(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers); + + /** + * @brief get dense parameters from pserver + * + * @note for some specified condition, trainer will get parameters from + * pservers. + * e.g. + * if all parameters are stored at perver end for big model training + * trainer can use it to retrieve all parameters if necessary. + */ + void getParameter(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers); + + /** + * @brief get sparse value from parameter server + * + * @note with sparse enabled, pservers own all latest value + * while trainer only retrieve value that only are needed. + * e.g. + * trainer will do prefetch action to retrieve necessary latest + * value from pserver for sparse calculation. 
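+   * each reply block is filled by sendBackParameterSparse(), which copies
+   * width values per block into a per-request buffer.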
+ */ + void getParameterSparse(const SendParameterRequest& request, + std::vector& inputBuffers, + SendParameterResponse* response, + std::vector* outputBuffers); + + protected: + void mergeSegments(BlockSegments* segments); + + /// set the unused segments to zero + void clearUnusedSegments(CpuVector* vec); + + // TODO(yanfei): + // if read data and do optimization interleavely block by block, + // the performance could be better for gaining less network congestion. + /// read all data from connection and store it in static pre-allocated buffer + void readAllBlocks(MsgReader* msgReader, + std::vector* buffers); + + const ParameterConfig& getParameterConfig(const ParameterBlock& block) { + CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:" + << block.para_id(); + const auto it = configMap_.find(block.para_id()); + CHECK(it != configMap_.end()) << "can not find parameter id: " + << block.para_id(); + return it->second; + } + + /// it implictly check blockOffsetMap_ while retrieving blockId + const ParameterConfig& getParameterConfig(int64_t blockId) const { + CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size()) + << "block idx out of range, id: " << blockId + << " info size: " << blockInfos_.size(); + return *(blockInfos_[blockId].config); + } + + template + bool isValidVectorHandle(int64_t handle, Response* response) { + if (handle < 0 || (size_t)handle >= vectors_.size()) { + LOG(ERROR) << "Invalid vector handle " << handle; + response->set_return_message(kRetMsgInvalidVectorHandle); + return false; + } + return true; + } + + template + bool isValidMatrixHandle(int64_t handle, Response* response) { + if (handle < 0 || (size_t)handle >= matrices_.size()) { + LOG(ERROR) << "Invalid matrix handle " << handle; + response->set_return_message(kRetMsgInvalidMatrixHandle); + return false; + } + return true; + } + + /** + * @brief get block offset + * + * @note block.begin_dim is added to the block offset. + * return -1 if block cannot be found + */ + int64_t getBlockOffset(const ParameterBlock& block) const { + BlockKey key(block.para_id(), block.block_id()); + auto it = blockOffsetMap_.find(key); + if (it == blockOffsetMap_.end()) { + return -1; + } + return it->second; + } + + /// return -1 if block cannot be found + int64_t getBlockId(const ParameterBlock& block) const { + BlockKey key(block.para_id(), block.block_id()); + auto it = blockIdMap_.find(key); + if (it == blockIdMap_.end()) { + return -1; + } + return it->second; + } + + /** + * @brief prepare data for sending back + * + * @note modify reponse and outputBuffers for sending parameter + * back to client. The buffer for socket sending uses + * vectors_[parameterType] directly + * for dense with sync-sgd + */ + void sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + std::vector* outputBuffers); + + /** + * @brief prepare data for sending back + * + * @note modify response and outputBuffers for sending parameter + * back to client. The buffer for socket sending uses buffer->base + * The parameter values are copied from vectors_[parameterType] + * to buffer->base. 
+ * for dense with async-sgd + */ + void sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + std::vector* outputBuffers); + /** + * @brief prepare data for sending back + * + * @note specified for sparse + */ + void sendBackParameterSparse(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + size_t width, + std::vector* outputBuffers); + + /** + * framework routine for block parallelization + * e.g. + * for optimization on all blocks at pserver end, this routine can facilitize + * the parallelize of do optimization on all blocks with multithreads. + */ + typedef std::function ExecFunc; + void parallelExecForEachBlock(ExecFunc func); + void blockTraverse(BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, + const VectorPtr vecs[], + const ParameterOptimizer::TraverseCallback& callback); + + public: + typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation, + OperationResult* result); + + /** + * doOperation will call following operations indirectly + * e.g. + * for sync-sgd control, the controller in remote updater will send op_SGD + * command to pserver, then send sendParameter request to pserver immediately. + * the two function at pserver end will do cooperation to achieve the sync-sgd + * gradient merge and optimization. + * the most following operations are specified for owlqn, all operations are + * under the context of doOperation function + */ + static OperatorFunction opFuncs[]; + + void op_SGD(const Operation& operation, OperationResult* result); + + void op_RESET(const Operation& operation, OperationResult* result); + + void op_utv(const Operation& operation, OperationResult* result); + + void op_au_bv(const Operation& operation, OperationResult* result); + + void op_COPY(const Operation& operation, OperationResult* result); + + void op_au(const Operation& operation, OperationResult* result); + + void op_au_bv_cw(const Operation& operation, OperationResult* result); + + void op_make_steepest_desc_dir(const Operation& operation, + OperationResult* result); + + void op_fix_dir_signs(const Operation& operation, OperationResult* result); + + void op_dir_deriv(const Operation& operation, OperationResult* result); + + void op_fix_omega_signs(const Operation& operation, OperationResult* result); + + void op_cost(const Operation& operation, OperationResult* result); + + void op_start_pass(const Operation& operation, OperationResult* result); + void op_finish_pass(const Operation& operation, OperationResult* result); + + void op_apply(const Operation& operation, OperationResult* result); + + void op_randomize(const Operation& operation, OperationResult* result); + + void op_load(const Operation& operation, OperationResult* result); + void op_save(const Operation& operation, OperationResult* result); +}; + +} // namespace paddle diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/legacy/pserver/ParameterServer2Main.cpp similarity index 100% rename from paddle/pserver/ParameterServer2Main.cpp rename to paddle/legacy/pserver/ParameterServer2Main.cpp diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/legacy/pserver/ParameterServerController.cpp similarity index 100% rename from paddle/pserver/ParameterServerController.cpp rename to paddle/legacy/pserver/ParameterServerController.cpp diff --git a/paddle/legacy/pserver/ParameterServerController.h 
b/paddle/legacy/pserver/ParameterServerController.h new file mode 100644 index 0000000000000000000000000000000000000000..b90d0cbceaa879b8cb281867b5326ff50c1e311a --- /dev/null +++ b/paddle/legacy/pserver/ParameterServerController.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ParameterServer2.h" +#include "ParameterServerConfig.pb.h" +#include "RDMANetwork.h" +#include "paddle/legacy/utils/StringUtil.h" + +namespace paddle { + +/** + * @brief ParameterServerController is used for create, init and manage multi + * parameter server instances. The num of the instances is decided by port + * num(the ports number for parameter send) and network devices configured + * by gflags or proto. + */ +class ParameterServerController final { + public: + DISABLE_COPY(ParameterServerController); + + /** + * @brief Ctor, Create a ParameterServerController from ParameterServerConfig. + */ + explicit ParameterServerController(const ParameterServerConfig& config); + + /** + * @brief Dtor. + */ + ~ParameterServerController(); + + /** + * @brief create ParameterServerController from gflags, this is used for + * compatibility with the old usage of configuration by gflags. + */ + static ParameterServerController* createFromGflags(); + + /** + * @brief create ParameterServerController with ParameterServerConfig, remove + * gflags from ParameterServer. Init all ParameterServer2 instances according + * to + * the config. + */ + static ParameterServerController* create(const ParameterServerConfig& config); + + /** + * @brief start all ParameterServer2 instances in this + * ParameterServerController. + */ + void start(); + + /** + * @brief join and wait for all ParameterServer2 instances thread in this + * ParameterServerController. + */ + void wait(); + + private: + std::vector> parameterServers_; +}; + +} // namespace paddle diff --git a/paddle/pserver/ProtoServer.cpp b/paddle/legacy/pserver/ProtoServer.cpp similarity index 100% rename from paddle/pserver/ProtoServer.cpp rename to paddle/legacy/pserver/ProtoServer.cpp diff --git a/paddle/pserver/ProtoServer.h b/paddle/legacy/pserver/ProtoServer.h similarity index 100% rename from paddle/pserver/ProtoServer.h rename to paddle/legacy/pserver/ProtoServer.h diff --git a/paddle/legacy/pserver/RDMANetwork.h b/paddle/legacy/pserver/RDMANetwork.h new file mode 100644 index 0000000000000000000000000000000000000000..c87056f72c56647c827cdbd7bdd6a992b4bb1cf6 --- /dev/null +++ b/paddle/legacy/pserver/RDMANetwork.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_DISABLE_RDMA +#include "sxi_sock.h" +#else +#define PROMPT_ERR() LOG(FATAL) << "Paddle is not compiled with rdma" +#endif +#include "paddle/legacy/utils/Logging.h" + +#include +struct sxi_sock; +struct sxi_socket; + +#ifndef MAX_VEC_SIZE +// define default MAX_VEC_SIZE +#define MAX_VEC_SIZE (1UL << 16) +#endif + +namespace paddle { +/// Namespace rdma is adaptors for sxi_sock.h. Make paddle not depend on it +/// when disable rdma support +namespace rdma { +inline int numCpus() { +#ifndef PADDLE_DISABLE_RDMA + return sxi_num_configured_cpus(); +#else + return 0; +#endif +} + +inline sxi_socket* ssocket(int cpuId) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_ssocket(cpuId); +#else + PROMPT_ERR(); +#endif +} + +inline int listen(sxi_socket* s) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_listen(s); +#else + PROMPT_ERR(); +#endif +} + +inline int bind(sxi_socket* s, const char* str) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_bind(s, str); +#else + PROMPT_ERR(); +#endif +} + +inline sxi_sock* accept(sxi_socket* s) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_accept(s); +#else + PROMPT_ERR(); +#endif +} + +inline sockaddr_in* getSourceAddress(sxi_sock* sock) { +#ifndef PADDLE_DISABLE_RDMA + return reinterpret_cast(&sock->sa); +#else + PROMPT_ERR(); +#endif +} + +inline int close(sxi_socket* sock) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_socket_close(sock); +#else + PROMPT_ERR(); +#endif +} + +inline int close(sxi_sock* sock) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_sock_close(sock); +#else + PROMPT_ERR(); +#endif +} + +inline void init() { +#ifndef PADDLE_DISABLE_RDMA + sxi_module_init(); +#else + PROMPT_ERR(); +#endif +} + +inline sxi_socket* csocket(int cpuId) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_csocket(cpuId); +#else + PROMPT_ERR(); +#endif +} + +inline ssize_t read(sxi_sock* channel, void* data, size_t len) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_read(channel, data, len); +#else + PROMPT_ERR(); +#endif +} + +inline ssize_t write(sxi_sock* channel, void* data, size_t len) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_write(channel, data, len); +#else + PROMPT_ERR(); +#endif +} + +inline ssize_t readv(sxi_sock* channel, iovec* iov, int count) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_readv(channel, iov, count); +#else + PROMPT_ERR(); +#endif +} + +inline ssize_t writev(sxi_sock* channel, iovec* iov, int count) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_writev(channel, iov, count); +#else + PROMPT_ERR(); +#endif +} + +inline sxi_sock* connect(sxi_socket* socket, const char* url) { +#ifndef PADDLE_DISABLE_RDMA + return sxi_connect(socket, url); +#else + PROMPT_ERR(); +#endif +} + +} // namespace rdma +} // namespace paddle diff --git a/paddle/legacy/pserver/SocketChannel.cpp b/paddle/legacy/pserver/SocketChannel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..79c763c62ba845067c7729eafb5b218fc7b91482 --- /dev/null +++ b/paddle/legacy/pserver/SocketChannel.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SocketChannel.h" + +#include +#include +#include +#include +#include +#include +#include "RDMANetwork.h" + +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +/** + * UIO_MAXIOV is documented in writev(2), but only + * declares it on osx/ios if defined(KERNEL) + */ +#ifndef UIO_MAXIOV +#define UIO_MAXIOV 512 +#endif + +SocketChannel::~SocketChannel() { + if (tcpRdma_ == F_TCP) + close(tcpSocket_); + else + rdma::close(rdmaSocket_); + LOG(INFO) << "destory connection in socket channel, peer = " << peerName_; +} + +size_t SocketChannel::read(void* buf, size_t size) { + size_t total = 0; + while (total < size) { + ssize_t len; + if (tcpRdma_ == F_TCP) + len = ::read(tcpSocket_, (char*)buf + total, size - total); + else + len = rdma::read(rdmaSocket_, (char*)buf + total, size - total); + + CHECK(len >= 0) << " peer=" << peerName_; + if (len <= 0) { + return total; + } + total += len; + } + return total; +} + +size_t SocketChannel::write(const void* buf, size_t size) { + size_t total = 0; + while (total < size) { + ssize_t len; + if (tcpRdma_ == F_TCP) + len = ::write(tcpSocket_, (const char*)buf + total, size - total); + else + len = rdma::write(rdmaSocket_, (char*)buf + total, size - total); + + CHECK(len >= 0) << " peer=" << peerName_; + if (len <= 0) { + return total; + } + total += len; + } + return total; +} + +template +static size_t readwritev(IOFunc iofunc, + SocketType socket, + iovec* iovs, + int iovcnt, + int maxiovs, + const std::string& peerName) { + int curIov = 0; + size_t total = 0; + + for (int i = 0; i < iovcnt; ++i) { + total += iovs[i].iov_len; + } + + size_t size = 0; + size_t curIovSizeDone = 0; + + while (size < total) { + ssize_t len = + iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs)); + CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov + << " iovCnt=" << iovcnt + << " iovs[curIov].base=" << iovs[curIov].iov_base + << " iovs[curIov].iov_len=" << iovs[curIov].iov_len; + size += len; + + /// restore iovs[curIov] to the original value + iovs[curIov].iov_base = + (void*)((char*)iovs[curIov].iov_base - curIovSizeDone); + iovs[curIov].iov_len += curIovSizeDone; + + len += curIovSizeDone; + + while (curIov < iovcnt) { + if ((size_t)len < iovs[curIov].iov_len) break; + len -= iovs[curIov].iov_len; + ++curIov; + } + if (curIov < iovcnt) { + curIovSizeDone = len; + iovs[curIov].iov_base = (void*)((char*)iovs[curIov].iov_base + len); + iovs[curIov].iov_len -= len; + } + } + return size; +} + +/// rdma::readv and rdma::writev can take advantage of RDMA blocking offload +/// transfering +size_t SocketChannel::writev(const std::vector& iovs) { + if (tcpRdma_ == F_TCP) + return readwritev(::writev, + tcpSocket_, + const_cast(&iovs[0]), + iovs.size(), + UIO_MAXIOV, + peerName_); + else + return readwritev(rdma::writev, + rdmaSocket_, + const_cast(&iovs[0]), + iovs.size(), + MAX_VEC_SIZE, + peerName_); +} + +size_t SocketChannel::readv(std::vector* iovs) { + if (tcpRdma_ == F_TCP) + return 
readwritev(::readv, + tcpSocket_, + const_cast(&(*iovs)[0]), + iovs->size(), + UIO_MAXIOV, + peerName_); + else + return readwritev(rdma::readv, + rdmaSocket_, + const_cast(&(*iovs)[0]), + iovs->size(), + MAX_VEC_SIZE, + peerName_); +} + +void SocketChannel::writeMessage(const std::vector& userIovs) { + MessageHeader header; + header.numIovs = userIovs.size(); + + std::vector iovLengths; + iovLengths.reserve(userIovs.size()); + for (auto& iov : userIovs) { + iovLengths.push_back(iov.iov_len); + } + + std::vector iovs; + iovs.reserve(userIovs.size() + 2); + iovs.push_back({&header, sizeof(header)}); + iovs.push_back({&iovLengths[0], + static_cast(sizeof(iovLengths[0]) * header.numIovs)}); + iovs.insert(iovs.end(), userIovs.begin(), userIovs.end()); + + header.totalLength = 0; + for (auto& iov : iovs) { + header.totalLength += iov.iov_len; + } + + CHECK(writev(iovs) == (size_t)header.totalLength); +} + +std::unique_ptr SocketChannel::readMessage() { + MessageHeader header; + + size_t len = read(&header, sizeof(header)); + if (len == 0) { + return nullptr; + } + + CHECK(len == sizeof(header)); + + std::unique_ptr msgReader(new MsgReader(this, header.numIovs)); + + CHECK_EQ(msgReader->getTotalLength() + sizeof(header) + + msgReader->getNumBlocks() * sizeof(size_t), + (size_t)header.totalLength) + << " totalLength=" << msgReader->getTotalLength() + << " numBlocks=" << msgReader->getNumBlocks(); + return msgReader; +} + +MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks) + : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) { + size_t size = numBlocks * sizeof(blockLengths_[0]); + CHECK(channel_->read(&blockLengths_[0], size) == size); +} + +void MsgReader::readBlocks(const std::vector& bufs) { + CHECK_LE(currentBlockIndex_ + bufs.size(), blockLengths_.size()); + std::vector iovs; + iovs.reserve(bufs.size()); + size_t totalLength = 0; + for (void* buf : bufs) { + iovs.push_back({buf, getNextBlockLength()}); + totalLength += getNextBlockLength(); + ++currentBlockIndex_; + } + + CHECK(channel_->readv(&iovs) == totalLength); +} + +void MsgReader::readNextBlock(void* buf) { + CHECK_LT(currentBlockIndex_, blockLengths_.size()); + CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); + ++currentBlockIndex_; +} + +} // namespace paddle diff --git a/paddle/legacy/pserver/SocketChannel.h b/paddle/legacy/pserver/SocketChannel.h new file mode 100644 index 0000000000000000000000000000000000000000..a7b3cd42f0aa32c3a74e14f87dbfe64d25473254 --- /dev/null +++ b/paddle/legacy/pserver/SocketChannel.h @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include + +#include +#include + +struct sxi_sock; + +namespace paddle { + +class SocketChannel; +enum ChannelType { + F_TCP = 1, + F_RDMA = 2, +}; + +/// reading a set of blocks of data from SocketChannel. 
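+/// Block lengths are read eagerly in the constructor; block payloads are
+/// consumed on demand through readBlocks() / readNextBlock().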
+class MsgReader { + public: + MsgReader(SocketChannel* channel, size_t numIovs); + ~MsgReader() { + /// ensure all data blocks have been processed + CHECK_EQ(currentBlockIndex_, blockLengths_.size()); + } + /** + * @brief number of remaining parts + */ + size_t getNumBlocks() const { + return blockLengths_.size() - currentBlockIndex_; + } + + /** + * @brief lenght of next block + */ + size_t getNextBlockLength() const { return getBlockLength(0); } + + /** + * @brief get the total length of all the remaining blocks + */ + size_t getTotalLength() const { + size_t total = 0; + for (size_t i = currentBlockIndex_; i < blockLengths_.size(); ++i) { + total += blockLengths_[i]; + } + return total; + } + + /** + * @brief Get the length for block currentBlockIndex + i + */ + size_t getBlockLength(size_t i) const { + return blockLengths_[currentBlockIndex_ + i]; + } + + /** + * @brief read blocks data and store it to buf + */ + void readBlocks(const std::vector& bufs); + void readNextBlock(void* buf); + + protected: + SocketChannel* channel_; + std::vector blockLengths_; + size_t currentBlockIndex_; +}; + +/// APIs for reading and writing byte stream data or naive iov data +/// from the APIs both RDMA and TCP exhibits byte stream style +class SocketChannel { + public: + SocketChannel(int socket, const std::string& peerName) + : tcpSocket_(socket), peerName_(peerName) { + tcpRdma_ = F_TCP; + } + SocketChannel(struct sxi_sock* socket, const std::string& peerName) + : rdmaSocket_(socket), peerName_(peerName) { + tcpRdma_ = F_RDMA; + } + + ~SocketChannel(); + + const std::string& getPeerName() const { return peerName_; } + + /** + * @brief read size bytes. + * + * @note keep reading until getting size bytes or sock is closed + * is closed + */ + size_t read(void* buf, size_t size); + + /** + * @brief write size bytes. + * + * @note keep writing until writing size bytes or sock is closed + */ + size_t write(const void* buf, size_t size); + + /** + * @brief write a set of buffers. + * + * @note keep writing until all buffers are written or sock is closed + */ + size_t writev(const std::vector& iov); + + /** + * @brief read a set of buffers. + * + * @note keep reading until all buffers are full or sock is closed. + */ + size_t readv(std::vector* iov); + + /** + * @brief write a set of buffers. + * + * @note keep writing until all buffers are passed or sock is closed + */ + void writeMessage(const std::vector& iov); + + /// return null to indicate socket is closed + std::unique_ptr readMessage(); + + protected: + struct MessageHeader { + int64_t totalLength; /// include the header + int64_t numIovs; + int64_t iovLengths[0]; + }; + + int tcpSocket_; + struct sxi_sock* rdmaSocket_; + const std::string peerName_; + enum ChannelType tcpRdma_; +}; + +} // namespace paddle diff --git a/paddle/legacy/pserver/SparseParameterDistribution.cpp b/paddle/legacy/pserver/SparseParameterDistribution.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f17b228f0e5fd33b7e7db2afe1fb9421acc69c5 --- /dev/null +++ b/paddle/legacy/pserver/SparseParameterDistribution.cpp @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/utils/Flags.h" + +#include "SparseParameterDistribution.h" + +DEFINE_bool(check_sparse_distribution_in_pserver, + false, + "check whether sparse parameter exhibts balanced distribution at " + "all pservers"); +DEFINE_bool(show_check_sparse_distribution_log, + false, + "show logs details for sparse parameter distribution in pserver"); +DEFINE_int32(check_sparse_distribution_batches, + 100, + "run sparse parameter distribution check for N batches"); +DEFINE_double( + check_sparse_distribution_ratio, + 0.6, + "if parameters dispatched to different pservers exhibit unbalanced " + " distribution for check_sparse_distribution_ratio * " + " check_sparse_distribution_batches times, crash program"); +DEFINE_double(check_sparse_distribution_unbalance_degree, + 2.0, + "the ratio of maximum data size and minimun data size for " + "different pserver"); + +namespace paddle { + +SparseParameterDistribution::SparseParameterDistribution(size_t serviceNum) { + totBytes_ = 0; + data_.resize(serviceNum); + + batchPassed_ = 0; + unbalanceCnt_ = 0; +} + +void SparseParameterDistribution::probeDistribution(int serverId, + size_t dataSize) { + if (!FLAGS_check_sparse_distribution_in_pserver || + batchPassed_ > FLAGS_check_sparse_distribution_batches) { + return; + } + + CHECK_LT((size_t)serverId, data_.size()) + << "invalid sparse parameter distribution probe"; + + data_[serverId] += dataSize; + totBytes_ += dataSize; +} + +void SparseParameterDistribution::checkAndResetDistribution() { + if (!FLAGS_check_sparse_distribution_in_pserver || + batchPassed_ >= FLAGS_check_sparse_distribution_batches) { + return; + } + + /// at runtime, prepareSendData is called by many contexts, + /// so need to check if data is avaiable. + if (!totBytes_) { + return; + } + + /// check if distribution is balanced + auto avgSize = totBytes_ / data_.size(); + auto unbalanceDegree = FLAGS_check_sparse_distribution_unbalance_degree; + for (auto& dataSize : data_) { + if (dataSize > unbalanceDegree * avgSize || + dataSize * unbalanceDegree < avgSize) { + unbalanceCnt_++; + break; + } + } + + auto printData = [&]() { + std::stringstream ss; + for (auto& dataSize : data_) { + ss << dataSize * 0.001 << "KB "; + } + ss << std::endl; + LOG(INFO) << ss.str(); + }; + + /// show all sparse data size for different pserver + if (FLAGS_show_check_sparse_distribution_log) { + LOG(INFO) << "sparse distribution:"; + printData(); + } + + totBytes_ = 0; + batchPassed_++; + + if (batchPassed_ == FLAGS_check_sparse_distribution_batches) { + LOG(INFO) << "show last parameter distribution sample:"; + printData(); + LOG(INFO) << "total unbalanced batches: " << unbalanceCnt_ + << " in passed batches: " << batchPassed_; + CHECK_LE((float)unbalanceCnt_ / (float)batchPassed_, + FLAGS_check_sparse_distribution_ratio) + << "unbalanced sparse parameter distribution for different pserver. 
" + << "it could be caused by unbalanced sparse ids distribution, try " + << "to shuffle dimensions in input samples"; + } + + std::fill(data_.begin(), data_.end(), 0); +} +} // namespace paddle diff --git a/paddle/legacy/pserver/SparseParameterDistribution.h b/paddle/legacy/pserver/SparseParameterDistribution.h new file mode 100644 index 0000000000000000000000000000000000000000..ee78029958f675d07ec0aba2d0c1ea92d664e8fd --- /dev/null +++ b/paddle/legacy/pserver/SparseParameterDistribution.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +/* + * if sparse_remote_updater is used, different ParameterServer could + * be assigned with unbalanced gradients. the parameter value from + * ParameterServer also be not balanced. the distribution of different + * dimensions of sparse ids determines the unbalanced degree of data + * distributed among all ParameterServers. Even distribution will + * benifits cluster efficiency. + * do check the unbalanced degree of gradients at runtime, crash program + * if unbalanced distribution exhibts by default. + */ +class SparseParameterDistribution { + public: + /// serviceNum means the number of ParameterServers + explicit SparseParameterDistribution(size_t serviceNum); + ~SparseParameterDistribution() {} + /// collect data + void probeDistribution(int serverId, size_t data); + void checkAndResetDistribution(); + + private: + std::vector data_; + std::atomic totBytes_; + + /// after some batches, stop to check + int batchPassed_; + + /// stat on unbalanced distribution found + int unbalanceCnt_; +}; +} // namespace paddle diff --git a/paddle/pserver/test/.gitignore b/paddle/legacy/pserver/test/.gitignore similarity index 100% rename from paddle/pserver/test/.gitignore rename to paddle/legacy/pserver/test/.gitignore diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/legacy/pserver/test/CMakeLists.txt similarity index 100% rename from paddle/pserver/test/CMakeLists.txt rename to paddle/legacy/pserver/test/CMakeLists.txt diff --git a/paddle/legacy/pserver/test/SocketTest.cpp b/paddle/legacy/pserver/test/SocketTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3a781fcbf655b554e79fc753f3409d12f10f6646 --- /dev/null +++ b/paddle/legacy/pserver/test/SocketTest.cpp @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/utils/Util.h" + +#include +#include +#include +#include +#include + +#include + +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/utils/Logging.h" + +struct MessageHeader { + int64_t dataLength; +}; + +class Thread { + public: + void start(); + virtual void run() = 0; + virtual ~Thread() {} + + protected: + std::unique_ptr thread_; +}; + +void Thread::start() { + thread_.reset(new std::thread([this]() { this->run(); })); +} + +class SocketChannel { + public: + explicit SocketChannel(int socket) : socket_(socket) {} + int getSocketFd() const { return socket_; } + uint64_t readAll(void* buf, size_t size); + uint64_t writeAll(const void* buf, size_t size); + + protected: + int socket_; +}; + +uint64_t SocketChannel::readAll(void* buf, size_t size) { + uint64_t total = 0; + while (total < size) { + int64_t len = read(socket_, (char*)buf + total, size - total); + if (len <= 0) { + return total; + } + total += len; + } + return total; +} + +uint64_t SocketChannel::writeAll(const void* buf, size_t size) { + uint64_t total = 0; + while (total < size) { + int64_t len = write(socket_, (const char*)buf + total, size - total); + if (len <= 0) { + return total; + } + total += len; + } + return total; +} + +class SocketWorker : public Thread { + public: + explicit SocketWorker(int socket) : channel_(socket) {} + virtual void run(); + + // read n bytes. + int64_t readAll(char* buf, size_t n); + + // write n bytes + + protected: + SocketChannel channel_; + std::string buffer_; +}; + +class SocketServer : public Thread { + public: + explicit SocketServer(int port) + : port_(port), socket_(0), maxPendingConnections_(100) {} + + virtual void run(); + + protected: + int port_; + int socket_; + int maxPendingConnections_; +}; + +void SocketServer::run() { + int newsockfd; + socklen_t clilen; + struct sockaddr_in serv_addr, cli_addr; + + /* First call to socket() function */ + socket_ = socket(AF_INET, SOCK_STREAM, 0); + CHECK(socket_ >= 0) << "ERROR opening socket"; + + /* Initialize socket structure */ + bzero((char*)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = INADDR_ANY; + serv_addr.sin_port = htons(port_); + + /* Now bind the host address using bind() call.*/ + CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + << "ERROR on binding"; + + /* Now start listening for the clients, here process will + * go in sleep mode and will wait for the incoming connection + */ + listen(socket_, maxPendingConnections_); + clilen = sizeof(cli_addr); + + while (true) { + /* Accept actual connection from the client */ + newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); + CHECK(newsockfd >= 0) << "ERROR on accept"; + + SocketWorker* worker = new SocketWorker(newsockfd); + worker->start(); + } +} + +void SocketWorker::run() { + MessageHeader header; + + while (true) { + int64_t n = channel_.readAll(&header, sizeof(header)); + CHECK(n == sizeof(header)) << "ERROR reading from socket"; + + buffer_.resize(header.dataLength); + n = channel_.readAll(&buffer_[0], header.dataLength); + CHECK(n == header.dataLength) << "ERROR reading from socket"; + + /* Write a response to the client */ + n = channel_.writeAll(&header, sizeof(header)); + CHECK(n == sizeof(header)) << "ERROR reading from socket"; + n = channel_.writeAll(buffer_.data(), buffer_.size()); + CHECK(n == header.dataLength) << "ERROR writing to socket"; + } +} + +class SocketClient { + public: + SocketClient(const std::string& serverAddr, 
int serverPort); + SocketChannel* getChannel() const { return channel_.get(); } + + protected: + std::unique_ptr channel_; +}; + +SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { + struct sockaddr_in serv_addr; + struct hostent* server; + + // char buffer[256]; + + /* Create a socket point */ + int sockfd = socket(AF_INET, SOCK_STREAM, 0); + CHECK(sockfd >= 0) << "ERROR opening socket"; + server = gethostbyname(serverAddr.c_str()); + CHECK(server) << "ERROR, no such host: " << serverAddr; + + bzero((char*)&serv_addr, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + bcopy((char*)server->h_addr, + (char*)&serv_addr.sin_addr.s_addr, + server->h_length); + serv_addr.sin_port = htons(serverPort); + + /* Now connect to the server */ + CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) + << "ERROR connecting"; + + channel_.reset(new SocketChannel(sockfd)); +} + +DEFINE_string(server_addr, "127.0.0.1", "Server address"); +DEFINE_int64(dim, 10000000, "Data size"); +DEFINE_int32(loop_time, 100000, "test loop time"); + +using namespace paddle; // NOLINT + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + SocketServer server(FLAGS_port); + server.start(); + sleep(1); + + SocketClient client(FLAGS_server_addr, FLAGS_port); + + SocketChannel* channel = client.getChannel(); + + MessageHeader header; + + uint64_t dataSize = FLAGS_dim * sizeof(real); + +#ifdef PADDLE_WITH_CUDA + GpuVector gpuParam(FLAGS_dim); + GpuVector gpuGrad(FLAGS_dim); +#else + CpuVector gpuParam(FLAGS_dim); + CpuVector gpuGrad(FLAGS_dim); +#endif + CpuVector cpuParam(FLAGS_dim); + CpuVector cpuGrad(FLAGS_dim); + + gpuParam.rand(); + gpuGrad.rand(); + cpuParam.rand(); + cpuGrad.rand(); + + for (int i = 0; i < FLAGS_loop_time; ++i) { + cpuGrad.copyFrom(gpuGrad); + + header.dataLength = dataSize; + CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) + << "Client write header error"; + + CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) + << "Client write data error"; + + /* Now read server response */ + CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) + << "Client read header error"; + + CHECK_EQ((uint64_t)header.dataLength, dataSize); + CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) + << "Client read data error"; + + gpuParam.copyFrom(cpuParam); + + LOG_EVERY_N(INFO, 100) << "i=" << i; + } + exit(0); +} diff --git a/paddle/legacy/pserver/test/test_ParameterServer2.cpp b/paddle/legacy/pserver/test/test_ParameterServer2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..542e80e046972be38d403bc3223f7e7fcd15e3f0 --- /dev/null +++ b/paddle/legacy/pserver/test/test_ParameterServer2.cpp @@ -0,0 +1,624 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +DECLARE_int32(num_gradient_servers); +DEFINE_string(server_addr, "127.0.0.1", "assign server address"); +DEFINE_int32(server_cpu, 0, "assign server cpu"); + +class ParameterServer2Tester : public ParameterServer2 { + public: + ParameterServer2Tester(std::string serverAddr, + int port, + int rdmaCpu = -1, + bool sepSendAndRecv = false) + : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {} + virtual ~ParameterServer2Tester() {} + void setup() { + CHECK(ParameterServer2::init()); + + parameters_.clear(); + clientConfigs_.clear(); + + clientConfigs_.resize(2); + { + ParameterConfig& config = clientConfigs_[0]; + config.set_name("para0"); + config.set_para_id(0); + config.set_size(10000); + config.set_device(-1); + config.set_learning_rate(1.0); + config.set_momentum(0.9); + } + + { + ParameterConfig& config = clientConfigs_[1]; + config.set_name("para1"); + config.set_para_id(1); + config.set_size(5000); + config.set_device(-1); + config.set_learning_rate(0.5); + config.set_momentum(0.4); + } + + for (auto& config : clientConfigs_) { + parameters_.emplace_back(new Parameter(config, /* useGpu= */ false)); + } + + size_t id = 0; + for (auto& para : parameters_) { + para->setID(id++); + } + + CHECK(client_.init(parameters_)); + OptimizationConfig optConfig; + optConfig.set_algorithm("async_sgd"); + optConfig.set_batch_size(100); + optConfig.set_learning_rate(0.1); + client_.setConfig(optConfig); + client_.setParameter(); + } + + void setConfigTest(); + void setStatusTest(); + void sendParameterTest(); + void sendDataTest(SendDataType type, size_t size); + void operationTest(); + void mergeBlockSegmentTest(); + void checkSegments(const BlockSegments& expected, const BlockSegments& segs); + void waitPassFinishTest(); + void synchronizeTest(); + + protected: + ParameterClient2 client_; + vector clientConfigs_; + vector parameters_; +}; + +std::unique_ptr g_server; + +void ParameterServer2Tester::setConfigTest() { + setup(); + + for (auto& config : clientConfigs_) { + auto it = configMap_.find(config.para_id()); + EXPECT_TRUE(it != configMap_.end()); + auto& serverConfig = it->second; + EXPECT_EQ(config.name(), serverConfig.name()); + EXPECT_EQ(config.size(), serverConfig.size()); + EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate()); + EXPECT_EQ(config.momentum(), serverConfig.momentum()); + } +} + +void ParameterServer2Tester::setStatusTest() { + setup(); + EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET)); + client_.setStatus(PSERVER_STATUS_PARAMETER_READY); + EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_); + EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY)); +} + +real sumVector(const CpuVector& vec) { + const real* data = vec.getData(); + size_t dim = vec.getSize(); + real sum = 0; + for (size_t i = 0; i < dim; ++i) { + sum += data[i]; + } + return sum; +} + +void ParameterServer2Tester::sendParameterTest() { + setup(); + + client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + false); // sendBackParameter = false + + vector parameterCopies; + + for (auto& parameter : parameters_) { + parameterCopies.emplace_back( + new Parameter(parameter->getConfig(), /* useGpu= */ false)); + parameterCopies.back() + ->getBuf(PARAMETER_VALUE) + ->copyFrom(*parameter->getBuf(PARAMETER_VALUE)); + } + + client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, + 
PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = true + + for (size_t i = 0; i != parameters_.size(); ++i) { + real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData(); + real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData(); + EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize()); + size_t size = parameters_[i]->getSize(); + real sum1 = 0, sum2 = 0; + for (size_t j = 0; j < size; ++j) { + sum1 += v1[j]; + sum2 += v2[j]; + } + EXPECT_EQ(sum1, sum2); + } +} + +void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) { + ParameterClient2 client1(true); + client1.init(parameters_); + ParameterClient2 client2(true); + client2.init(parameters_); + ParameterClient2 client3(true); + client3.init(parameters_); + + ThreadWorker worker1; + ThreadWorker worker2; + ThreadWorker worker3; + + double* testData1 = new double[size]; + double* testData2 = new double[size]; + double* testData3 = new double[size]; + double* getDataExpect = new double[size]; + double* getDataReal = new double[size]; + for (size_t i = 0; i < size; ++i) { + testData1[i] = rand(); // NOLINT TODO(yuyang18): Use rand_r instead. + testData2[i] = rand(); // NOLINT + testData3[i] = rand(); // NOLINT + getDataExpect[i] = testData1[i] + testData2[i] + testData3[i]; + } + + auto put1 = [&]() { + LOG(INFO) << "putOwnData1 start"; + client1.putOwnData(0, type, testData1, size); + LOG(INFO) << "putOwnData1 finish"; + }; + + auto get1 = [&]() { + LOG(INFO) << "sendData1 get all start"; + client1.getAllData(0, type, getDataReal, size); + for (size_t i = 0; i < size; ++i) { + CHECK_EQ(getDataReal[i], getDataExpect[i]); + } + LOG(INFO) << "sendData1 get all finish"; + }; + + auto put2 = [&]() { + LOG(INFO) << "putOwnData2 start"; + client2.putOwnData(1, type, testData2, size); + LOG(INFO) << "putOwnData2 finish"; + }; + + auto put3 = [&]() { + LOG(INFO) << "putOwnData3 start"; + client3.putOwnData(2, type, testData3, size); + LOG(INFO) << "putOwnData3 finish"; + }; + + worker1.addJob(put1); + worker1.addJob(get1); + worker2.addJob(put2); + worker3.addJob(put3); + + worker1.addJob(put1); + worker2.addJob(put2); + worker3.addJob(put3); + worker1.addJob(get1); + + worker1.wait(); + worker2.wait(); + worker3.wait(); + free(testData1); + free(testData2); + free(testData3); + free(getDataExpect); + free(getDataReal); +} + +void ParameterServer2Tester::operationTest() { + PServerVector v1, v2; + v1 = client_.createVector(); + EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle); + + v2 = client_.createVector(); + EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle); + + PreparedOperations ops; + ops.addOperation(PSERVER_OP_RESET, v1, (real)1); + ops.addOperation(PSERVER_OP_RESET, v2, (real)2); + + real res1, res2, res3; + ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1); + + ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); + ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2); + + ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); + ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3); + client_.doOperation(ops, false, false); + + EXPECT_EQ(30000, res1); + EXPECT_EQ(15000, res2); + EXPECT_EQ(0, res3); + + PServerMatrix m1, m2; + m1 = client_.createMatrix(4); + EXPECT_EQ(0, m1.handle); + m2 = client_.createMatrix(8); + EXPECT_EQ(1, m2.handle); + + // TODO(yuyang18): add tests for other operations OP_COPY, OP_au + + client_.releaseVector(v1); + client_.releaseVector(v2); + client_.releaseMatrix(m1); + client_.releaseMatrix(m2); +} + +void 
ParameterServer2Tester::checkSegments(const BlockSegments& expected, + const BlockSegments& segs) { + EXPECT_EQ(expected.size(), segs.size()); + if (expected.size() != segs.size()) { + return; + } + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(expected[i], segs[i]); + } +} + +void ParameterServer2Tester::mergeBlockSegmentTest() { + { + BlockSegments segs{{10, 20}, {30, 45}, {50, 70}}; + mergeSegments(&segs); + checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 20}}; + mergeSegments(&segs); + checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 30}}; + mergeSegments(&segs); + checkSegments({{10, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {10, 70}, {10, 30}}; + mergeSegments(&segs); + checkSegments({{10, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 35}}; + mergeSegments(&segs); + checkSegments({{10, 45}, {50, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {10, 60}}; + mergeSegments(&segs); + checkSegments({{10, 70}}, segs); + } + { + BlockSegments segs{{30, 45}, {50, 70}, {30, 47}}; + mergeSegments(&segs); + checkSegments({{30, 47}, {50, 70}}, segs); + } +} + +void ParameterServer2Tester::waitPassFinishTest() { + ParameterClient2 client1; + ParameterClient2 client2; + ParameterClient2 client3; + + ThreadWorker worker1; + ThreadWorker worker2; + ThreadWorker worker3; + + auto init1 = [&]() { + LOG(INFO) << "init1 start"; + client1.init(parameters_); + LOG(INFO) << "init1 finish"; + }; + + auto init2 = [&]() { + LOG(INFO) << "init2 start"; + client2.init(parameters_); + LOG(INFO) << "init2 finish"; + }; + + auto init3 = [&]() { + LOG(INFO) << "init3 start"; + client3.init(parameters_); + LOG(INFO) << "init3 finish"; + }; + + auto update1 = [&]() { + LOG(INFO) << "update1 start"; + client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update1 finish"; + }; + + auto wait1 = [&]() { + LOG(INFO) << "wait1 start"; + client1.waitPassFinish(); + LOG(INFO) << "wait1 finish"; + }; + + auto update2 = [&]() { + LOG(INFO) << "update2 start"; + client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update2 finish"; + }; + + auto wait2 = [&]() { + LOG(INFO) << "wait2 start"; + client2.waitPassFinish(); + LOG(INFO) << "wait2 finish"; + }; + + auto op3 = [&]() { + LOG(INFO) << "op3 start"; + PreparedOperations ops; + ops.addOperation(PSERVER_OP_SGD); + client3.doOperation(ops, + /* waitForGradient= */ true, + /* sendBackarameter= */ true); + LOG(INFO) << "op3 finish"; + }; + + worker1.addJob(init1); + worker2.addJob(init2); + worker3.addJob(init3); + + worker1.addJob(update1); + worker2.addJob(update2); + worker3.addJob(op3); + + worker3.addJob(op3); + worker3.addJob(op3); + worker2.addJob(update2); + worker2.addJob(update2); + worker1.addJob(wait1); + + worker2.addJob(wait2); + worker3.addJob(op3); + + worker1.wait(); + worker2.wait(); + worker3.wait(); + + LOG(INFO) << "Pass 1 finished"; + + worker1.addJob(update1); + worker2.addJob(update2); + worker3.addJob(op3); + + worker1.wait(); + worker2.wait(); + worker3.wait(); + + worker3.addJob(op3); + worker3.addJob(op3); + worker1.addJob(update1); + worker1.addJob(wait1); + worker2.addJob(wait2); + + worker1.wait(); + worker2.wait(); + 
worker3.wait(); + + LOG(INFO) << "Pass 2 finished"; +} + +void ParameterServer2Tester::synchronizeTest() { + ParameterClient2 client1; + ParameterClient2 client2; + + ThreadWorker worker1; + ThreadWorker worker2; + + FLAGS_log_period_server = 2; + + auto init1 = [&]() { + LOG(INFO) << "init1 start"; + client1.init(parameters_); + client1.setTrainerId(0); + LOG(INFO) << "init1 finish"; + }; + + auto init2 = [&]() { + LOG(INFO) << "init2 start"; + client2.init(parameters_); + client2.setTrainerId(1); + LOG(INFO) << "init2 finish"; + }; + + auto update1 = [&]() { + LOG(INFO) << "update1 start"; + client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update1 finish"; + }; + + auto wait1 = [&]() { + LOG(INFO) << "wait1 start"; + client1.asyncFinishPass(); + LOG(INFO) << "wait1 finish"; + }; + + auto update2 = [&]() { + LOG(INFO) << "update2 start"; + client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, + PARAMETER_VALUE, + 0, // numSamples = 0 + 0, // cost = 0 + true); // sendBackParameter = false + LOG(INFO) << "update2 finish"; + }; + + auto wait2 = [&]() { + LOG(INFO) << "wait2 start"; + client2.asyncFinishPass(); + LOG(INFO) << "wait2 finish"; + }; + + worker1.addJob(init1); + worker2.addJob(init2); + // call wait to reset some stats at pserver + worker1.addJob(wait1); + worker2.addJob(wait2); + + worker1.addJob(update1); + worker2.addJob(update2); + + worker2.addJob(update2); + worker2.addJob(update2); + worker1.addJob(wait1); + + worker2.addJob(wait2); + + worker1.wait(); + worker2.wait(); + LOG(INFO) << "Pass 1 finished"; + + worker1.addJob(update1); + worker2.addJob(update2); + + worker1.wait(); + worker2.wait(); + + worker1.addJob(update1); + worker2.addJob(update2); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(update1); + worker1.addJob(wait1); + worker2.addJob(wait2); + + worker1.wait(); + worker2.wait(); + LOG(INFO) << "Pass 2 finished"; +} + +TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); } + +TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); } + +TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); } + +TEST(ParameterServer2, operation) { g_server->operationTest(); } + +TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); } + +TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); } + +TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); } + +TEST(ParameterServer2, sendData) { + // Set gserver and pserver all 3, so that the test is sufficient. 
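  // Sketch of the arrangement built below: three ParameterServer2Tester
  // instances listen on consecutive ports, and sendDataTest() (defined above)
  // has three ParameterClient2 clients each putOwnData() their own block;
  // getAllData() with DATA_REDUCE_SUM is then expected to return the
  // element-wise sum of the three blocks.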
+ int oldFlagsPortsNUm = FLAGS_ports_num; + int oldFlagsNumGradientServers = FLAGS_num_gradient_servers; + int oldFlagsPort = FLAGS_port; + FLAGS_ports_num = 3; + FLAGS_num_gradient_servers = 3; + FLAGS_port = FLAGS_port + 1; + std::unique_ptr g_server1; + std::unique_ptr g_server2; + std::unique_ptr g_server3; + if (FLAGS_rdma_tcp == "rdma") { + g_server1.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); + g_server1->start(); + g_server2.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1)); + g_server2->start(); + g_server3.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2)); + g_server3->start(); + } else { // tcp + g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); + g_server1->start(); + g_server2.reset( + new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1)); + g_server2->start(); + g_server3.reset( + new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2)); + g_server3->start(); + } + + g_server2->init(); + g_server3->init(); + sleep(2); + g_server1->setup(); + g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24); + sleep(2); + g_server1->sendDataTest(DATA_REDUCE_SUM, 2); + sleep(2); + g_server1.reset(); + g_server2.reset(); + g_server3.reset(); + + FLAGS_ports_num = oldFlagsPortsNUm; + FLAGS_num_gradient_servers = oldFlagsNumGradientServers; + FLAGS_port = oldFlagsPort; +} + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + testing::InitGoogleTest(&argc, argv); + + FLAGS_num_gradient_servers = 2; + + if (FLAGS_rdma_tcp == "rdma") { + g_server.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); + } else { + g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); + } + + g_server->start(); + + sleep(2); + + int ret = RUN_ALL_TESTS(); + + g_server.reset(); + + exit(ret); +} diff --git a/paddle/legacy/pserver/test/test_ProtoServer.cpp b/paddle/legacy/pserver/test/test_ProtoServer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f7ab2e8af45f97a6537d41ca1afe51a4d3270b80 --- /dev/null +++ b/paddle/legacy/pserver/test/test_ProtoServer.cpp @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "ParameterService.pb.h" +#include "paddle/legacy/math/Vector.h" +#include "paddle/legacy/pserver/ProtoServer.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +DEFINE_string(server_addr, "127.0.0.1", "Server address"); +DEFINE_int64(dim, 50000000, "Data size"); +DEFINE_bool(test_proto_server, true, "whether to test ProtoServer"); +DEFINE_bool(benchmark, false, "Do benchmark. 
Skip some tests"); + +using namespace paddle; // NOLINT + +class MyServer : public ProtoServer { + public: + explicit MyServer(int port, int rdmaCpu = -1) + : ProtoServer(FLAGS_server_addr, port, rdmaCpu), + status_(PSERVER_STATUS_NOT_SET) { + REGISTER_SERVICE_FUNCTION(MyServer, getStatus); + REGISTER_SERVICE_FUNCTION(MyServer, setStatus); + REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx); + } + void getStatus(const GetStatusRequest& request, + ProtoResponseCallback callback) { + (void)request; + GetStatusResponse response; + response.set_status(status_); + callback(response); + } + + void getStatusEx(const GetStatusRequest& request, + std::unique_ptr msgReader, + ProtoResponseCallbackEx callback) { + (void)request; + GetStatusResponse response; + response.set_status(status_); + buffer_.resize(msgReader->getNextBlockLength()); + msgReader->readNextBlock(&buffer_[0]); + callback(response, {{&buffer_[0], buffer_.size()}}); + } + + void setStatus(const SetStatusRequest& request, + ProtoResponseCallback callback) { + SetStatusResponse response; + status_ = request.status(); + callback(response); + } + + protected: + PServerStatus status_; + std::string buffer_; +}; + +TEST(ProtoServer, regular) { + ProtoClient* client; + if (FLAGS_rdma_tcp == "rdma") + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); + else + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); + { + GetStatusRequest request; + GetStatusResponse response; + auto msgReader = client->sendAndRecv("getStatus", request, &response); + EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET); + EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0); + } + + { + SetStatusRequest request; + SetStatusResponse response; + request.set_status(PSERVER_STATUS_PARAMETER_READY); + client->sendAndRecv("setStatus", request, &response); + } + + { + GetStatusRequest request; + GetStatusResponse response; + client->sendAndRecv("getStatus", request, &response); + EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY); + } + + delete client; +} + +TEST(ProtoServer, extended) { +#ifdef PADDLE_WITH_CUDA + ProtoClient* client; + if (FLAGS_rdma_tcp == "rdma") + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); + else + client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); + int64_t dataSize = FLAGS_dim * sizeof(real); + + GpuVector gpuParam(FLAGS_dim); + GpuVector gpuGrad(FLAGS_dim); + CpuVector cpuParam(FLAGS_dim); + CpuVector cpuGrad(FLAGS_dim); + + gpuParam.rand(); + gpuGrad.rand(); + cpuParam.rand(); + cpuGrad.rand(); + + for (int k = 0; k < 4; ++k) { + for (int i = 0; i < 10; ++i) { + cpuGrad.copyFrom(gpuGrad); + if (FLAGS_test_proto_server) { + GetStatusRequest request; + GetStatusResponse response; + { + REGISTER_TIMER("sendAndRecv"); + auto msgReader = + client->sendAndRecv("getStatusEx", + request, + {{cpuGrad.getData(), (size_t)dataSize}}, + &response); + + EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1); + EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize); + msgReader->readNextBlock(cpuParam.getData()); + } + if (!FLAGS_benchmark) { + real* v1 = cpuGrad.getData(); + real* v2 = cpuParam.getData(); + real sum1 = 0, sum2 = 0; + for (int j = 0; j < FLAGS_dim; ++j) { + sum1 += v1[j]; + sum2 += v2[j]; + } + EXPECT_EQ(sum1, sum2); + } + } + gpuParam.copyFrom(cpuParam); + + LOG_EVERY_N(INFO, 10) << "i=" << i; + } + globalStat.printAllStatus(); + globalStat.reset(); + } + + delete client; +#endif +} + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + 
testing::InitGoogleTest(&argc, argv); + MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1); + server.start(); + usleep(10000); + + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/pserver/test/test_ProtoServer.sh b/paddle/legacy/pserver/test/test_ProtoServer.sh new file mode 100755 index 0000000000000000000000000000000000000000..1439350847308cc5590329b0fe2a6d2c77d04409 --- /dev/null +++ b/paddle/legacy/pserver/test/test_ProtoServer.sh @@ -0,0 +1,33 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +for ((port=12340;port<=12360;port++)) +do + port_used_num=`netstat -a |grep $port|wc -l` + if [ $port_used_num -eq 0 ] + then + echo $port; + legacy/pserver/test/test_ProtoServer --port=$port + if [ $? -eq 0 ] + then + exit 0 + else + echo "test_ProtoServer run wrong" + exit 1 + fi +fi +done +echo "test_ProtoServer port not found" +exit 1 diff --git a/paddle/trainer/CMakeLists.txt b/paddle/legacy/trainer/CMakeLists.txt similarity index 100% rename from paddle/trainer/CMakeLists.txt rename to paddle/legacy/trainer/CMakeLists.txt diff --git a/paddle/legacy/trainer/MergeModel.cpp b/paddle/legacy/trainer/MergeModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8a3601f192224a43687191527374149d99285ae0 --- /dev/null +++ b/paddle/legacy/trainer/MergeModel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "ParamUtil.h" +#include "Trainer.h" +#include "paddle/legacy/pserver/ParameterServer2.h" +#include "paddle/legacy/utils/PythonUtil.h" + +DEFINE_string(model_dir, "", "Directory for separated model files"); +DEFINE_string(config_file, "", "Config file for the model"); +DEFINE_string(model_file, "", "File for merged model file"); + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +int main(int argc, char** argv) { + initMain(argc, argv); + initPython(argc, argv); + + if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() || + FLAGS_model_file.empty()) { + LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 " + "--config_file=config.py --model_file=out.paddle"; + return 0; + } + + string confFile = FLAGS_config_file; +#ifndef PADDLE_WITH_CUDA + FLAGS_use_gpu = false; +#endif + auto config = std::make_shared(confFile); + unique_ptr gradientMachine(GradientMachine::create(*config)); + gradientMachine->loadParameters(FLAGS_model_dir); + + ofstream os(FLAGS_model_file); + + string buf; + config->getConfig().SerializeToString(&buf); + int64_t size = buf.size(); + os.write((char*)&size, sizeof(size)); + CHECK(os) << "Fail to write to " << FLAGS_model_file; + os.write(buf.data(), buf.size()); + vector& parameters = gradientMachine->getParameters(); + for (auto& para : parameters) { + para->save(os); + CHECK(os) << "Fail to write to " << FLAGS_model_file; + } + os.close(); + + return 0; +} diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp b/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cdd832acd16e5c259a7f6463aac537e4e6537c97 --- /dev/null +++ b/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NewRemoteParameterUpdater.h" +#include "Trainer.h" +#include "paddle/legacy/utils/Stat.h" + +DECLARE_int32(trainer_id); +DECLARE_string(save_dir); + +namespace paddle { +NewRemoteParameterUpdater::NewRemoteParameterUpdater( + const OptimizationConfig &config, const std::string pserverSpec) + : trainerConfig_(config), + parameterClient_(-1), + newParameters_(nullptr), + newGradients_(nullptr), + pserverSpec_(pserverSpec) {} + +NewRemoteParameterUpdater::NewRemoteParameterUpdater( + const OptimizationConfig &config, + const std::string pserverSpec, + const bool useEtcd) + : trainerConfig_(config), + parameterClient_(-1), + newParameters_(nullptr), + newGradients_(nullptr), + pserverSpec_(pserverSpec), + useEtcd_(useEtcd) {} + +void NewRemoteParameterUpdater::init( + const std::vector ¶meters) { + ParameterUpdater::init(parameters); + + // create parameter server client. + if (useEtcd_) { + parameterClient_ = + paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str()); + } else { + parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), + FLAGS_trainer_id == 0); + } + + // init new parameter and gradient. 
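  // (What the two initNewParameter() calls below do, per the definition in
  // NewRemoteParameterUpdater.h: each paddle_parameter's content pointer
  // aliases the corresponding PARAMETER_VALUE / PARAMETER_GRADIENT buffer of
  // the local Parameter, so no extra copy is made when exchanging data with
  // the go pserver cclient.)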
+ newParameters_ = initNewParameter(PARAMETER_VALUE); + newGradients_ = initNewParameter(PARAMETER_GRADIENT); + + // init parameter, one trainer will get the opportunity to int parameter and + // send them to parameter server. Others will get the initialized parameter + // from parameter server + if (paddle_begin_init_params(parameterClient_)) { + LOG(INFO) << "paddle_begin_init_params start"; + // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig. + // This makes golang pserver compatible with handy V1 demos. + // TODO(wuyi): Refine or remove these ugly converting lines + OptimizerConfig optimizerConfigV2; + if (trainerConfig_.learning_method() == "momentum") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + } else if (trainerConfig_.learning_method() == "adagrad") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad); + optimizerConfigV2.mutable_adagrad()->set_epsilon( + trainerConfig_.ada_epsilon()); + } else if (trainerConfig_.learning_method() == "adadelta") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad); + optimizerConfigV2.mutable_adadelta()->set_epsilon( + trainerConfig_.ada_epsilon()); + optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou()); + } else if (trainerConfig_.learning_method() == "adam") { + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam); + optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1()); + optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2()); + optimizerConfigV2.mutable_adam()->set_epsilon( + trainerConfig_.adam_epsilon()); + } else { + LOG(ERROR) << "got unsupported v1 optimizer config: " + << trainerConfig_.learning_method(); + optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + } + + if (trainerConfig_.learning_rate_schedule() == "constant") { + optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); + optimizerConfigV2.mutable_const_lr()->set_learning_rate( + trainerConfig_.learning_rate()); + } else if (trainerConfig_.learning_rate_schedule() == "linear") { + optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear); + optimizerConfigV2.mutable_linear_lr()->set_learning_rate( + trainerConfig_.learning_rate()); + optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a( + trainerConfig_.learning_rate_decay_a()); + optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b( + trainerConfig_.learning_rate_decay_b()); + } else { + LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: " + << trainerConfig_.learning_rate_schedule() << ", set to const"; + optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); + optimizerConfigV2.mutable_const_lr()->set_learning_rate( + trainerConfig_.learning_rate()); + } + + // overwrite optimizerConfigV2 for per-parameter(layer) configs + for (int i = 0; i < parameterSize(); ++i) { + // FIXME(typhoonzero): paramConfig always have default values, + // how to check if it's default? 
+ // TODO(typhoonzero): log output: optimizerConfigV2.DebugString(); + LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString(); + // send param and config to pserver + std::string bytes = optimizerConfigV2.SerializeAsString(); + const char *array = bytes.data(); + int size = (int)bytes.size(); + paddle_init_param( + parameterClient_, *newParameters_[i], (void *)array, size); + } + paddle_finish_init_params(parameterClient_); + LOG(INFO) << "paddle_begin_init_params done"; + } else { + paddle_get_params(parameterClient_, newParameters_, parameterSize()); + } + + LOG(INFO) << "NewRemoteParameterUpdater initialized"; +} + +void NewRemoteParameterUpdater::updateImpl(Parameter *para) {} + +void NewRemoteParameterUpdater::finishBatch(real cost) { + // send gradient to parameter server. + paddle_send_grads(parameterClient_, newGradients_, parameterSize()); + // get the updated parameter from parameterClient. + paddle_get_params(parameterClient_, newParameters_, parameterSize()); + + // clear gradient after update parameter. + for (auto ¶ : parameters_) { + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + } +} + +void NewRemoteParameterUpdater::startPass() {} + +bool NewRemoteParameterUpdater::finishPass() { return true; } +} // namespace paddle diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.h b/paddle/legacy/trainer/NewRemoteParameterUpdater.h new file mode 100644 index 0000000000000000000000000000000000000000..707e9ceb9b6a22d265f9bf7b02af7f3002930fd4 --- /dev/null +++ b/paddle/legacy/trainer/NewRemoteParameterUpdater.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "OptimizerConfig.pb.h" +#include "ParameterUpdater.h" +#include "libpaddle_pserver_cclient.h" +#include "paddle/legacy/pserver/ParameterClient2.h" +#include "paddle/legacy/utils/Queue.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +/** + * New remote parameter updater for dense parameters that use cclient of go. + */ +class NewRemoteParameterUpdater : public ParameterUpdater { + public: + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec); + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec, + const bool useEtcd); + ~NewRemoteParameterUpdater() { + releaseNewParameter(newParameters_); + releaseNewParameter(newGradients_); + if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_); + } + + /** + * initialize the internal parameter client and itself. + */ + virtual void init(const std::vector& parameters); + /** + * @brief start batch + * + * @note one batch training exhibits stateful feature to help + * to do performance tuning, sgd optimization if necessary. + */ + virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; } + + /** + * send parameters to pservers and get returned parameters + * from all pservers if necessary. 
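   *
   * In the implementation above this amounts to, per mini-batch (sketch):
   *   paddle_send_grads(parameterClient_, newGradients_, parameterSize());
   *   paddle_get_params(parameterClient_, newParameters_, parameterSize());
   *   // then every PARAMETER_GRADIENT buffer is zeroed for the next batch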
+ */ + virtual void finishBatch(real cost); + virtual void startPass(); + virtual bool finishPass(); + + protected: + /** + * work need to do after finishBatch + */ + virtual void updateImpl(Parameter* para); + + private: + int parameterSize() { return (int)parameters_.size(); } + + /** + * init parameter of go paddle pserver cclient. + * @param new_params + * @param type + */ + paddle_parameter** initNewParameter(ParameterType type) { + paddle_parameter** new_params = + (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize()); + for (int i = 0; i < parameterSize(); ++i) { + new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter)); + memset(new_params[i], 0, sizeof(paddle_parameter)); + } + + for (int i = 0; i < parameterSize(); ++i) { + ParameterPtr param = parameters_[i]; + new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + new_params[i]->name = (char*)param->getName().c_str(); + new_params[i]->content = + (unsigned char*)(param->getBuf(type).get()->getData()); + new_params[i]->content_len = + (int)param->getBuf(type).get()->getSize() * sizeof(real); + } + return new_params; + } + + void releaseNewParameter(paddle_parameter** newParams) { + if (newParams != nullptr) { + for (int i = 0; i < parameterSize(); ++i) { + free(newParams[i]); + } + free(newParams); + } + } + + protected: + const OptimizationConfig& trainerConfig_; + /// internal parameter client object for exchanging data with pserver + paddle_pserver_client parameterClient_; + /// the parameters for new pserver client + paddle_parameter** newParameters_; + /// the gradinets for new pserver client + paddle_parameter** newGradients_; + /// the specification of parameter server "host1:port,host1:port" + std::string pserverSpec_; + /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr + bool useEtcd_; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/ParamUtil.cpp b/paddle/legacy/trainer/ParamUtil.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b5aba32dee1d07015ae3fce1cc76242b8ae80fe5 --- /dev/null +++ b/paddle/legacy/trainer/ParamUtil.cpp @@ -0,0 +1,163 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ParamUtil.h" + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +#include "TesterConfig.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" + +namespace paddle { + +ParameterUtil::ParameterUtil( + const std::shared_ptr &config, + std::unique_ptr &&intconfig, + const GradientMachinePtr &gradientMachine, + const std::shared_ptr ¶meterUpdater) { + config_ = config; + intConfig_ = std::move(intconfig); + gserver_ = gradientMachine; + pUpdater_ = parameterUpdater; +} + +bool ParameterUtil::loadParameters(int passId, bool local, bool remote) { + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "pass-%05d", passId); + std::string doneFile = path::join(config_->getSaveDir(), buf, "done"); + if (!fileExist(doneFile.c_str())) return false; + loadParametersWithPath(path::join(config_->getSaveDir(), buf), local, remote); + return true; +} + +void ParameterUtil::loadParametersWithPath(const std::string &dir, + bool local, + bool remote) { + if (local) { + gserver_->loadParameters(dir); + } + if (remote && pUpdater_) { + pUpdater_->loadParametersRemote(dir); + } +} + +void ParameterUtil::saveParametersOnePass(int passId, int passInnerId) { + pUpdater_->apply(); + saveParameters(passId, passInnerId); + if (intConfig_->save_only_one_ && passId >= intConfig_->saving_period_) { + deleteParameters(passId - intConfig_->saving_period_); + } + pUpdater_->restore(); +} + +void ParameterUtil::saveParameters(int passId, int passInnerId) { + constexpr int kBufLen = 100; + char buf[kBufLen]; + if (passInnerId > 0) { + snprintf(buf, kBufLen, "pass-%05d-%03d", passId, passInnerId); + } else { + snprintf(buf, kBufLen, "pass-%05d", passId); + } + + std::string basePath = config_->getSaveDir(); + if (basePath.find('/') == std::string::npos) { + basePath = "./" + basePath; + } + mkDirRecursively(basePath.c_str()); + + std::string saveDir = path::join(basePath, buf); + mkDir(saveDir.c_str()); + if (!intConfig_->load_save_param_pserver_) { + pUpdater_->getParametersRemote(true /*full parameter*/, + true /*after apply*/); + } + + gserver_->saveParameters(saveDir); + if (intConfig_->load_save_param_pserver_) { + pUpdater_->saveParametersRemote(saveDir); + } + std::string doneFile = path::join(saveDir, "done"); + touchFile(doneFile.c_str()); + std::ofstream out(doneFile); + version::printVersion(out); + out.close(); + VLOG(1) << "save dir " << saveDir; + saveConfigWithPath(saveDir); +} + +void ParameterUtil::deleteParameters(int passId, int passInnerId) { + constexpr int kBufLen = 100; + char buf[kBufLen]; + const std::string &saveDir = config_->getSaveDir(); + if (passInnerId > 0) { + snprintf(buf, + kBufLen, + "%s/pass-%05d-%03d", + saveDir.c_str(), + passId, + passInnerId); + } else { + snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId); + } + mkDir(saveDir.c_str()); + LOG(INFO) << "delete dir " << buf; + rmDir(buf); +} + +void ParameterUtil::saveConfigWithPath(const std::string &path) { + std::string src; + // save config in some path + if (!intConfig_->config_.empty()) { + src = intConfig_->config_; + } else { + bool ok; + src = config_->getConfigName(&ok); + if (!ok) { + return; + } + } + copyFileToPath(src, path); + + // save other import config file name to path.txt + std::string 
ss = path::join(path, "path.txt"); + std::ofstream os(ss); + std::string fileName = path::basename(src); + CHECK(os.write(fileName.c_str(), fileName.length())) + << "Fail to write config file name " << ss; + VLOG(1) << "fileName " << fileName; + os.close(); + + // copy other import config files + for (int i = 0; i < config_->getConfig().config_files_size(); ++i) { + copyFileToPath(config_->getConfig().config_files(i), path); + } +} + +} // namespace paddle diff --git a/paddle/legacy/trainer/ParamUtil.h b/paddle/legacy/trainer/ParamUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..07786967762a7b9267d190de5275f0f94bbd21ef --- /dev/null +++ b/paddle/legacy/trainer/ParamUtil.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include + +#include "hl_gpu.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" + +#include +#include +#include "ParameterUpdater.h" +#include "TrainerConfig.pb.h" +#include "TrainerConfigHelper.h" + +namespace paddle { + +/** + * Configuration for parameter utils. + */ +struct ParameterUtilConfig { + DISABLE_COPY(ParameterUtilConfig); + + ParameterUtilConfig(bool save_only_one, + int saving_period, + bool load_save_parameters_in_pserver, + std::string config) + : save_only_one_(save_only_one), + saving_period_(saving_period), + load_save_param_pserver_(load_save_parameters_in_pserver), + config_(config) {} + + bool save_only_one_; + int saving_period_; + bool load_save_param_pserver_; + std::string config_; +}; + +/** + * ParameterUtil + * Utility class for loading and saving parameters + */ +class ParameterUtil { + public: + /** + * Ctor. + * + * @param config + * @param intconfig + * @param gradientMachine + * @param parameterUpdater + * @return + */ + ParameterUtil(const std::shared_ptr &config, + std::unique_ptr &&intconfig, + const GradientMachinePtr &gradientMachine, + const std::shared_ptr ¶meterUpdater); + + /// Load parameter from the saved parameter file as pass passId + /// if loadsave_parameters_in_pserver is set, some parameters MUST + /// load in pserver, which is "remote". + /// loadParameters can choose to load local/remote parameter, or both. 
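  /// A sketch of the on-disk layout this relies on (see the implementation in
  /// ParamUtil.cpp above): for passId = 7 the file
  ///   <save_dir>/pass-00007/done
  /// must exist, otherwise false is returned; parameters are then loaded from
  /// <save_dir>/pass-00007/. Illustrative call site:
  ///   if (!paramUtil->loadParameters(7)) {
  ///     LOG(FATAL) << "no saved parameters for pass 7";
  ///   }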
+ bool loadParameters(int passId, bool local = true, bool remote = false); + + /// load parameters given path info + void loadParametersWithPath(const std::string &dir, + bool local = true, + bool remote = false); + + /// Save parameter to dist for pass passId + /// passInnerId means saving times in one pass, some users want to + /// save parameters when have processed some batches in one pass + /// passInnerId = 0 means do not need to save in one inner pass + void saveParameters(int passId, int passInnerId = 0); + + /// save parameters for one pass, when passInnerId > 0 means saving + /// the passInnerId times in one pass + void saveParametersOnePass(int passId, int passInnerId = 0); + + /// delete parameter from disk via passId + void deleteParameters(int passId, int passInnerId = 0); + + /// save config given path info + void saveConfigWithPath(const std::string &path); + + /** + * Try to load parameter from config. + * @return true if can load from trainer config. + */ + inline bool tryLoadParametersFromConfig() { + auto &c = config_->getConfig(); + if (!c.init_model_path().empty()) { + loadParametersWithPath(c.init_model_path()); + return true; + } else if (c.start_pass() > 0) { + CHECK(loadParameters(c.start_pass() - 1)); + return true; + } else { + return false; + } + } + + private: + std::shared_ptr config_; + std::unique_ptr intConfig_; + GradientMachinePtr gserver_; + std::shared_ptr pUpdater_; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/ParameterUpdater.cpp b/paddle/legacy/trainer/ParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..549fb0332da78053a261928b5558beb1ffbc79c5 --- /dev/null +++ b/paddle/legacy/trainer/ParameterUpdater.cpp @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ParameterUpdater.h" + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/utils/Thread.h" + +namespace paddle { + +static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1; +static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2; + +SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager( + const OptimizationConfig& optConfig) + : SgdLocalUpdater(optConfig, false /*with averager*/) { + CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu()); + averager_.reset(AverageOptimizer::create(optConfig, + new DummyOptimizer(optConfig), + false /*sparse*/, + true /*apply*/)); + updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); }); +} + +void SgdUpdaterWithCpuAverager::init( + const std::vector& parameters) { + SgdLocalUpdater::init(parameters); + averager_->init(parameters_.size(), nullptr); + copyEvents_.resize(parameters_.size()); + for (auto& parameter : parameters) { + SetDevice device(parameter->getDeviceId()); + cpuParameters_.emplace_back(new Parameter(parameter->getConfig(), + /* useGpu= */ false, + /* doInit= */ false)); + if (parameter->useGpu()) { + cpuParameters_.back()->enableType(PARAMETER_APPLY); + } else { + cpuParameters_.back()->enableSharedType( + PARAMETER_APPLY, parameter->getBuf(PARAMETER_VALUE)); + } + for (ParameterType type : averager_->getParameterTypes()) { + cpuParameters_.back()->enableType(type); + } + + hl_create_event(©Events_[nonStaticParaIDMap_[parameter->getID()]]); + } +} + +SgdUpdaterWithCpuAverager::~SgdUpdaterWithCpuAverager() { + for (auto& event : copyEvents_) { + hl_destroy_event(event); + } +} + +void SgdUpdaterWithCpuAverager::updateImpl(Parameter* para) { + SgdLocalUpdater::updateImpl(para); + + if (para->useGpu()) { + size_t pid = nonStaticParaIDMap_[para->getID()]; + Parameter* cpuPara = cpuParameters_[pid].get(); + cpuPara->getBuf(PARAMETER_VALUE) + ->copyFrom(*para->getBuf(PARAMETER_VALUE), kDeviceToHostStream); + hl_stream_record_event(kDeviceToHostStream, copyEvents_[pid]); + } + + updateWorker_.addJob( + std::bind(&SgdUpdaterWithCpuAverager::updateFunc, this, para)); +} + +void SgdUpdaterWithCpuAverager::updateFunc(Parameter* para) { + SetDevice setDevice(para->getDeviceId()); + size_t pid = nonStaticParaIDMap_[para->getID()]; + Parameter* cpuPara = cpuParameters_[pid].get(); + if (para->useGpu()) { + hl_event_synchronize(copyEvents_[pid]); + } + averager_->update(cpuPara->getBufs(), cpuPara->getConfig(), -1LU); +} + +void SgdUpdaterWithCpuAverager::finishBatch(real cost) { + SgdLocalUpdater::finishBatch(cost); + + updateWorker_.wait(); + for (auto para : cpuParameters_) { + if (auto callback = averager_->needSpecialTraversal(para->getConfig())) { + callback(para->getBufs(), para->getConfig(), -1LU); + } + } + averager_->finishBatch(); +} + +void SgdUpdaterWithCpuAverager::apply() { + // backup gpu value + for (auto& para : parameters_) { + SetDevice setDevice(para->getDeviceId()); + para->getBuf(PARAMETER_GRADIENT) + ->copyFrom(*para->getBuf(PARAMETER_VALUE), kHostToDeviceStream); + } + + // apply on cpu parameter + if (auto callback = averager_->apply()) { + for (auto para : cpuParameters_) { + callback(para->getBufs(), para->getConfig(), -1LU); + } + } + + // copy to gpu value + for (auto& para : parameters_) { + SetDevice setDevice(para->getDeviceId()); + size_t pid = nonStaticParaIDMap_[para->getID()]; + Parameter* cpuPara = cpuParameters_[pid].get(); + if (parameters_[pid]->useGpu()) { + para->getBuf(PARAMETER_VALUE) + ->copyFrom(*cpuPara->getBuf(PARAMETER_APPLY), kHostToDeviceStream); + } + } + 
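  // Editorial note: at this point the averaged weights (PARAMETER_APPLY on the
  // CPU mirrors) have been queued for copy into PARAMETER_VALUE on the device,
  // while the pre-average device weights were backed up into PARAMETER_GRADIENT
  // above; that backup is what restore() below copies back.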
hl_stream_synchronize(kHostToDeviceStream); + for (auto& para : parameters_) { + para->setValueUpdated(); + } +} + +void SgdUpdaterWithCpuAverager::restore() { + // restore on cpu parameter + if (auto callback = averager_->restore()) { + for (auto para : cpuParameters_) { + callback(para->getBufs(), para->getConfig(), -1LU); + } + } + + // restore gpu value + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(PARAMETER_VALUE)->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + para->setValueUpdated(); + } +} + +} // namespace paddle diff --git a/paddle/legacy/trainer/ParameterUpdater.h b/paddle/legacy/trainer/ParameterUpdater.h new file mode 100644 index 0000000000000000000000000000000000000000..acddc3702d78fdb198973f70a8642c5192af992b --- /dev/null +++ b/paddle/legacy/trainer/ParameterUpdater.h @@ -0,0 +1,265 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Thread.h" +#include "paddle/legacy/utils/Util.h" + +#include "paddle/legacy/parameter/AverageOptimizer.h" +#include "paddle/legacy/parameter/FirstOrderOptimizer.h" +#include "paddle/legacy/parameter/OptimizerFunctions.h" +#include "paddle/legacy/parameter/OptimizerWithRegularizer.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/ParameterUpdaterBase.h" + +#include "TrainerConfig.pb.h" +#include "paddle/legacy/gserver/layers/Layer.h" + +#include +#include + +namespace paddle { + +/** + * @brief Parameter Updater for SGD, and local(not cluster) run. + */ +class SgdLocalUpdater : public ParameterUpdater { + public: + /** + * @brief Ctor. Initialize optimizer locally by optConfig. + * @param optConfig optimization config. + * @param withAverager with average optimizer or not, default is true. + */ + explicit SgdLocalUpdater(const OptimizationConfig& optConfig, + bool withAverager = true) + : numSamplesProcessed_(0) { + auto baseOptimizer = ParameterOptimizer::create(optConfig); + optimizer_.reset(withAverager + ? AverageOptimizer::create(optConfig, baseOptimizer) + : baseOptimizer); + CHECK(optimizer_) << "fail to create optimizer: " + << optConfig.learning_method(); + auto types = optimizer_->getParameterTypes(); + for (auto type : types) { + addParameterType(type); + } + } + + /** + * @brief Initialize parameters and optimizer_. + * For example, + * If optimizer need hassien vector, then parameter's hassien will + * be initialized. + * @param parameters The parameter need to be initialized. 
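   * @note Editorial note: init() also rejects any parameter whose config sets
   *       decay_rate_l1 > 0, because SgdLocalUpdater does not implement L1
   *       decay (see the CHECK in the body below).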
+ */ + virtual void init(const std::vector& parameters) { + ParameterUpdater::init(parameters); + optimizer_->init(parameters_.size(), nullptr); + // check no L1 decay in parameter configs + CHECK(std::find_if(parameters.begin(), + parameters.end(), + [](const ParameterPtr& para) { + return para->getConfig().decay_rate_l1() > 0.0f; + }) == parameters.end()) + << "SgdLocalUpdater cannot support L1 decay in parameter"; + } + + /** + * @brief Start a batch with current mini-batch size + * @param current mini-batch size. + * @return Always PASS_TRAIN. + */ + virtual PassType startBatch(int64_t batchSize) { + numSamplesProcessed_ += batchSize; + optimizer_->startBatch(numSamplesProcessed_); + return PASS_TRAIN; + } + + /** + * @brief finish a mini-batch. + */ + virtual void finishBatch(real cost) { optimizer_->finishBatch(); } + + /** + * @brief start a pass. + */ + virtual void startPass() { optimizer_->startPass(); } + + /** + * @brief finish a pass. + * @param cost sum cost during one pass. + * @return true if accept (used for owlqn). + */ + virtual bool finishPass() { + optimizer_->finishPass(); + return ParameterUpdater::finishPass(); + } + + /** + * @brief apply model average. + */ + virtual void apply() { + if (auto callback = optimizer_->apply()) { + for (auto para : parameters_) { + SetDevice device(para->getDeviceId()); + callback(para->getBufs(), para->getConfig(), -1UL); + } + } + } + + /** + * @brief restore parameter value before model average + */ + virtual void restore() { + if (auto callback = optimizer_->restore()) { + for (auto para : parameters_) { + SetDevice device(para->getDeviceId()); + callback(para->getBufs(), para->getConfig(), -1UL); + } + } + } + + protected: + /** + * @brief update method. Update value from gradient. + * @param para parameter that will be updated. + */ + virtual void updateImpl(Parameter* para) { + optimizer_->update(para->getBufs(), para->getConfig()); + if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) { + callback(para->getBufs(), para->getConfig(), -1UL); + } + + para->setValueUpdated(); + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + } + + std::unique_ptr optimizer_; + + /** + * @brief total number of samples processed. + */ + int64_t numSamplesProcessed_; +}; + +/** + * @brief SgdCpuUpdater is used only in recursive neural network + * @deprecated + */ +class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated { + public: + explicit SgdCpuUpdater(const OptimizationConfig& optConfig) + : SgdLocalUpdater(optConfig), + Deprecated( + "SgdCpuUpdater is used only in recursive neural network, " + "and recursive neural network is deprecated in paddle. " + "Use it all by your own.") {} + + /** + * @brief update all parameter on finish batch. + * @param cost + */ + virtual void finishBatch(real cost) { + for (auto para : parameters_) { + SgdLocalUpdater::update(para.get()); + } + optimizer_->finishBatch(); + } + + protected: + /** + * @brief do nothing. + * @param para + */ + virtual void updateImpl(Parameter* para) {} +}; + +/** + * @brief Sgd Local Updater With average in cpu. + * + * It will do model average in cpu to reduce gpu memory comsuption. + */ +class SgdUpdaterWithCpuAverager : public SgdLocalUpdater { + public: + /** + * @brief Ctor. + * + * SgdUpdaterWithCpuAverager will do everything as a + * SgdLocalUpdater, then copy parameter from GPU to CPU, and do model + * average in cpu. 
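   *
   * A rough sketch of the per-parameter pipeline as implemented in this patch:
   * updateImpl() runs the normal SGD step on the device, queues an async
   * VALUE copy to the CPU mirror on kDeviceToHostStream, records an event,
   * and posts averager_->update() as a job on updateWorker_; finishBatch()
   * then waits for updateWorker_ before calling averager_->finishBatch().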
+ */ + explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig); + ~SgdUpdaterWithCpuAverager(); + + /** + * @brief init. Initialize cpu parameters, model average optimizer. + * @param parameters + */ + virtual void init(const std::vector& parameters); + + virtual PassType startBatch(int64_t batchSize) { + averager_->startBatch(-1UL); + return SgdLocalUpdater::startBatch(batchSize); + } + virtual void finishBatch(real cost); + + virtual void startPass() { + averager_->startPass(); + SgdLocalUpdater::startPass(); + } + virtual bool finishPass() { + averager_->finishPass(); + return SgdLocalUpdater::finishPass(); + } + + /// apply the averaged parameter to PARAMETER_VALUE + /// use PARAETER_GRADIENT for backing up PARAMETER_VALUE + virtual void apply(); + + /** + * @brief Restore parameter before apply(). + */ + virtual void restore(); + + protected: + virtual void updateImpl(Parameter* para); + + void updateFunc(Parameter* para); + + protected: + std::unique_ptr averager_; + + /** + * @brief The thread worker which do model average. + * + * For each parameter, GPU->CPU parameter is async, and do model average in + * another thread. Because the training process don't need model average while + * training, and model average only used in evaluation stage and saving stage. + * So the model average is totally async. + */ + ThreadWorker updateWorker_; + + /** + * @brief The parameter mirror in cpu. + */ + std::vector cpuParameters_; + + /** + * @brief GPU -> CPU copy event. Model average will wait after copy done. + */ + std::vector copyEvents_; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.cpp b/paddle/legacy/trainer/RemoteParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5de1cc7827aa8f219de60fe9da67fbb0595eb1d5 --- /dev/null +++ b/paddle/legacy/trainer/RemoteParameterUpdater.cpp @@ -0,0 +1,843 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "RemoteParameterUpdater.h" +#include "Trainer.h" +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/Stat.h" + +DECLARE_int32(trainer_id); +DECLARE_string(save_dir); + +namespace paddle { + +static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1; +static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2; +static const int kFinishBatchPid = -1; + +const std::string RemoteParameterUpdater::kAverage = "average"; +const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average"; + +RemoteParameterUpdater::RemoteParameterUpdater( + const OptimizationConfig& config, + int expectedPassCount, + std::unique_ptr&& localUpdater) + : config_(config), + localUpdater_(std::move(localUpdater)), + numBatches_(0), + passCount_(0), + expectedPassCount_(expectedPassCount), + separateSendAndRecv_(false), + isFirstPass_(true), + useApplyInPserver_(false) { + addParameterType(PARAMETER_MOMENTUM); +} + +void RemoteParameterUpdater::init(const std::vector& parameters) { + ParameterUpdater::init(parameters); + + if (localUpdater_) { + localUpdater_->init(parameters); + + for (auto& parameter : parameters) { + parameter->enableType(PARAMETER_DELTA); + } + + CHECK(config_.center_parameter_update_method() == kAverage || + config_.center_parameter_update_method() == kElasticAverage) + << "unknown center_parameter_update_method"; + + // modify delta_add_rate + CHECK_GT(FLAGS_num_gradient_servers, 1) + << "FLAGS_num_gradient_servers should be set in trainer args."; + real delta_add_rate = config_.delta_add_rate() / FLAGS_num_gradient_servers; + config_.set_delta_add_rate(delta_add_rate); + LOG(INFO) << "center parameter in pserver," + << " modify delta_add_rate=" << delta_add_rate; + } + + if (!FLAGS_use_gpu) { + cpuParameters_ = parameters; + } else { + for (auto& parameter : parameters) { + cpuParameters_.emplace_back(new Parameter(parameter->getConfig(), + /* useGpu= */ false)); + cpuParameters_.back()->setID(parameter->getID()); + if (localUpdater_) { + cpuParameters_.back()->enableType(PARAMETER_DELTA); + } + } + } + + parameterClient_.reset(new ParameterClient2(separateSendAndRecv_)); + parameterClient_->init(cpuParameters_); + parameterClient_->setTrainerId(FLAGS_trainer_id); + + if (FLAGS_trainer_id == 0) { + parameterClient_->setConfig(config_); + copyParametersFromDevice(PARAMETER_VALUE); + parameterClient_->setParameter(); + parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY); + } else { + parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY); + parameterClient_->getParameter(); + copyParametersToDevice(PARAMETER_VALUE); + } + if (FLAGS_trainer_id == 0 && + (config_.algorithm() != TrainAlgorithm::AsyncSGD)) { + startController(); + useApplyInPserver_ = useApplyInPserver(config_); + } +} + +void RemoteParameterUpdater::startController() { + controllerThread_.reset(new std::thread([this]() { this->controller(); })); +} + +void RemoteParameterUpdater::controller() { + ParameterClient2 client(false); + client.init(cpuParameters_); + while (true) { + /*start pass*/ { + client.waitPassStart(); + + PreparedOperations ops; + ops.addOperation(PSERVER_OP_START_PASS); + client.doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false, + /* releasePass= */ false); + } + + while (true) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_SGD); + client.doOperation(ops, + /* waitForGradient= */ true, + /* sendBackarameter= */ true, + /* releasePass= */ false); + if (client.isPassFinish()) { + break; + } + } + + 
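    // Editorial note: each controller iteration covers one pass: wait for the
    // trainers at waitPassStart(), issue PSERVER_OP_START_PASS, keep driving
    // PSERVER_OP_SGD rounds until the client reports the pass finished, and
    // finally release the pass with PSERVER_OP_FINISH_PASS below.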
/*finish pass*/ { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_FINISH_PASS); + client.doOperation(ops, + /* waitForGradient= */ true, + /* sendBackarameter= */ true, + /* releasePass= */ true); + } + + passCount_++; + if (passCount_ == expectedPassCount_) { + break; + } + } +} + +void RemoteParameterUpdater::copyParametersToDevice( + ParameterType parameterType) { + if (!FLAGS_use_gpu) { + return; + } + int numParameters = cpuParameters_.size(); + for (int i = 0; i < numParameters; ++i) { + parameters_[i] + ->getBuf(parameterType) + ->copyFrom(*cpuParameters_[i]->getBuf(parameterType)); + if (parameterType == PARAMETER_VALUE) { + parameters_[i]->setValueUpdated(); + } + } +} + +void RemoteParameterUpdater::copyParametersFromDevice( + ParameterType parameterType) { + if (!FLAGS_use_gpu) { + return; + } + int numParameters = cpuParameters_.size(); + for (int i = 0; i < numParameters; ++i) { + cpuParameters_[i] + ->getBuf(parameterType) + ->copyFrom(*parameters_[i]->getBuf(parameterType)); + } +} + +void RemoteParameterUpdater::updateImpl(Parameter* para) { + REGISTER_TIMER("update"); + if (localUpdater_) { + localUpdater_->update(para); + } +} + +void RemoteParameterUpdater::finishBatch(real cost) { + if (localUpdater_) { + localUpdater_->finishBatch(cost); + } + + const std::string& algorithm = config_.algorithm(); + ParameterUpdateMode mode; + if (algorithm == TrainAlgorithm::AsyncSGD) { + mode = PSERVER_UPDATE_MODE_ASYNC_SGD; + } else if (algorithm == TrainAlgorithm::SGD) { + mode = PSERVER_UPDATE_MODE_ADD_GRADIENT; + } else { + LOG(FATAL) << "Unknown algorithm: " << algorithm; + } + + ParameterType sendType; + bool sendBackParameter = true; + if (localUpdater_) { + ++numBatches_; + if (numBatches_ % config_.num_batches_per_send_parameter() != 0) { + return; + } + + if (config_.center_parameter_update_method() == kElasticAverage) { + parameterClient_->getParameter(PARAMETER_DELTA); + copyParametersToDevice(PARAMETER_DELTA); + sendBackParameter = false; // no need send back after send + + // calc delta + for (auto& para : parameters_) { + // DELTA = LOCAL_VALUE - CENTER_VALUE/*store in DELTA*/ + para->getBuf(PARAMETER_DELTA) + ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f); + + // when delta send to pserver, pserver will do: + // CENTER_VALUE += alpha * (LOCAL_VALUE - CENTER_VALUE) + } + } else { + // calc delta + for (auto& para : parameters_) { + // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/ + para->getBuf(PARAMETER_DELTA) + ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f); + } + } + + sendType = PARAMETER_DELTA; + + } else { + // In this case, we perform SGD on pserver. 
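    // Editorial note: with no local updater the raw gradients are shipped on
    // every batch and the optimizer step happens on the pservers; the trainer
    // then zeroes its gradient buffers at the end of this function.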
+ sendType = PARAMETER_GRADIENT; + } + + copyParametersFromDevice(sendType); + + { + REGISTER_TIMER("sendAndRecv_dense"); + parameterClient_->sendAndReceiveParameter(mode, + sendType, + batchSize_, + 0, // cost = 0 + sendBackParameter); + } + + if (sendBackParameter) { + copyParametersToDevice(PARAMETER_VALUE); + } + + if (localUpdater_) { + if (config_.center_parameter_update_method() == kElasticAverage) { + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + // LOCAL_VALUE += -alpha * (LOCAL_VALUE - CENTER_VALUE) + para->getBuf(PARAMETER_VALUE) + ->add(*para->getBuf(PARAMETER_DELTA), -config_.delta_add_rate()); + } + + } else { // average + // copy value to delta + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); + } + } + } else { + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(sendType)->zeroMem(); + } + } +} + +void RemoteParameterUpdater::startPass() { + if (config_.algorithm() == TrainAlgorithm::SGD) { + parameterClient_->waitPassStart(); + } else { + // sync could benifits reducing lagged trainer for async-sgd + // even if sync could not remove all lagged trainer for the + // sake of file loading, buffer etc. + parameterClient_->asyncStartPass(); + } + + if (localUpdater_) { + localUpdater_->startPass(); + numBatches_ = 0; + + if (config_.center_parameter_update_method() == kElasticAverage) { + if (!isFirstPass_) { + // restore local value from delta + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(PARAMETER_VALUE) + ->copyFrom(*para->getBuf(PARAMETER_DELTA)); + } + } + } else { // average + // copy value to delta + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); + } + } + } +} + +bool RemoteParameterUpdater::finishPass() { + if (localUpdater_) { + localUpdater_->finishPass(); + } + + if (config_.algorithm() == TrainAlgorithm::SGD) { + parameterClient_->waitPassFinish(); + } else { + parameterClient_->asyncFinishPass(); + } + if (localUpdater_) { + if (config_.center_parameter_update_method() == kElasticAverage) { + // backup local value to delta as we will get + // the remote parameter for saving/testing + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); + } + } + } + parameterClient_->getParameter(); + copyParametersToDevice(PARAMETER_VALUE); + + isFirstPass_ = false; + return true; +} + +void RemoteParameterUpdater::apply() { + if (useApplyInPserver_) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_APPLY); + parameterClient_->doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false); + parameterClient_->getParameter( + /* recvParameterType= */ PARAMETER_VALUE, + /* sendBackParameterType= */ PARAMETER_APPLY); + copyParametersToDevice(PARAMETER_VALUE); + } +} + +void RemoteParameterUpdater::restore() { + if (useApplyInPserver_) { + parameterClient_->getParameter(); + copyParametersToDevice(PARAMETER_VALUE); + } +} + +ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater( + OptimizationConfig config, + int passCount, + std::unique_ptr&& localUpdater) + : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) { + sendThread_.reset(new std::thread([this]() { this->send(); })); + recvThread_.reset(new std::thread([this]() { 
this->recv(); })); + + stopping_ = false; + oneBatchFinished_ = false; + separateSendAndRecv_ = true; +} + +ConcurrentRemoteParameterUpdater::~ConcurrentRemoteParameterUpdater() { + stopping_ = true; + sendQueue_.enqueue(0); + sendThread_->join(); + recvQueue_.enqueue(0); + recvThread_->join(); +} + +void ConcurrentRemoteParameterUpdater::finishBatch(real cost) { + if (localUpdater_) { + localUpdater_->finishBatch(cost); + + if (!needToUpdateRemotely()) { + ++numBatches_; + return; + } + } + + sendQueue_.enqueue(kFinishBatchPid); + + finishBatchCond_.wait([this]() { return oneBatchFinished_; }); + oneBatchFinished_ = false; + { + REGISTER_TIMER("sync_hostToDeviceStream"); + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + hl_stream_synchronize(kHostToDeviceStream); + } + } + + if (localUpdater_) { + ++numBatches_; + } +} + +// Use para=NULL to signal the end of one batch +void ConcurrentRemoteParameterUpdater::send(Parameter* para) { + const std::string& algorithm = config_.algorithm(); + ParameterUpdateMode mode; + if (algorithm == TrainAlgorithm::AsyncSGD) { + mode = PSERVER_UPDATE_MODE_ASYNC_SGD; + } else if (algorithm == TrainAlgorithm::SGD) { + mode = PSERVER_UPDATE_MODE_ADD_GRADIENT; + } else { + LOG(FATAL) << "Unknown algorithm: " << algorithm; + } + ParameterType sendType; + if (localUpdater_) { + sendType = PARAMETER_DELTA; + } else { + // In this case, we perform SGD on pserver. + sendType = PARAMETER_GRADIENT; + } + std::vector paraSegment; + if (para == NULL) { + parameterClient_->sendParameter( + mode, + sendType, + paraSegment, + batchSize_, + 0, // cost=0 + true, // sendBackParameter = true + batchStatus_); // batchStatus_ = BATCH_FINISH + + } else { + ParameterSegments paraSegTemp; + paraSegment.reserve(1); + paraSegTemp.name = para->getName(); + paraSegTemp.id = para->getID(); + paraSegment.push_back(paraSegTemp); + { + SetDevice device(para->getDeviceId()); + REGISTER_TIMER("copySingleParaFromDevice"); + copySingleParaFromDevice(para, sendType); + hl_stream_synchronize(kDeviceToHostStream); + } + parameterClient_->sendParameter(mode, + sendType, + paraSegment, + batchSize_, + 0, // cost=0 + true, // sendBackParameter = true + batchStatus_); + if (batchStatus_ == BATCH_START) batchStatus_ = BATCH_ON; + } +} +void ConcurrentRemoteParameterUpdater::recv(Parameter* para) { + parameterClient_->recvParameter(); + if (para != NULL) { + REGISTER_TIMER("copySingleParaToDevice"); + SetDevice device(para->getDeviceId()); + copySingleParaToDevice(para, PARAMETER_VALUE); + + if (localUpdater_) { + para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); + } else { + // if cpu, parameter should not changes until recvParameter(). 
+ // if gpu, zero mem when send finish + if (!FLAGS_use_gpu) { + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + } + } + } +} + +void ConcurrentRemoteParameterUpdater::recv() { + if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id); + StatPtr stat = getStat("recv"); + FOR_TIMING(Timer timer); + while (true) { + int pid; + { + REGISTER_TIMER("recv_dequeue"); + pid = recvQueue_.dequeue(); + } + if (pid == kFinishBatchPid) { + Parameter* para = NULL; + FOR_TIMING(timer.start()); + recv(para); + FOR_TIMING(timer.stop()); + FOR_TIMING(stat->addSample(timer.get())); + FOR_TIMING(timer.reset()); + finishBatchCond_.notify_all([this] { oneBatchFinished_ = true; }); + } else { + if (stopping_) break; + Parameter* para = parameters_[pid].get(); + FOR_TIMING(timer.start()); + recv(para); + FOR_TIMING(timer.stop()); + oneBatchFinished_ = false; + } + } +} + +void ConcurrentRemoteParameterUpdater::send() { + if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id); + StatPtr stat = getStat("send"); + FOR_TIMING(Timer timer); + while (true) { + int pid; + { + REGISTER_TIMER("send_dequeue"); + pid = sendQueue_.dequeue(); + } + if (pid == kFinishBatchPid) { + batchStatus_ = BATCH_FINISH; + if (!localUpdater_) { + // if cpu, parameter should not changes until recvParameter(). + // if gpu, zeroMem() at the end of batch so that it won't + // interfere with computation. + if (FLAGS_use_gpu) { + REGISTER_TIMER("para_zeroMem"); + for (auto& para : parameters_) { + SetDevice device(para->getDeviceId()); + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + } + } + } + Parameter* para = NULL; + FOR_TIMING(timer.start()); + send(para); + FOR_TIMING(timer.stop()); + FOR_TIMING(stat->addSample(timer.get())); + FOR_TIMING(timer.reset()); + recvQueue_.enqueue(pid); + } else { + if (stopping_) break; + Parameter* para = parameters_[pid].get(); + if (localUpdater_) { + // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/ + para->getBuf(PARAMETER_DELTA) + ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f); + } + FOR_TIMING(timer.start()); + send(para); + FOR_TIMING(timer.stop()); + recvQueue_.enqueue(nonStaticParaIDMap_[para->getID()]); + } + } +} + +void ConcurrentRemoteParameterUpdater::updateImpl(Parameter* para) { + REGISTER_TIMER("update"); + if (localUpdater_) { + localUpdater_->update(para); + if (!needToUpdateRemotely()) { + return; + } + } + sendQueue_.enqueue(nonStaticParaIDMap_[para->getID()]); +} + +void ConcurrentRemoteParameterUpdater::copySingleParaToDevice( + Parameter* para, ParameterType parameterType) { + if (!FLAGS_use_gpu) { + return; + } + int i = nonStaticParaIDMap_[para->getID()]; + para->getBuf(parameterType) + ->copyFrom(*cpuParameters_[i]->getBuf(parameterType), + kHostToDeviceStream); + if (parameterType == PARAMETER_VALUE) { + para->setValueUpdated(); + } +} + +void ConcurrentRemoteParameterUpdater::copySingleParaFromDevice( + Parameter* para, ParameterType parameterType) { + if (!FLAGS_use_gpu) { + return; + } + int i = nonStaticParaIDMap_[para->getID()]; + cpuParameters_[i] + ->getBuf(parameterType) + ->copyFrom(*para->getBuf(parameterType), kDeviceToHostStream); +} + +SparseRemoteParameterUpdater::SparseRemoteParameterUpdater( + const OptimizationConfig& config, int expectedPassCount, bool testing) + : config_(config), + passCount_(0), + expectedPassCount_(expectedPassCount), + testing_(testing), + useApplyInPserver_(false) {} + +void SparseRemoteParameterUpdater::init( + const std::vector& parameters) { + ParameterUpdater::init(parameters); + + parameterClient_.reset(new ParameterClient2( + false, FLAGS_port + 
FLAGS_ports_num, FLAGS_ports_num_for_sparse)); + parameterClient_->init(parameters_); + parameterClient_->setTrainerId(FLAGS_trainer_id); + + if (FLAGS_trainer_id == 0) { + parameterClient_->setConfig( + config_, FLAGS_save_dir, true /*is_sparse_server*/); + if (parameters[0]->isFullSize()) { + parameterClient_->setParameter(); + } else { // init in pserver + parameterClient_->setParameterZero(); + } + } + if (FLAGS_trainer_id == 0 && !testing_ && + config_.algorithm() == TrainAlgorithm::SGD) { + startController(); + useApplyInPserver_ = useApplyInPserver(config_); + } +} + +void SparseRemoteParameterUpdater::startController() { + controllerThread_.reset(new std::thread([this]() { this->controller(); })); +} + +void SparseRemoteParameterUpdater::controller() { + ParameterClient2 client( + false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse); + client.init(parameters_); + + while (true) { + /*start pass*/ { + client.waitPassStart(); + + PreparedOperations ops; + ops.addOperation(PSERVER_OP_START_PASS); + client.doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false, + /* releasePass= */ false); + } + + while (true) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_SGD); + client.doOperation(ops, + /* waitForGradient= */ true, + /* sendBackarameter= */ true, + /* releasePass= */ false); + if (client.isPassFinish()) { + break; + } + } + + /*finish pass*/ { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_FINISH_PASS); + client.doOperation(ops, + /* waitForGradient= */ true, + /* sendBackarameter= */ true, + /* releasePass= */ true); + } + + passCount_++; + if (passCount_ == expectedPassCount_) { + break; + } + } +} + +PassType SparseRemoteParameterUpdater::startBatch(int64_t batchSize) { + batchSize_ = batchSize; + return PASS_TRAIN; +} + +void SparseRemoteParameterUpdater::finishBatch(real cost) { + const std::string& algorithm = config_.algorithm(); + ParameterUpdateMode mode; + if (algorithm == TrainAlgorithm::AsyncSGD) { + mode = PSERVER_UPDATE_MODE_ASYNC_SGD; + } else if (algorithm == TrainAlgorithm::SGD) { + mode = PSERVER_UPDATE_MODE_ADD_GRADIENT; + } else { + LOG(FATAL) << "Unknown algorithm: " << algorithm; + } + + ParameterType sendType = PARAMETER_GRADIENT; + + REGISTER_TIMER("sendSparseParam"); + parameterClient_->sendAndReceiveParameter(mode, + sendType, + batchSize_, + 0, // cost = 0 + false); // sendBackParameter + + // grad zero move to sgd grad machine, before merge grad sparse remote +} + +void SparseRemoteParameterUpdater::startPass() { + if (config_.algorithm() == TrainAlgorithm::SGD) { + parameterClient_->waitPassStart(); + } else { + if (FLAGS_trainer_id == 0) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_START_PASS); + parameterClient_->doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false); + } + parameterClient_->asyncStartPass(); + } +} + +bool SparseRemoteParameterUpdater::finishPass() { + if (config_.algorithm() == TrainAlgorithm::SGD) { + parameterClient_->waitPassFinish(); + } else { + if (FLAGS_trainer_id == 0) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_FINISH_PASS); + parameterClient_->doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false); + } + parameterClient_->asyncFinishPass(); + } + + return true; +} + +// Trainer will call getParametersRemote at batch start or before save, +// so we do not get values in apply() and restore(). 
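// Editorial sketch (not part of the patch): the expected caller pattern mirrors
// what Tester::forwardOneBatch does further down in this diff, roughly:
//   gradientMachine_->prefetch(inArgs);
//   parameterUpdater_->getParametersRemote(/*fullSize=*/false, /*apply=*/true);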
+void SparseRemoteParameterUpdater::apply() { + if (useApplyInPserver_) { + PreparedOperations ops; + ops.addOperation(PSERVER_OP_APPLY); + parameterClient_->doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false); + } +} + +void SparseRemoteParameterUpdater::restore() {} + +void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize, + bool apply) { + ParameterType sendBackParameterType = + (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE; + std::function getParams; + std::function applyL1; + if (fullSize) { + getParams = [&] { + parameterClient_->getParameter( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); + }; + applyL1 = [](Parameter& para, real decayRate) { + para.getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); + }; + } else { + getParams = [&] { + parameterClient_->getParameterSparse( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); + }; + applyL1 = [](Parameter& para, real decayRate) { + para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); + }; + } + { + REGISTER_TIMER("getParamDenseAndSparse"); + getParams(); + if (config_.shrink_parameter_value() > 0) { + for (auto& para : parameters_) { + if (para->getConfig().decay_rate_l1() > 0) { + applyL1(*para, config_.shrink_parameter_value()); + } + } + } + } +} + +void SparseRemoteParameterUpdater::randParametersRemote() { + CHECK_EQ(FLAGS_trainer_id, 0); + + PreparedOperations ops; + ops.addOperation(PSERVER_OP_RANDOMIZE); + parameterClient_->doOperation(ops, + /* waitForGradient= */ false, + /* sendBackarameter= */ false); +} + +void SparseRemoteParameterUpdater::loadParametersRemote( + const std::string& dirName) { + if (FLAGS_trainer_id == 0) { + parameterClient_->loadValueVector(dirName); + } + + if (testing_) { + // we do not use synchronize() here, + // because test mode may run only one tester + if (FLAGS_trainer_id == 0) { + parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY); + } else { + parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY); + } + } +} + +void SparseRemoteParameterUpdater::saveParametersRemote( + const std::string& dirName) { + if (FLAGS_trainer_id == 0) { + parameterClient_->saveValueVector(dirName); + } +} + +void SparseRemoteParameterUpdaterComposite::init( + const std::vector& parameters) { + parameters_ = parameters; + + std::vector parametersArray[NUMBER_UPDATERS]; + + for (auto& para : parameters_) { + if (para->isSparseRemoteUpdate()) { + parametersArray[UPDATER_SPARSE_REMOTE].push_back(para); + } else { + parametersArray[UPDATER_NORMAL].push_back(para); + } + } + CHECK(!parametersArray[UPDATER_SPARSE_REMOTE].empty()); + CHECK(!parametersArray[UPDATER_NORMAL].empty()); + + syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) { + updaters_[tid]->init(parametersArray[tid]); + }); + + parameterTypes_ = updaters_[UPDATER_NORMAL]->getParameterTypes(); +} + +std::vector> + ParameterUpdaterCreators::constructors_; + +} // namespace paddle diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.h b/paddle/legacy/trainer/RemoteParameterUpdater.h new file mode 100644 index 0000000000000000000000000000000000000000..68468532981a49ef32f5f0da1170815d657d86c1 --- /dev/null +++ b/paddle/legacy/trainer/RemoteParameterUpdater.h @@ -0,0 +1,416 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "ParameterUpdater.h" +#include "paddle/legacy/pserver/ParameterClient2.h" +#include "paddle/legacy/utils/Queue.h" +#include "paddle/legacy/utils/Util.h" + +namespace paddle { + +// TODO(yanfei): +// I think that the biggest feature of rdma is packet lossless control +// feature instead of high bandwiths, zero copy and gpu-direct rdma in +// theroy. +// But zero-copy and gpu-direct rdma features can help to reduce latency +// caused by os system. +// So, for some specified cluster, such as high density gpu cluster, +// gpu-direct and zero copy could help to improve cluster communication +// performance. +// + +/** + * Normal remote parameter updater for dense parameters. + * + * It first packs all parameters for all pservers using ParameterClient + * module, then wait for merged parameters data from all pservers. + * The synchronization pattern specified by sync-sgd or async-sgd is + * achieved by all pservers with the help of the controller within this + * remote parameter updater. + * This module indeedly bridges the gradient machines and parameter servers. + * It helps to transfer the parameters from acceleration device to cpu end + * for network. It contains additional parameters copy buffers for + * acceleration devices at cpu end, such as gpu, otherwise it will + * directly use original parameters data to update pservers. + * + * This remote parameter updater does not use pipeline mechanism to hide + * copy latency from gpu to cpu buffer. In addition the overlapped between + * backward and communication is not supported. + */ +class RemoteParameterUpdater : public ParameterUpdater { + public: + RemoteParameterUpdater( + const OptimizationConfig& config, + int expectedPassCount, + std::unique_ptr&& localUpdater = nullptr); + ~RemoteParameterUpdater() { + if (controllerThread_) { + controllerThread_->join(); + } + } + + /** + * initialize the internal parameter client and itself. + */ + virtual void init(const std::vector& parameters); + /** + * @brief start batch + * + * @note one batch training exhibits stateful feature to help + * to do performance tuning, sgd optimization if necessary. + */ + virtual PassType startBatch(int64_t batchSize) { + if (localUpdater_) { + localUpdater_->startBatch(batchSize); + } + batchSize_ = batchSize; + batchStatus_ = BATCH_START; + return PASS_TRAIN; + } + + /** + * send parameters to pservers and get returned parameters + * from all pservers if necessary. it will implictly + * cooperate with controller thread for sync-sgd. 
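   *
   * @note Editorial note: when a local updater is configured, the exchange
   *       with the pservers only happens every num_batches_per_send_parameter
   *       batches; in between, finishBatch() returns after the local update.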
+ */ + virtual void finishBatch(real cost); + virtual void startPass(); + virtual bool finishPass(); + +#ifndef PADDLE_DISABLE_TIMER + virtual void setForwardbackwardTime(uint64_t delta) { + parameterClient_->setForwardbackwardTime(delta); + } +#endif + + virtual void apply(); + virtual void restore(); + + protected: + /** + * control all pservers with all trainers for sync-sgd + */ + virtual void controller(); + + /** + * work need to do after finishBatch + */ + virtual void updateImpl(Parameter* para); + + void startController(); + + /** + * @brief copy parameters from cpu host to device, such as gpu. + * + * @note return if all data are transfered. + */ + void copyParametersToDevice(ParameterType parameterType); + + /** + * @brief copy parameters from device to cpu host + * + * @note return if all data are transfered + */ + void copyParametersFromDevice(ParameterType parameterType); + + protected: + /// Optimization config used to guide initialization and finishBatch + OptimizationConfig config_; + /// internal parameter client object for exchanging data with pserver + std::unique_ptr parameterClient_; + /// internal shadow buffer at cpu host end, use original parameters_ + /// if no acceleration devices are used. + std::vector cpuParameters_; + /// local updater for aggregating multi-batches local delta + std::unique_ptr localUpdater_; + /// the size of mini-batch + int64_t batchSize_; + /// batches passed + int64_t numBatches_; + /// for stateful control + BatchStatus batchStatus_; + /// controller thread for sync-sgd + std::unique_ptr controllerThread_; + /// passed already finished + int64_t passCount_; + /// expected passes to finished + int64_t expectedPassCount_; + /// use normal synchronization communication if True + bool separateSendAndRecv_; + /// true if it's first pass + bool isFirstPass_; + bool useApplyInPserver_; + + static const std::string kAverage; + static const std::string kElasticAverage; +}; + +// TODO(yanfei): +// do parameters level synchronization Optimization at pserver end with +// ConcurrentRemoteParameterUpdater to get more parallelization, at last +// to really hide pserver latency in backward computation. +// +/** + * This updater add additional optimization for overlapping synchronization + * from pservers with backward computation. + * + * Parameter can be sent to pservers when related backward stage is finished. + * This concurrent udpater does data copy from acceleration device to host + * memory aynchronously. In addition internal parameter client reads data in + * host memory and send them to all pservers in next stage. So this class + * help to pipeline device-to-host copy and host-to-network to hide network + * latency in backward stage. + * It contains separate send and recv thread for pipeline usage. + */ +class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater { + public: + ConcurrentRemoteParameterUpdater( + OptimizationConfig config, + int expectedPassCount, + std::unique_ptr&& localUpdater); + ~ConcurrentRemoteParameterUpdater(); + + /** + * @brief send paraemeters to all pservers + * + * @note it just signal the end signal to internal parameter client + * to finished the aynchronous send action. In addition it also + * do synchronization for all asynchronous host-to-device copy. 
+ */ + virtual void finishBatch(real cost); + + protected: + virtual void updateImpl(Parameter* para); + /// internal thread called in send thread + void send(Parameter* para); // para == NULL indicate end of a minibatch + /// internal function called in recv thread + void recv(Parameter* para); + /** + * @brief send thread for relaying data from gradient to parameter client + * + * @note just pipe data to internal parameter client for pipeline + */ + void send(); + /** + * @brief recv thread for relaying data from internal parameter client to + * host memory + * + * @note it contains the asynchronous data copy form host to device + */ + void recv(); + /// copy specified parameter from host to device + void copySingleParaToDevice(Parameter* para, ParameterType parameterType); + /// copy specified parameter from device to host + void copySingleParaFromDevice(Parameter* para, ParameterType parameterType); + bool needToUpdateRemotely() { + return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0; + } + + private: + /// send thread used for overlapping + std::unique_ptr sendThread_; + /// recv thread used for overlapping + std::unique_ptr recvThread_; + /// buffer queue for overlapping + Queue sendQueue_; + /// buffer queue for overlapping + Queue recvQueue_; + /// flags indicating to stop + bool stopping_; + /// conditional variable for threads synchronization between the + /// thread calling finishBatch and internal recv thread + LockedCondition finishBatchCond_; + bool oneBatchFinished_; +}; + +// TODO(yanfei): +// merge sparse updater with dense updater, and could help to reduce +// the synchronization between sparse and dense udpater. it could also +// reduce the threads for managing all connections. +/** + * This class is specified for updating sparse parameters. + * + * It allows part of parameter to be exchanged with all pservers. + * If sparse input assigned, part gradients of first hidden layer + * could remained zero which can not need to be exchanged within + * all pservers. This is the key optimization point for this updater + * + * For updating sparse parameters, all latest parameters are stored + * in pservers instead of keeping full copy at train end, so need to + * prefetch parameters weight value which can be changed in next-batch + * before doing next forwardbackward. Also, with above fact that the + * parameters can be stored in pserver instead of trainer, we can + * fetch specified parmeters if necessary, and can support huge + * parameters which is larger enough than the RAM size in single + * node. + * + * Internally, this updater will direct internal parameter client + * to encapsulate sparse specified message for all pservers. 
+ */ +class SparseRemoteParameterUpdater : public ParameterUpdater { + public: + SparseRemoteParameterUpdater(const OptimizationConfig& config, + int expectedPassCount, + bool testing); + ~SparseRemoteParameterUpdater() { + if (controllerThread_) { + controllerThread_->join(); + } + } + + /// initialization + virtual void init(const std::vector& parameters); + + /// stateful batch control + virtual PassType startBatch(int64_t batchSize); + /// send all sparse related parameters to all pservers + virtual void finishBatch(real cost); + virtual void startPass(); + virtual bool finishPass(); + + virtual void apply(); + virtual void restore(); + + /// load parameters from pservers + virtual void loadParametersRemote(const std::string& dirName); + /// save parameters to pservers + virtual void saveParametersRemote(const std::string& dirName); + /** + * @brief get latest sparse parameters value from all pservers + * + * @note call it before next mini-batch + */ + virtual void getParametersRemote(bool fullSize, bool apply); + virtual void randParametersRemote(); +#ifndef PADDLE_DISABLE_TIMER + virtual void setForwardbackwardTime(uint64_t delta) { + parameterClient_->setForwardbackwardTime(delta); + } +#endif + + protected: + /// update implimentation, not implemented + virtual void updateImpl(Parameter* para) {} + + /// internal controller routine for controller thread + virtual void controller(); + + /// start controller thread + void startController(); + + protected: + /// optimization config + OptimizationConfig config_; + /// internal parameter client + std::unique_ptr parameterClient_; + int64_t batchSize_; + std::unique_ptr controllerThread_; + int64_t passCount_; + int64_t expectedPassCount_; + bool testing_; + bool useApplyInPserver_; +}; + +/** + * Class for supporting normal updater and sparse updater + * + * Not all parts of one model are sparse, so it exists dense updater + * for normal layers while sparse updater is for sparse layers. + * + * it directly call internal dense and sparse udpater individually. + */ +class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite { + public: + enum { + UPDATER_SPARSE_REMOTE = 0, // execute in sync thread pool(tid:0) + UPDATER_NORMAL = 1, // execute in Owner thread(tid:1) + NUMBER_UPDATERS = 2, + }; + /** + * @brief create one dense updater and one sparse updater + * + * @note use syncThreadPool to synchronize these two updaters + */ + SparseRemoteParameterUpdaterComposite( + const OptimizationConfig& config, + int expectedPassCount, + bool testing, + std::unique_ptr&& normalUpdater) { + updaters_.resize(NUMBER_UPDATERS); + updaters_[UPDATER_SPARSE_REMOTE].reset( + new SparseRemoteParameterUpdater(config, expectedPassCount, testing)); + updaters_[UPDATER_NORMAL] = std::move(normalUpdater); + + syncThreadPool_.reset(new SyncThreadPool(NUMBER_UPDATERS - 1)); + } + + /// initialization of dense and sparse updaters + virtual void init(const std::vector& parameters); +}; + +class ParameterUpdaterCreators { + public: + /** + * @brief add a creator to create custom ParameterUpdater while training. + * The creator is a function with type (alogrithm, optConfig, isLocal, + * numPasses) -> ParameterUpdater*. Trainer will use this + * ParameterUpdater if creator can create a no nullptr + * ParameterUpdater. Return nullptr will use trainer's default + * updaters. + * + * @param creator method which can create ParameterUpdater. 
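   *
   * @note Editorial sketch only; `MyUpdater` is a hypothetical subclass and
   *       the exact std::function signature is assumed from the description
   *       above:
   * @code
   * ParameterUpdaterCreators::addCreator(
   *     [](const std::string& algo, const OptimizationConfig& conf,
   *        bool isLocal, size_t numPasses) -> ParameterUpdater* {
   *       return algo == "my_algo" ? new MyUpdater(conf) : nullptr;
   *     });
   * @endcode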
+ */ + static void addCreator( + const std::function& creator) { // NOLINT explicit move closing ) in this line + // for readability + constructors_.push_back(creator); + } + + /** + * @brief Try to create an updater by given algo, optConfig, isLocal, + * numPasses. Return nullptr if cannot create anyone. + * @param algo algorithm string. + * @param optConfig optimization config. + * @param isLocal is in local mode or not. + * @param numPasses total passes that trainer will train. + * @return nullptr if fail, not nullptr if we can create an updater. + */ + static ParameterUpdater* tryCreateUpdater(const std::string& algo, + const OptimizationConfig& optConfig, + bool isLocal, + size_t numPasses) { + for (auto& c : constructors_) { + if (auto updater = c(algo, optConfig, isLocal, numPasses)) { + return updater; + } + } + return nullptr; + } + + private: + static std::vector> + constructors_; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/Tester.cpp b/paddle/legacy/trainer/Tester.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d977ca9657a7688c101ed060935c644e4876e6d1 --- /dev/null +++ b/paddle/legacy/trainer/Tester.cpp @@ -0,0 +1,380 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Tester.h" + +#include +#include + +#include +#include +#include +#include + +#include + +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +#include "TesterConfig.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" + +namespace paddle { + +Tester::Tester(const std::shared_ptr& config, + std::unique_ptr&& intconfig, + const GradientMachinePtr& gradientMachine, + const std::shared_ptr& parameterUpdater, + std::shared_ptr testDataProvider) + : config_(config), + intconfig_(std::move(intconfig)), + gradientMachine_(gradientMachine), + parameterUpdater_(parameterUpdater), + testDataProvider_(testDataProvider) { + if (config_->getOptConfig().use_sparse_remote_updater()) { + LOG(FATAL) << "It's prohibited to set sparse_remote_update " + << "when doing train and test jobs in the same " + << "process. 
You could run paddle --job=test in " + << "a separate process."; + } + testEvaluator_.reset(gradientMachine_->makeEvaluator()); + if (intconfig_->distributeTest) { + testParameterClient_.reset(new ParameterClient2(true)); + } + + if (testParameterClient_) { + testParameterClient_->init(gradientMachine_->getParameters()); + } + + std::unique_ptr paramConfig( + new ParameterUtilConfig(intconfig_->saveOnlyOne, + intconfig_->savingPeriod, + intconfig_->loadsaveParametersInPserver, + intconfig_->config)); + + paramUtil_.reset(new ParameterUtil( + config_, std::move(paramConfig), gradientMachine_, parameterUpdater_)); +} + +void Tester::startTestPeriod() { + if (testDataProvider_) { + testDataProvider_->reset(); + } + testEvaluator_->start(); + testContext_.cost = 0; + testContext_.numSamples = 0; + + parameterUpdater_->apply(); + if (intconfig_->prevBatchState) { + gradientMachine_->getState(*intconfig_->trainState); + gradientMachine_->setState(*intconfig_->testState); + } +} + +void Tester::testOneDataBatch(const DataBatch& dataBatch, + std::vector* outArgs) { + testContext_.cost += + forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs); + testContext_.numSamples += dataBatch.getSize(); +} + +void Tester::testOnePeriod() { + DataBatch dataBatch; + int64_t batchSize = config_->getOptConfig().batch_size(); + std::vector outArgs; + startTestPeriod(); + while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) { + testOneDataBatch(dataBatch, &outArgs); + } + finishTestPeriod(); +} + +void Tester::finishTestPeriod() { + if (intconfig_->prevBatchState) { + gradientMachine_->resetState(); + } + testEvaluator_->finish(); + CHECK_GT(testContext_.numSamples, 0) + << "There is no samples in your test batch. Possibly " + "wrong implementation of DataProvidor.reset()"; + LOG(INFO) << " Test samples=" << testContext_.numSamples + << " cost=" << testContext_.cost / testContext_.numSamples + << " Eval: " << *testEvaluator_; + parameterUpdater_->restore(); + if (intconfig_->prevBatchState) { + gradientMachine_->getState(*intconfig_->testState); + gradientMachine_->setState(*intconfig_->trainState); + } +} + +int64_t Tester::testOneBatchById(int64_t batchId) { + DataBatch dataBatch; + int32_t batchSize = config_->getOptConfig().batch_size(); + + testDataProvider_->getNextBatch(batchSize, &dataBatch); + + int64_t actualBatchSize = dataBatch.getSize(); + if (actualBatchSize == 0) { + return 0; + } + + std::vector outArgs; + + stats_ += std::pair{ + actualBatchSize, + forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)}; + + if (((batchId + 1) % intconfig_->logPeriod) == 0) { + LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false); + } + + return actualBatchSize; +} + +real Tester::forwardOneBatch(const DataBatch& dataBatch, + Evaluator* evaluator, + std::vector* pOutArgs) { + auto& outArgs = *pOutArgs; + const std::vector& inArgs = dataBatch.getStreams(); + if (intconfig_->loadsaveParametersInPserver) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(false /*full parameter*/, + true /*after apply*/); + } + + gradientMachine_->forward(inArgs, &outArgs, PASS_TEST); + + // write features if set this flag and outArgs is not empty + std::string featFile = intconfig_->featFile; + if (!featFile.empty() && outArgs.empty()) { + size_t numOutputs = outArgs.size(); + std::vector featMatrices; + featMatrices.resize(numOutputs); + for (size_t i = 0; i < numOutputs; ++i) { + featMatrices[i] = 
Matrix::create(outArgs[i].value->getHeight(), + outArgs[i].value->getWidth(), + false, + false); // CPU data buffer + featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT); + } + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + FILE* fp = fopen(featFile.c_str(), "ab+"); + CHECK(!ferror(fp)) << "Fail to open " << featFile; + + size_t sampleNum = featMatrices[0]->getHeight(); + for (size_t i = 0; i < sampleNum; ++i) { + for (size_t j = 0; j < numOutputs; ++j) { + size_t dim = featMatrices[j]->getWidth(); + fwrite(featMatrices[j]->getData() + i * dim, sizeof(real), dim, fp); + } + } + fclose(fp); + } + if (evaluator) { + gradientMachine_->eval(evaluator); + } + + // Save the output layers if predict_output_dir is not empty + std::string predictOutputDir = intconfig_->predictOutputDir; + if (!predictOutputDir.empty() && !outArgs.empty()) { + CHECK(intconfig_->testing) << "Only valid in test mode"; + if (!os_.is_open()) { + // TODO(yuyang18): Refactor these lines. + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "rank-%05d", intconfig_->trainerId); + mkDir(predictOutputDir.c_str()); + std::string filename = path::join(predictOutputDir, buf); + os_.open(filename, std::ofstream::trunc); + CHECK(os_.is_open()) << "Failed to open file " << filename; + } + printOutput(outArgs, os_); + return 0.0; // In this case, there is no meaning to calculate cost + } + + return Argument::sum(outArgs); +} + +void Tester::testOnePassBatch(int passId) { + stats_.reset(); + const std::vector inArgs; + gradientMachine_->forward(inArgs, nullptr, PASS_TEST); + int64_t num; + real cost; + gradientMachine_->getStats(cost, num); + stats_ += std::pair{num, cost}; + gradientMachine_->onPassEnd(); + + LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false); +} + +void Tester::testOnePass(int passId) { + stats_.reset(); + int64_t batchId = 0; + int num = 0; + if (intconfig_->prevBatchState) { + gradientMachine_->resetState(); + } + + testEvaluator_->start(); + + do { + num = testOneBatchById(batchId); + ++batchId; + } while (num > 0); + + gradientMachine_->onPassEnd(); + testEvaluator_->finish(); + + LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false) + << " Eval: " << *testEvaluator_; + + if (intconfig_->distributeTest) { + testEvaluator_->distributeEval(testParameterClient_.get()); + if (0 == intconfig_->trainerId) { + LOG(INFO) << "distribute eval: " << *testEvaluator_; + } + } +} + +void Tester::test() { + CHECK(testDataProvider_) << "TestData is not specified"; + testDataProvider_->setSkipShuffle(); + testDataProvider_->reset(); + gradientMachine_->start(); + + // For evaluation + std::vector modelList; + std::string modelListFromConfig = intconfig_->modelList; + std::string initModelPath = intconfig_->initModelPath; + if (!modelListFromConfig.empty()) { + loadFileList(modelListFromConfig, modelList); + intconfig_->testPass = 0; + intconfig_->numPasses = modelList.size(); + intconfig_->savingPeriod = 1; + CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation"; + } else if (!initModelPath.empty()) { + modelList.push_back(initModelPath); + intconfig_->testPass = 0; + intconfig_->numPasses = 1; + intconfig_->savingPeriod = 1; + CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation"; + } + + for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) { + int passId = i; + if (passId % intconfig_->savingPeriod == 0) { + if (intconfig_->testWait) { + while (paramUtil_->loadParameters( + passId, true /*local*/, true 
/*remote*/) == false) { + LOG(INFO) << "Waiting for parameters of pass " << passId; + sleep(60); // sleep 60s + } + } else { + if (modelList.size() == 0) { + CHECK_EQ(paramUtil_->loadParameters( + passId, true /*local*/, true /*remote*/), + true); + } else { + paramUtil_->loadParametersWithPath( + modelList[i], true /*local*/, true /*remote*/); + } + } + if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) { + testOnePassBatch(passId); + } else { + testOnePass(passId); + } + if (passId + intconfig_->savingPeriod < intconfig_->numPasses) { + // if there is at least 1 more pass to test, then call reset, + // otherwise not. + testDataProvider_->reset(); + } + } + } + + gradientMachine_->finish(); +} + +void Tester::printOutput(const std::vector& outArgs, + std::ostream& os) { + size_t numOutputs = outArgs.size(); + size_t numIns = outArgs[0].getBatchSize(); + if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) { + cpuMat_.resize(numOutputs, nullptr); + cpuVec_.resize(numOutputs, nullptr); + } + + for (size_t i = 0; i < numOutputs; ++i) { + if (outArgs[i].value != nullptr) { + if (outArgs[i].value->useGpu()) { + if (dynamic_cast(outArgs[i].value.get())) { + size_t dim = outArgs[i].value->getWidth(); + Matrix::resizeOrCreate(cpuMat_[i], numIns, dim, false, false); + cpuMat_[i]->copyFrom(*outArgs[i].value); + } else if (dynamic_cast(outArgs[i].value.get())) { + auto sparseMat = + dynamic_cast(outArgs[i].value.get()); + cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(), + sparseMat->getWidth(), + sparseMat->getElementCnt(), + sparseMat->getValueType(), + sparseMat->format_, + false, /* trans */ + false); /* useGpu */ + hl_stream_t stream = HPPL_STREAM_DEFAULT; + cpuMat_[i]->copyFrom(*sparseMat, stream); + } else { + LOG(WARNING) << "Not supported gpu matrix type"; + } + } + } else if (outArgs[i].ids != nullptr) { + if (outArgs[i].ids->useGpu()) { + IVector::resizeOrCreate(cpuVec_[i], outArgs[i].ids->getSize(), false); + cpuVec_[i]->copyFrom(*outArgs[i].ids); + } + } else if (outArgs[i].strs != nullptr) { + continue; + } else { + LOG(WARNING) << "outArgs[" << i << "] has no data to print"; + } + } + + for (size_t i = 0; i < numIns; ++i) { + for (size_t j = 0; j < numOutputs; ++j) { + if (outArgs[j].value != nullptr) { + if (outArgs[j].value->useGpu()) { + cpuMat_[j]->printOneRow(os, i); + } else { + outArgs[j].value->printOneRow(os, i); + } + } else if (outArgs[j].ids != nullptr) { + if (outArgs[j].ids->useGpu()) { + cpuVec_[j]->printOneElement(os, i); + } else { + outArgs[j].ids->printOneElement(os, i); + } + } else if (outArgs[j].strs != nullptr) { + os << (*outArgs[j].strs)[i] << ";"; + } + } + os << std::endl; + } +} +} // namespace paddle diff --git a/paddle/legacy/trainer/Tester.h b/paddle/legacy/trainer/Tester.h new file mode 100644 index 0000000000000000000000000000000000000000..a298602d1d0894af90c098818908862a553cb3e7 --- /dev/null +++ b/paddle/legacy/trainer/Tester.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include + +#include "hl_gpu.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" + +#include "TrainerConfig.pb.h" + +#include +#include +#include "ParamUtil.h" +#include "ParameterUpdater.h" +#include "TesterConfig.h" +#include "TrainerInternalConfig.h" + +namespace paddle { + +/** + * Neural Network test logics code. + * It is a private class for Trainer. + */ +class Tester { + public: + /** + * Ctor + * @param config Trainer Config. + * @param intconfig Tester Config. + * @param gradientMachine Gradient machine(neuralnetwork) that will be tested. + * @param parameterUpdater Parameter Updater. Not for updating parameter, just + * for getting parameter from parameter-server. + * @param testDataProvider Test data provider. + */ + Tester(const std::shared_ptr& config, + std::unique_ptr&& intconfig, + const GradientMachinePtr& gradientMachine, + const std::shared_ptr& parameterUpdater, + std::shared_ptr testDataProvider); + + /** + * test one period. + * + * One period means 2 things. + * if test_period !=0 and not test_all_data_in_one_period, then + * will test test_period * batch_size data. + * else + * will test whole test data. + * + * It is convenience to test small set of data when test data set is large and + * is training at same time. + */ + void testOnePeriod(); + void startTestPeriod(); + void finishTestPeriod(); + void testOneDataBatch(const DataBatch& dataBatch, + std::vector* outArgs); + + /** + * Test for given data batch. + * @param dataBatch Data batch. + * @param evaluator Evaluator + * @return cost + */ + real forwardOneBatch(const DataBatch& dataBatch, + Evaluator* evaluator, + std::vector* outArgs); + + /** + * performance the full pass of test given test data provider + */ + void test(); + + protected: + std::shared_ptr testParameterClient_; + std::shared_ptr config_; + std::unique_ptr intconfig_; + GradientMachinePtr gradientMachine_; + std::shared_ptr parameterUpdater_; + std::unique_ptr testEvaluator_; + std::unique_ptr paramUtil_; + DataProviderPtr testDataProvider_; + TrainerStats stats_; + + // Used for saving the values of output layers + std::ofstream os_; + std::vector cpuMat_; + std::vector cpuVec_; + struct { + int64_t numSamples; + real cost; + } testContext_; + + private: + /** + * Test one batch by batchId. It is only used for testOnePass. + * + * Durning testOnePass, each log_period will print cost statistics. + * + * @param batchId current batch id (from 0) + * @return num of tested samples. Zero if end of pass. + */ + int64_t testOneBatchById(int64_t batchId); + + /** + * Test whole pass in one batch. + * + * + * @param passId current pass id (from 0) + */ + void testOnePassBatch(int passId); + + /** + * test for one pass in several mini-batches. + * + * Used for sgd method. + * + * @param passId current pass id (from 0) + */ + void testOnePass(int passId); + + /** + * print the outArgs to a stream + * + * used for save feature file + * + * @param [in] outArgs output arguments for network. + * @param [in,out] os output stream. 
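+ * Each sample is written as one line: GPU values/ids are first copied into
+ * CPU buffers, and string outputs are separated by ';'.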
+ */ + void printOutput(const std::vector& outArgs, std::ostream& os); +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/TesterConfig.h b/paddle/legacy/trainer/TesterConfig.h new file mode 100644 index 0000000000000000000000000000000000000000..6c78f7cda347d5808d11e8af98672ef56898d643 --- /dev/null +++ b/paddle/legacy/trainer/TesterConfig.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include + +#include "hl_gpu.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" + +#include "TrainerConfig.pb.h" + +#include +#include +#include "ParameterUpdater.h" + +namespace paddle { + +/** + * TesterConfig + * general configs for training + */ +struct TesterConfig { + /** + * indicate test period + */ + int testPeriod; + + /** + * indicate whether to save previous batch state + */ + bool prevBatchState; + + /** + * log period + */ + int logPeriod; + + /** + * loadsave parameters in pserver + */ + bool loadsaveParametersInPserver; + + /** + * feat file + */ + std::string featFile; + + /** + * predict output dir + */ + std::string predictOutputDir; + + /** + * trianer id + */ + int trainerId; + + /** + * distribute test + */ + bool distributeTest; + + /** + * training state + */ + MachineState* trainState; + + /** + * test state + */ + MachineState* testState; + + /** + * model list + */ + std::string modelList; + + /** + * test passes + */ + int testPass; + + /** + * num passes + */ + int numPasses; + + /** + * saving period + */ + int savingPeriod; + + /** + * test wait + */ + int testWait; + + /** + * init model path + */ + std::string initModelPath; + + /** + * save only one + */ + bool saveOnlyOne; + + /** + * testing mode + */ + bool testing; + + /** + * mode + */ + int mode; + + /** + * config loc + */ + std::string config; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.cpp b/paddle/legacy/trainer/ThreadParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0601bdf24e3150f5d182e2addde3a91609a967e4 --- /dev/null +++ b/paddle/legacy/trainer/ThreadParameterUpdater.cpp @@ -0,0 +1,309 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "ThreadParameterUpdater.h" + +#include "paddle/legacy/utils/Logging.h" + +#include "paddle/legacy/math/SparseRowMatrix.h" +#include "paddle/legacy/parameter/ThreadLocalBuffer.h" +#include "paddle/legacy/utils/Thread.h" + +DECLARE_int32(trainer_count); + +namespace paddle { + +SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig) + : config_(optConfig), numSamplesProcessed_(0) { + // fill types + auto types = sgdOptimizerGetTypes(optConfig, false /*inPserver*/); + for (auto type : types) { + addParameterType(type); + } +} + +void SgdThreadUpdater::init(const std::vector& parameters) { + ParameterUpdater::init(parameters); + + // calc max parameter id + size_t maxId = 0; + for (auto& para : parameters_) { + maxId = std::max(maxId, para->getID()); + } + + optimizers_.resize(maxId + 1); + for (auto& para : parameters_) { + int pid = para->getID(); + optimizers_[pid].reset(sgdOptimizerCreate(config_, + para->getConfig(), + para->isGradSparseUpdate(), + false /*inPserver*/)); + size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0; + optimizers_[pid]->init(numRows, ¶->getConfig()); + if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) { + // For trainer_count=1, the gradient machine is NeuralNetwork, which does + // not create parameter buf for PARAMETER_GRADIENT for sparse update in + // Parameter::enableType(). But gradient parameter buf is still used + // in SgdThreadUpdater. We need to explicitly create it. + // + // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT + // as a temp buffer. + para->enableBufType(PARAMETER_GRADIENT); + } + } +} + +void SgdThreadUpdater::startPass() { + for (auto& para : parameters_) { + int pid = para->getID(); + optimizers_[pid]->startPass(); + } +} + +bool SgdThreadUpdater::finishPass() { + catchUpWith(); + + for (auto& para : parameters_) { + int pid = para->getID(); + optimizers_[pid]->finishPass(); + } + return true; +} + +void SgdThreadUpdater::updateImpl(Parameter* para) { + if (!para->useGpu()) return; + SetDevice setDevice(para->getDeviceId()); + ParameterOptimizer* optimizer = optimizers_[para->getID()].get(); + optimizer->update(para->getBufs(), para->getConfig()); + if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { + callback(para->getBufs(), para->getConfig(), -1LU); + } + + para->setValueUpdated(); + para->clearGradient(); +} + +void SgdThreadUpdater::threadTraverse( + const ParameterOptimizer::TraverseCallback& callback, + int tid, + size_t numThreads, + Parameter* para) { + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + if (para->isGradSparseUpdate()) { + size_t height = para->getConfig().dims(0); + size_t width = para->getConfig().dims(1); + for (size_t i = tid; i < height; i += numThreads) { + // setup sub bufs + for (auto type : parameterTypes_) { + vecs[type]->subVecFrom(*para->getBuf(type), i * width, width); + } + callback(vecs, para->getConfig(), i); + } + } else { // dense + // setup sub bufs + auto interval = calcSplitArrayInterval( + para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); + for (auto type : parameterTypes_) { + vecs[type]->subVecFrom(*para->getBuf(type), interval); + } + + callback(vecs, para->getConfig(), -1LU); + } +} + +void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) { + bool hasCpuPara = false; + bool hasGpuPara = false; + for (auto& para : parameters_) { + if (para->useGpu()) { + hasGpuPara = true; + } else { + hasCpuPara = true; + } + } + + auto cpuTraverse = [&](int 
tid, size_t numThreads) { + for (auto& para : parameters_) { + if (auto callback = getTraverseCallback(para.get())) { + threadTraverse(callback, tid, numThreads, para.get()); + } + } + }; + auto gpuTraverse = [&](int tid, size_t numThreads) { + for (auto& para : parameters_) { + if (para->useGpu()) { + if (auto callback = getTraverseCallback(para.get())) { + SetDevice setDevice(para->getDeviceId()); + callback(para->getBufs(), para->getConfig(), -1LU); + } + } + } + }; + + if (hasCpuPara && hasGpuPara) { + getGlobalSyncThreadPool()->exec(cpuTraverse, gpuTraverse); + } else if (hasCpuPara) { + getGlobalSyncThreadPool()->exec(cpuTraverse); + } else if (hasGpuPara) { + gpuTraverse(0, 0); + } +} + +void SgdThreadUpdater::catchUpWith() { + traverse([this](Parameter* para) { + return optimizers_[para->getID()]->startCatchUpWith(); + }); + + for (auto& para : parameters_) { + int pid = para->getID(); + optimizers_[pid]->finishCatchUpWith(); + } +} + +void SgdThreadUpdater::apply() { + catchUpWith(); + + traverse( + [this](Parameter* para) { return optimizers_[para->getID()]->apply(); }); +} + +void SgdThreadUpdater::restore() { + traverse([this](Parameter* para) { + return optimizers_[para->getID()]->restore(); + }); +} + +PassType SgdThreadUpdater::startBatch(int64_t batchSize) { + numSamplesProcessed_ += batchSize; + for (auto& para : parameters_) { + int pid = para->getID(); + optimizers_[pid]->startBatch(numSamplesProcessed_); + } + return PASS_TRAIN; +} + +void SgdThreadUpdater::finishBatch(real cost) { + getGlobalSyncThreadPool()->exec([&](int tid, size_t numThreads) { + for (auto& para : parameters_) { + if (para->isGradSparseUpdate()) { + threadUpdateSparse(tid, numThreads, para.get()); + } else if (!para->useGpu()) { + threadUpdateDense(tid, numThreads, para.get()); + } + } + }); + + for (auto& para : parameters_) { + int pid = para->getID(); + optimizers_[pid]->finishBatch(); + } +} + +void SgdThreadUpdater::threadUpdateSparse(int tid, + size_t numThreads, + Parameter* para) { + int pid = para->getID(); + ParameterOptimizer* optimizer = optimizers_[pid].get(); + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + + size_t height = para->getConfig().dims(0); + size_t width = para->getConfig().dims(1); + + if (dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get())) { + // From MultiGradientMachine + SparseRowIdsCpuMatrix* mainMat = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + std::vector& sparseIds = mainMat->getIds(tid); + + for (auto id : sparseIds) { + // setup sub bufs + for (auto type : parameterTypes_) { + vecs[type]->subVecFrom(*para->getBuf(type), id * width, width); + } + optimizer->update(vecs, para->getConfig(), id); + vecs[PARAMETER_GRADIENT]->zeroMem(); + } + sparseIds.clear(); + } else if (dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get())) { + // From NeuralNetwork + SparseRowCpuMatrix* mainMat = dynamic_cast( + para->getMat(PARAMETER_GRADIENT).get()); + + std::vector& localIndices = + mainMat->getIndexDictHandle()->localIndices; + + auto interval = + calcSplitArrayInterval(localIndices.size(), tid, numThreads); + for (size_t i = interval.first; i < interval.second; ++i) { + auto id = localIndices[i]; + real* row = mainMat->getLocalRow(i); + // setup sub bufs + for (auto type : parameterTypes_) { + if (type == PARAMETER_GRADIENT) { + vecs[type]->subVecFrom(row, 0, width); + } else { + vecs[type]->subVecFrom(*para->getBuf(type), id * width, width); + } + } + optimizer->update(vecs, para->getConfig(), id); + vecs[PARAMETER_GRADIENT]->zeroMem(); + } + // 
For numThreads > 1, MultiGradientMachine is used, which goes + // to the above branch. + CHECK_EQ(numThreads, 1UL); + mainMat->clearIndices(); + } else { + auto& m = *para->getMat(PARAMETER_GRADIENT).get(); + LOG(FATAL) << "Internal error: " << para->getName() << " " + << typeid(m).name(); + } + + if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { + for (size_t i = tid; i < height; i += numThreads) { + // setup sub bufs + for (auto type : parameterTypes_) { + vecs[type]->subVecFrom(*para->getBuf(type), i * width, width); + } + callback(vecs, para->getConfig(), i); + } + } +} + +void SgdThreadUpdater::threadUpdateDense(int tid, + size_t numThreads, + Parameter* para) { + int pid = para->getID(); + ParameterOptimizer* optimizer = optimizers_[pid].get(); + VectorPtr* vecs = parameter::getThreadLocalBuffer(); + + auto interval = calcSplitArrayInterval( + para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); + + // setup sub bufs + for (auto type : parameterTypes_) { + vecs[type]->subVecFrom(*para->getBuf(type), interval); + } + + // update + optimizer->update(vecs, para->getConfig()); + vecs[PARAMETER_GRADIENT]->zeroMem(); + + if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { + callback(vecs, para->getConfig(), -1LU); + } +} + +} // namespace paddle diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.h b/paddle/legacy/trainer/ThreadParameterUpdater.h new file mode 100644 index 0000000000000000000000000000000000000000..172287d4eb56828c83e6670226b4c1f179fac6d8 --- /dev/null +++ b/paddle/legacy/trainer/ThreadParameterUpdater.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/parameter/AverageOptimizer.h" +#include "paddle/legacy/parameter/FirstOrderOptimizer.h" +#include "paddle/legacy/parameter/OptimizerFunctions.h" +#include "paddle/legacy/parameter/OptimizerWithRegularizer.h" +#include "paddle/legacy/parameter/Parameter.h" +#include "paddle/legacy/parameter/Regularizer.h" +#include "paddle/legacy/utils/Util.h" + +#include +#include + +namespace paddle { + +/** + * \brief A parameter updater that uses multiple threads to update parameters. + This parameter updater handles GPU and CPU updates differently, + because at the current moment, the merging on CPU is happening on the + main thread, and the its parameter size can be much larger than the one GPU. + Thus, for GPU, the parameter updates happens in updateImpl() function, which + is called by gradient machines as a callback function supplied to backward() + and forwardBackward(). + For CPU, the parameter updates happens in separate threads maintained by this + class. + */ +class SgdThreadUpdater : public ParameterUpdater { + public: + explicit SgdThreadUpdater(const OptimizationConfig& optConfig); + virtual ~SgdThreadUpdater() {} + + // Use the startPass() function of the base optimizer. + virtual void startPass(); + + // Use the finishPass() function of the base optimizer. 
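+ // catchUpWith() runs first so lazily updated (sparse) rows are brought up to
+ // date before each per-parameter optimizer finishes the pass; always returns true.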
+ virtual bool finishPass(); + + virtual void init(const std::vector& parameters); + virtual PassType startBatch(int64_t batchSize); + // Call finishBatch for each optimizer. + virtual void finishBatch(real cost); + virtual void catchUpWith(); + virtual void apply(); + virtual void restore(); + + protected: + // This is the function that will be eventualy called by the GradientMachine. + // used only for GPU update. + virtual void updateImpl(Parameter* para); + OptimizationConfig config_; + int64_t numSamplesProcessed_; + + // One optimizers for each parameter. + std::vector> optimizers_; + + // The update function for CPU sparse parameters. + void threadUpdateSparse(int tid, size_t numThreads, Parameter* para); + + // The update function for CPU dense parameters. + void threadUpdateDense(int tid, size_t numThreads, Parameter* para); + // The update function for after update operations, such as averager. + void threadTraverse(const ParameterOptimizer::TraverseCallback& callback, + int tid, + size_t numThreads, + Parameter* para); + typedef std::function + GetTraverseCallback; + void traverse(GetTraverseCallback getTraverseCallback); +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/Trainer.cpp b/paddle/legacy/trainer/Trainer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2db754793cf19e0c29455f61ada5f1d15b3204af --- /dev/null +++ b/paddle/legacy/trainer/Trainer.cpp @@ -0,0 +1,653 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Trainer.h" + +#include + +#include +#include +#include +#include + +#include + +#include "paddle/legacy/utils/Common.h" +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +#include "RemoteParameterUpdater.h" +#include "TesterConfig.h" +#include "ThreadParameterUpdater.h" +#include "TrainerConfigHelper.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h" +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" + +DEFINE_string(config, "", "Trainer config file"); + +DEFINE_int32(test_period, + 0, + "if equal 0, do test on all test data at the end of " + "each pass. While if equal non-zero, do test on all test " + "data every test_period batches"); +DEFINE_bool(test_all_data_in_one_period, + false, + "This option was deprecated, since we will always do " + "test on all test set "); + +DEFINE_bool(local, true, "Train in local mode or not"); + +DEFINE_int32(average_test_period, + 0, + "Do test on average parameter every so" + " many batches. MUST be devided by FLAGS_log_period." 
+ " Default 0 means do not test average parameter"); + +DEFINE_int32(saving_period, 1, "Save parameteres every so many passes"); +DEFINE_int64(saving_period_by_batches, + 0, + "Save parameters every so many batches in one pass"); +DEFINE_string(save_dir, "", "Directory for saving model parameter"); +DEFINE_int32(start_pass, + 0, + "Start training from this pass. " + "Will load parameter from the previous pass"); +DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test"); +DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist"); +DEFINE_bool(with_cost, true, "enable cost layer or not"); +DEFINE_bool(distribute_test, false, "test in distribute mode"); + +DEFINE_int32(num_passes, 100, "train for so many passes"); + +DEFINE_string(config_args, + "", + "arguments passed to config file." + "Format: key1=value1,key2=value2"); + +DEFINE_bool(save_only_one, + false, + "Save only parameters in last pass, remove previous."); + +DEFINE_string(feat_file, "", "File name of extracted feature."); +DEFINE_string(predict_output_dir, + "", + "Directory that saves the predicted results of output layers"); +DEFINE_string(model_list, "", "File that saves the model list when evaluation"); + +namespace paddle { + +void Trainer::init(const std::shared_ptr& config, + bool testing, + const std::shared_ptr& gradientMachine, + const std::shared_ptr& dataProvider, + const std::shared_ptr& testDataProvider) { + this->stats_ = std::make_shared(); + + config_ = config; + + config_->updateConfigFromFlags(); + + testing_ = testing; + + // in testing, mode_ may GradientMachine::kTesting or + // GradientMachine::kSgdSparseCpuTraining + + if (FLAGS_local) { + CHECK(!FLAGS_loadsave_parameters_in_pserver) + << "local and loadsave_parameters_in_pserver can not both true"; + if (config_->getOptConfig().use_sparse_remote_updater()) { + config_->disableRemoteSparseUpdaterForEachParams(); + LOG(INFO) << "ignore sparse_remote_update=true due to --local=true"; + } + } + if (FLAGS_loadsave_parameters_in_pserver) { + CHECK(config_->getOptConfig().use_sparse_remote_updater()) + << "no parameter to load from pserver, please check network config"; + } + if (testing && !FLAGS_loadsave_parameters_in_pserver) { + if (config_->getOptConfig().use_sparse_remote_updater()) { + config_->disableRemoteSparseUpdater(); + LOG(INFO) << "because parameter is loaded local," + << "tester ignore sparse_remote_update flag"; + } + } + + CHECK(TrainAlgorithm::isValid(config_->getOptConfig().algorithm())) + << "invalid algorithm configuration: " + << config_->getOptConfig().algorithm(); + + bool useSparseUpdater = false; + for (auto& paraConfig : config_->getModelConfig().parameters()) { + if (paraConfig.sparse_update() || paraConfig.sparse_remote_update()) { + useSparseUpdater = true; + } + } + + if (FLAGS_use_mkldnn) { + CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer"; + } + + if (testing) { + LOG(INFO) << "trainer: in testing mode"; + if (config_->getOptConfig().use_sparse_remote_updater() || + FLAGS_trainer_count > 1) { + mode_ = GradientMachine::kSgdSparseCpuTraining; + LOG(INFO) << "trainer mode: SgdSparseCpuTraining"; + } else { + mode_ = GradientMachine::kTesting; + LOG(INFO) << "trainer mode: Testing"; + } + } else if (IGradientMachineMode::tryGetMode( + (int*)&mode_, + config_->getOptConfig().algorithm(), + FLAGS_trainer_count, + FLAGS_local, + FLAGS_use_gpu)) { + LOG(INFO) << "Custom trainer mode."; + } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD || + 
config_->getOptConfig().algorithm() == + TrainAlgorithm::AsyncSGD) && + useSparseUpdater) { + mode_ = GradientMachine::kSgdSparseCpuTraining; + LOG(INFO) << "trainer mode: SgdSparseCpuTraining"; + } else { + mode_ = GradientMachine::kNormal; + LOG(INFO) << "trainer mode: Normal"; + } + + // initialize trainer internal + trainerInternal_.init(config_, + gradientMachine, + TrainerInternalConfig::createFromMode(mode_), + stats_, + testing); + std::unique_ptr paramConfig( + new ParameterUtilConfig(FLAGS_save_only_one, + FLAGS_saving_period, + FLAGS_loadsave_parameters_in_pserver, + FLAGS_config)); + + paramUtil_.reset( + new paddle::ParameterUtil(config_, + std::move(paramConfig), + trainerInternal_.getGradientMachine(), + trainerInternal_.getParameterUpdater())); + + bool gpuData = + FLAGS_use_gpu && (!FLAGS_parallel_nn) && + (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count)); + + dataProvider_ = dataProvider; + if (!dataProvider_ && config_->hasDataConfig() && !testing_) { + dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); + } + if (!testDataProvider_) { + // No evaluator_ if there is testDataProvider but no dataProvider. + evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); + currentEvaluator_.reset( + trainerInternal_.getGradientMachine()->makeEvaluator()); + if (FLAGS_average_test_period > 0 && FLAGS_trainer_id == 0 && + config_->getOptConfig().average_window() > 0) { + CHECK_EQ(FLAGS_average_test_period % FLAGS_log_period, 0) + << "FLAGS_average_test_period must be divided by FALGS_log_period"; + averageEvaluator_.reset( + trainerInternal_.getGradientMachine()->makeEvaluator()); + } + } + + testDataProvider_ = testDataProvider; + if (!testDataProvider_ && config_->hasTestDataConfig()) { + testDataProvider_.reset( + DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); + } + if (testDataProvider_) { + createTester(); + } + + if (!testing && + (trainerInternal_.getGradientMachine()->hasStaticParameters())) { + CHECK(!FLAGS_loadsave_parameters_in_pserver) + << "is_static and loadsave_parameters_in_pserver can not both true"; + } + if (testing) { + // will load per pass for tester + } else if (paramUtil_->tryLoadParametersFromConfig()) { + // load from config already. 
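+ // Otherwise parameters are randomly initialized below; remote (pserver)
+ // copies are handled separately once the parameter updater is initialized.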
+ } else { + trainerInternal_.getGradientMachine()->randParameters(); + } + + // Only non static parameters need to be updated + std::vector& parameters = + trainerInternal_.getGradientMachine()->getNonStaticParameters(); + if (trainerInternal_.getParameterUpdater()) { + trainerInternal_.getParameterUpdater()->init(parameters); + + if (FLAGS_loadsave_parameters_in_pserver && FLAGS_trainer_id == 0) { + if (testing) { + // will load per pass for tester + } else if (!config_->getConfig().init_model_path().empty() && + (FLAGS_local || FLAGS_trainer_id == 0)) { + paramUtil_->loadParametersWithPath( + config_->getConfig().init_model_path(), + false /*local*/, + true /*remote*/); + } else if (config_->getConfig().start_pass() > 0 && + (FLAGS_local || FLAGS_trainer_id == 0)) { + CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1, + false /*local*/, + true /*remote*/)); + } else { + trainerInternal_.getParameterUpdater()->randParametersRemote(); + } + } + } + + // set current evaluator and evalutor + trainerInternal_.setCurrentEvaluator(currentEvaluator_.get()); + trainerInternal_.setEvaluator(evaluator_.get()); +} + +void Trainer::train(size_t numPasses) { + startTrain(); + for (size_t i = 0; i < numPasses; ++i) { + if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) { + trainOnePassBatch(config_->getConfig().start_pass() + i); + } else { + trainOnePass(); + } + if (i < numPasses - 1) { + dataProvider_->reset(); + } + } + + finishTrain(); +} + +static double genPerturbation(real* d, real* grad, size_t dim) { + auto& reng = ThreadLocalRandomEngine::get(); + std::uniform_real_distribution dist(-1, 1); + double gradNorm = 0, dNorm = 0; + for (size_t i = 0; i < dim; ++i) { + d[i] = dist(reng); + dNorm += d[i] * d[i]; + gradNorm += grad[i] * grad[i]; + } + if (gradNorm > 0) { + real s = 0.5 * sqrt(gradNorm / dNorm); + for (size_t i = 0; i < dim; ++i) { + d[i] = s * d[i] + grad[i]; + } + } + double delta = 0; + for (size_t i = 0; i < dim; ++i) { + delta += grad[i] * d[i]; + } + return delta; +} + +real Trainer::checkGradient() { + trainerInternal_.getGradientMachine()->start(); + std::vector& parameters = + trainerInternal_.getGradientMachine()->getNonStaticParameters(); + DataBatch dataBatch; + int32_t batchSize = config_->getOptConfig().batch_size(); + + dataProvider_->getNextBatch(batchSize, &dataBatch); + + CHECK(dataBatch.getSize()) << "No data from data provider"; + std::vector& inArgs = dataBatch.getStreams(); + std::vector outArgs; + + trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); + real cost = Argument::sum(outArgs); + LOG(INFO) << "original cost=" << cost; + trainerInternal_.getGradientMachine()->backward(); + + real maxDiff = 0; + char fill = ' '; + for (auto& parameter : parameters) { + CpuVector oldPara(parameter->getSize()); + CpuVector newPara(parameter->getSize()); + oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE)); + real* newp = newPara.getData(); + real* oldp = oldPara.getData(); + CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT)); + real* grad = cpuGrad.getData(); + size_t dim = parameter->getSize(); + std::vector d(dim); + + double delta = genPerturbation(d.data(), grad, dim); + + // use a step such that delta / cost is FLAGS_checkgrad_eps + real step = + (delta != 0) ? 
cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps; + delta *= step; + for (size_t i = 0; i < dim; ++i) { + newp[i] = oldp[i] + step * d[i]; + } + + parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); + parameter->setValueUpdated(); + trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); + real newCost1 = Argument::sum(outArgs); + + for (size_t i = 0; i < dim; ++i) { + newp[i] = oldp[i] - step * d[i]; + } + + parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); + parameter->setValueUpdated(); + trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); + real newCost2 = Argument::sum(outArgs); + + real trueDelta = 0.5 * (newCost1 - newCost2); + real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1; + LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(fill) + << std::setw(20) << parameter->getName() + << "step=" << std::setw(15) << step << "cost1=" << std::setw(10) + << newCost1 << "cost2=" << std::setw(10) << newCost2 + << "true_delta=" << std::setw(15) << trueDelta + << "analytic_delta=" << std::setw(15) << delta << "diff=" << diff + << (std::abs(diff) > 0.01 ? " ***" : ""); + + maxDiff = std::max(maxDiff, std::abs(diff)); + + // restore parameter + parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara); + parameter->setValueUpdated(); + + fill = (fill == ' ') ? '.' : ' '; + } + return maxDiff; +} + +void Trainer::startTrain() { + trainPassContext_.passId = config_->getConfig().start_pass(); + srand(config_->getConfig().start_pass() + 1); + if (dataProvider_) { + dataProvider_->reset(); + } + + trainerInternal_.getGradientMachine()->start(); +} + +void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); } + +void Trainer::startTrainPass() { + stats_->reset(); + trainPassContext_.batchId = 0; + trainPassContext_.avgTestCost = 0; + trainPassContext_.numAvgTests = 0; + trainPassContext_.passInnerId = 1; + + trainerInternal_.getParameterUpdater()->startPass(); + evaluator_->start(); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->resetState(); + trainerInternal_.getGradientMachine()->getState(testState_); + } +} + +void Trainer::trainOneDataBatch(DataBatch& dataBatch) { + int num = dataBatch.getSize(); + if (averageEvaluator_) { + int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period; + if (mod >= FLAGS_average_test_period - FLAGS_log_period) { + if (mod == FLAGS_average_test_period - FLAGS_log_period) { + averageEvaluator_->start(); + } + trainerInternal_.getParameterUpdater()->apply(); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->getState(trainState_); + } + trainPassContext_.avgTestCost += tester_->forwardOneBatch( + dataBatch, averageEvaluator_.get(), &forwardOutput_); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->setState(trainState_); + } + trainPassContext_.numAvgTests += num; + trainerInternal_.getParameterUpdater()->restore(); + } + } + { + REGISTER_TIMER("TrainBatch"); + trainerInternal_.trainOneBatch( + trainPassContext_.batchId, dataBatch, &forwardOutput_); + } + + if (averageEvaluator_ && + trainPassContext_.batchId % FLAGS_average_test_period == + FLAGS_average_test_period - 1) { + averageEvaluator_->finish(); + LOG(INFO) << " Averaged parameter:" + << " cost=" + << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests + << " Eval: " << *averageEvaluator_; + trainPassContext_.numAvgTests = 0; + trainPassContext_.avgTestCost = 0; + } + + ++trainPassContext_.batchId; + + if (trainPassContext_.batchId % 
FLAGS_log_period == 0) { + FOR_TIMING(globalStat.setThreadInfo(true)); + FOR_TIMING(globalStat.printAllStatus()); + FOR_TIMING(globalStat.reset()); + } + + if (testDataProvider_ && FLAGS_test_period > 0 && + trainPassContext_.batchId % FLAGS_test_period == 0) { + tester_->testOnePeriod(); + } + + if (FLAGS_saving_period_by_batches > 0 && + trainPassContext_.batchId > + FLAGS_saving_period_by_batches * trainPassContext_.passInnerId && + 0 == FLAGS_trainer_id) { + trainerInternal_.getParameterUpdater()->catchUpWith(); + if (testDataProvider_) { + tester_->testOnePeriod(); + } + paramUtil_->saveParametersOnePass(trainPassContext_.passId, + trainPassContext_.passInnerId); + ++trainPassContext_.passInnerId; + } +} + +void Trainer::finishTrainPass() { + if (trainPassContext_.batchId == 0) { + // This means no more data from DataProvider + return; + } + + trainerInternal_.finishTrainPass(trainPassContext_.passId, + trainPassContext_.batchId); + + FOR_TIMING(globalStat.setThreadInfo(true)); + FOR_TIMING(globalStat.printAllStatus()); + FOR_TIMING(globalStat.reset()); + + if (testDataProvider_) { + tester_->testOnePeriod(); + } + + if (trainPassContext_.passId % FLAGS_saving_period == 0 && + FLAGS_trainer_id == 0) { + paramUtil_->saveParametersOnePass(trainPassContext_.passId); + } + ++trainPassContext_.passId; +} + +void Trainer::trainOnePass() { + startTrainPass(); + size_t batchSize = config_->getOptConfig().batch_size(); + while (true) { + DataBatch dataBatch; + + int num = 0; + { + REGISTER_TIMER("getTrainBatch"); + num = dataProvider_->getNextBatch(batchSize, &dataBatch); + } + if (num == 0) break; + CHECK_EQ(num, dataBatch.getSize()); + trainOneDataBatch(dataBatch); + } + + finishTrainPass(); +} + +void Trainer::trainOnePassBatch(int passId) { + this->stats_->reset(); + + trainerInternal_.getParameterUpdater()->startPass(); + const std::vector inArgs; + { + REGISTER_TIMER("onePass"); + trainerInternal_.getGradientMachine()->forwardBackward( + inArgs, nullptr, PASS_TRAIN, nullptr); + } + + real cost = .0; + int64_t num = 0; + trainerInternal_.getGradientMachine()->getStats(cost, num); + *stats_ += {num, cost}; + + trainerInternal_.getGradientMachine()->onPassEnd(); + + bool accepted = trainerInternal_.getParameterUpdater()->finishPass(); + + globalStat.setThreadInfo(true); + globalStat.printAllStatus(); + globalStat.reset(); + + LOG(INFO) << " Pass=" << passId + << " AcceptedPass=" << (accepted ? 
acceptedPassId_ : -1) + << stats_->getStats(false /*withCurrentCost*/); + + if (accepted) { + if (acceptedPassId_ % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) { + paramUtil_->saveParameters(acceptedPassId_); + } + acceptedPassId_++; + if (FLAGS_save_only_one && acceptedPassId_ >= FLAGS_saving_period) { + paramUtil_->deleteParameters(acceptedPassId_ - FLAGS_saving_period); + } + } +} + +real Trainer::calcGradient(const DataBatch& dataBatch, + const Vector& value, + Vector& gradient) { + CHECK_EQ(value.getSize(), gradient.getSize()); + std::vector& parameters = + trainerInternal_.getGradientMachine()->getParameters(); + + clearGradient(); + + size_t offset = 0; + size_t valueSize = value.getSize(); + + for (auto& para : parameters) { + CHECK_LE(offset + para->getSize(), valueSize); + VectorPtr val = + Vector::create(para->getSize(), value.getMemoryHandle(), offset); + para->getBuf(PARAMETER_VALUE)->copyFrom(*val); + para->setValueUpdated(); + offset += para->getSize(); + } + + CHECK_EQ(offset, valueSize); + + std::vector inArgs = dataBatch.getStreams(); + std::vector outArgs; + + trainerInternal_.getGradientMachine()->forwardBackward( + inArgs, &outArgs, PASS_TRAIN); + real cost = Argument::sum(outArgs); + + offset = 0; + for (auto& para : parameters) { + VectorPtr grad = + Vector::create(para->getSize(), gradient.getMemoryHandle(), offset); + if (para->getBuf(PARAMETER_GRADIENT)) { + grad->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); + } + offset += para->getSize(); + } + + return cost; +} + +void Trainer::clearGradient() { + std::vector& parameters = + trainerInternal_.getGradientMachine()->getNonStaticParameters(); + for (auto& parameter : parameters) { + parameter->clearGradient(); + } +} + +int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); } + +void Trainer::createTester() { + tester_.reset(new paddle::Tester(config_, + createTesterConfig(), + trainerInternal_.getGradientMachine(), + trainerInternal_.getParameterUpdater(), + testDataProvider_)); +} + +void Trainer::test() { tester_->test(); } + +std::unique_ptr Trainer::createTesterConfig() { + TesterConfig* conf = new TesterConfig; + if (FLAGS_test_period) { + LOG(WARNING) << "The meaning of --test_period is changed: " + << "if equal 0, do test on all test data at the end of " + << "each pass. 
While if equal non-zero, do test on all test " + << "data every test_period batches "; + } + if (FLAGS_test_all_data_in_one_period) { + LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since " + << "we will always do test on all test set "; + } + conf->testPeriod = FLAGS_test_period; + conf->prevBatchState = FLAGS_prev_batch_state; + conf->logPeriod = FLAGS_log_period; + conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver; + conf->featFile = FLAGS_feat_file; + conf->predictOutputDir = FLAGS_predict_output_dir; + conf->trainerId = FLAGS_trainer_id; + conf->distributeTest = FLAGS_distribute_test; + conf->config = FLAGS_config; + conf->modelList = FLAGS_model_list; + conf->testPass = FLAGS_test_pass; + conf->numPasses = FLAGS_num_passes; + conf->savingPeriod = FLAGS_saving_period; + conf->testWait = FLAGS_test_wait; + conf->initModelPath = FLAGS_init_model_path; + conf->saveOnlyOne = FLAGS_save_only_one; + conf->testing = testing_; + conf->mode = mode_; + conf->trainState = &trainState_; + conf->testState = &testState_; + return std::unique_ptr(conf); +} + +ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); } +} // namespace paddle diff --git a/paddle/legacy/trainer/Trainer.h b/paddle/legacy/trainer/Trainer.h new file mode 100644 index 0000000000000000000000000000000000000000..b467f9af0cf12a39dd3d119c59e6cafcb05474b4 --- /dev/null +++ b/paddle/legacy/trainer/Trainer.h @@ -0,0 +1,204 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include + +#include "hl_gpu.h" +#include "paddle/legacy/gserver/dataproviders/DataProvider.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" + +#include +#include +#include "ParamUtil.h" +#include "ParameterUpdater.h" +#include "Tester.h" +#include "TrainerConfigHelper.h" +#include "TrainerInternal.h" + +DECLARE_int32(num_passes); + +namespace paddle { + +/** + * Trainer Class + * + * Trainer combines GradientMachine, ParameterUpdater, DataProvider together to + * train/test a NeuralNetwork. + */ +class Trainer { + public: + /** + * Ctor. + * @return + */ + Trainer() : acceptedPassId_(0) {} + + virtual ~Trainer() {} + + /** + * initialize a new trainer using config + * + * @param config TrainerConfig. + * @param testing true if only for testing + * @param gradientMachine GradientMachine that will be trained. + * nullptr if create from config. + * @param dataProvider Train Data Provider. null if create from config. + * @param testDataProvider Test Data Provider. null if create from config. + */ + virtual void init( + const std::shared_ptr& config, + bool testing = false, + const std::shared_ptr& gradientMachine = nullptr, + const std::shared_ptr& dataProvider = nullptr, + const std::shared_ptr& testDataProvider = nullptr); + + /** + * Train until num_passes reached. + * One pass means neural network train through all training data. + * + * @param numPasses the number of traning pass. 
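+ * Each pass runs trainOnePass(), or trainOnePassBatch() when the gradient
+ * machine mode trains the whole data set in one batch; the data provider is
+ * reset between passes.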
+ * @note Durning neural network training, the num passes may set a very large + * value, and kill training process when result is good enough. + */ + void train(size_t numPasses = (size_t)FLAGS_num_passes); + + /** + * compare the gradient from bp with finite difference + * @return the maximal difference + */ + real checkGradient(); + + void startTrain(); + void finishTrain(); + void startTrainPass(); + void finishTrainPass(); + void trainOneDataBatch(DataBatch& dataBatch); + void time(); + + /** + * given a dataBatch and the current parameter value + * calculate its gradient and return the cost. + * + * TODO(yuyang18): I think this method is deprecated and buggy. Should it be + * removed? + */ + real calcGradient(const DataBatch& dataBatch, + const Vector& value, + Vector& gradient); + + /** + * Get Trainer Config. + */ + const TrainerConfig& getConfig() const { return config_->getConfig(); } + + /** + * Get Train Data Provider + */ + const DataProviderPtr& getDataProvider() { return dataProvider_; } + + /** + * Get Gradient Machine. + */ + const GradientMachinePtr& getGradientMachine() { + return trainerInternal_.getGradientMachine(); + } + + /** + * Get batch size in optimization config. + * @note This method didn't return the actual batch size. Just batch size + * set in the optimization config. The actual batch size in one trainer may + * less than batch size in config due to there are not enough data. + */ + int getBatchSize(); + + /** + * Do test job + */ + void test(); + + /** + * Get parameter util ptr + * + * TODO(yuyang18): Make it return a smart pointer. + */ + ParameterUtil* getParameterUtilPtr(); + + protected: + /** + * Train one pass of data. + * + * SGD Method. + */ + void trainOnePass(); + + /** + * Train one pass in one batch. + * + */ + void trainOnePassBatch(int passId); + + /** + * set parameter gradient to zero + */ + void clearGradient(); + + void createTester(); + + private: + std::unique_ptr createTesterConfig(); + + protected: + std::shared_ptr config_; + std::shared_ptr stats_; + + DataProviderPtr dataProvider_; + DataProviderPtr testDataProvider_; + MachineState trainState_; + MachineState testState_; + + struct TrainPassContext { + int64_t batchId; + real avgTestCost; + int64_t numAvgTests; + int passId; + int passInnerId; + }; + std::vector forwardOutput_; + + TrainPassContext trainPassContext_; + + std::unique_ptr evaluator_; + std::unique_ptr currentEvaluator_; + std::unique_ptr averageEvaluator_; + // training mode + // used to decide which GradientMachine and ParameterUpdater to create + GradientMachine::CreateMode mode_; + int testing_; + int acceptedPassId_; + + // trainer tester + std::unique_ptr tester_; + + // parameter util + std::unique_ptr paramUtil_; + + // trainer Internal + TrainerInternal trainerInternal_; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/TrainerBenchmark.cpp b/paddle/legacy/trainer/TrainerBenchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f5bd2335481c417b466ac4ca9ca54798524045f --- /dev/null +++ b/paddle/legacy/trainer/TrainerBenchmark.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#undef PADDLE_DISABLE_TIMER + +#include "Trainer.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +DECLARE_int32(test_period); + +DEFINE_bool(feed_data, false, "Wether to read data from DataProvider."); + +namespace paddle { + +void Trainer::time() { + startTrain(); + + trainerInternal_.getParameterUpdater()->startPass(); + evaluator_->start(); + + DataBatch dataBatch; + int32_t batchSize = config_->getOptConfig().batch_size(); + int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch); + CHECK_EQ(num, batchSize) << "The sample number is less than batch size " + << num << " != " << batchSize; + + CHECK(dataBatch.getSize()) << "No data from data provider"; + + std::vector outputs; + // burning time + LOG(INFO) << "Burning time..."; + for (int n = 0; n < 10; ++n) { + trainerInternal_.trainOneBatch(n, dataBatch, &outputs); + } + LOG(INFO) << "Burning time end."; + + for (int n = 0; n < FLAGS_test_period; n++) { + if (FLAGS_feed_data) { + REGISTER_TIMER("GetData"); + num = dataProvider_->getNextBatch(batchSize, &dataBatch); + } + + if (num != batchSize) { + break; + } + + { + REGISTER_TIMER("FwdBwd"); + trainerInternal_.trainOneBatch(n, dataBatch, &outputs); + } + } + globalStat.setThreadInfo(true); + globalStat.printSegTimerStatus(); + globalStat.reset(); + + finishTrain(); +} + +} // namespace paddle diff --git a/paddle/legacy/trainer/TrainerConfigHelper.cpp b/paddle/legacy/trainer/TrainerConfigHelper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4d31ba8d71d52ac51191affc612a79b6734dee74 --- /dev/null +++ b/paddle/legacy/trainer/TrainerConfigHelper.cpp @@ -0,0 +1,199 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "TrainerConfigHelper.h" +#include "ParamUtil.h" +#include "TrainerConfig.pb.h" +#include "paddle/legacy/utils/Flags.h" +#include "paddle/legacy/utils/PythonUtil.h" + +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_int32(start_pass); +DECLARE_string(save_dir); +DECLARE_int32(trainer_id); +DECLARE_bool(local); +DECLARE_bool(with_cost); +DECLARE_bool(with_gpu); +DECLARE_bool(parallel_nn); +DECLARE_string(config_args); +DECLARE_bool(use_mkldnn); +DECLARE_bool(use_mkl_packed); + +const char *kConfigParserModuleName = "paddle.trainer.config_parser"; +const char *kConfigParserFuncName = "parse_config_and_serialize"; + +namespace paddle { + +struct TrainerConfigHelperPrivate { + TrainerConfig conf; +}; + +TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) + : m(new TrainerConfigHelperPrivate()) { + std::ostringstream configArgs; + configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local + << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu + << ",parallel_nn=" << FLAGS_parallel_nn + << ",use_mkldnn=" << FLAGS_use_mkldnn + << ",use_mkl_packed=" << FLAGS_use_mkl_packed + << ",cudnn_version=" << hl_get_cudnn_lib_version(); + if (!FLAGS_config_args.empty()) { + configArgs << "," << FLAGS_config_args; + } + + VLOG(3) << "Parsing trainer config " << configFilePath; + std::string configProtoStr = + callPythonFunc(kConfigParserModuleName, + kConfigParserFuncName, + {configFilePath, configArgs.str()}); + CHECK(m->conf.ParseFromString(configProtoStr)); +} + +TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) + : m(new TrainerConfigHelperPrivate()) { + m->conf = config; +} + +TrainerConfigHelper::~TrainerConfigHelper() { delete m; } + +const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } + +TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; } + +const OptimizationConfig &TrainerConfigHelper::getOptConfig() const { + return m->conf.opt_config(); +} + +const ModelConfig &TrainerConfigHelper::getModelConfig() const { + return m->conf.model_config(); +} + +const DataConfig *TrainerConfigHelper::getDataConfigPtr() const { + if (m->conf.has_data_config()) { + return &m->conf.data_config(); + } else { + return nullptr; + } +} + +const DataConfig &TrainerConfigHelper::getTestDataConfig() const { + CHECK(m->conf.has_test_data_config()); + return m->conf.test_data_config(); +} + +bool TrainerConfigHelper::hasDataConfig() const { + return m->conf.has_data_config(); +} + +bool TrainerConfigHelper::hasTestDataConfig() const { + return m->conf.has_test_data_config(); +} + +void TrainerConfigHelper::updateConfigFromFlags() { + if (!FLAGS_save_dir.empty()) { + m->conf.set_save_dir(FLAGS_save_dir); + } + if (!FLAGS_init_model_path.empty()) { + m->conf.set_init_model_path(FLAGS_init_model_path); + } + if (FLAGS_start_pass != 0) { + m->conf.set_start_pass(FLAGS_start_pass); + } +} + +void TrainerConfigHelper::disableRemoteSparseUpdater() { + m->conf.mutable_opt_config()->set_use_sparse_remote_updater(false); +} + +void TrainerConfigHelper::disableRemoteSparseUpdaterForEachParams() { + this->disableRemoteSparseUpdater(); + for (int i = 0; i < m->conf.model_config().parameters_size(); ++i) { + m->conf.mutable_model_config() + ->mutable_parameters(i) + ->set_sparse_remote_update(false); + } +} + +OptimizationConfig &TrainerConfigHelper::getOptConfig() { + return *m->conf.mutable_opt_config(); +} + +void TrainerConfigHelper::setSaveDir(const std::string &saveDir) { + 
m->conf.set_save_dir(saveDir); +} + +const std::string &TrainerConfigHelper::getSaveDir() const { + return m->conf.save_dir(); +} + +std::string TrainerConfigHelper::getConfigNameFromPath( + const std::string &modelPath) { + std::ifstream s(path::join(modelPath, "path.txt")); + CHECK(s.is_open()) << " fail to open path.txt"; + std::string ss; + getline(s, ss); + VLOG(3) << "fileName " << path::join(modelPath, ss); + s.close(); + return path::join(modelPath, ss); +} + +std::string TrainerConfigHelper::getConfigNameFromPassId( + int passId, const std::string &modelPath) { + constexpr int kBufLen = 100; + char buf[kBufLen]; + snprintf(buf, kBufLen, "pass-%05d", passId); + return TrainerConfigHelper::getConfigNameFromPath(path::join(modelPath, buf)); +} + +std::string TrainerConfigHelper::getConfigName(bool *ok) const { + std::string retv = ""; + + if (!m->conf.config_file().empty()) { + retv = m->conf.config_file(); + } else if (!m->conf.init_model_path().empty()) { + retv = getConfigNameFromPath(m->conf.init_model_path()); + } else if (m->conf.start_pass() >= 1) { + retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir()); + } + + if (ok) { + *ok = !retv.empty(); + } + + return retv; +} + +std::shared_ptr TrainerConfigHelper::createFromFlags() { + std::string configPath; + if (!FLAGS_config.empty()) { + configPath = FLAGS_config; + } else if (!FLAGS_init_model_path.empty()) { + configPath = getConfigNameFromPath(FLAGS_init_model_path); + } else if (FLAGS_start_pass >= 1) { + configPath = + getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path); + } else { + return nullptr; + } + return std::make_shared(configPath); +} + +std::shared_ptr +TrainerConfigHelper::createFromFlagConfig() { + CHECK(!FLAGS_config.empty()); + return std::make_shared(FLAGS_config); +} + +} // namespace paddle diff --git a/paddle/legacy/trainer/TrainerConfigHelper.h b/paddle/legacy/trainer/TrainerConfigHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..0e428bea2c4b44bf98772ccca8f8b10d315efbbd --- /dev/null +++ b/paddle/legacy/trainer/TrainerConfigHelper.h @@ -0,0 +1,205 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { + +class TrainerConfig; +class OptimizationConfig; +struct TrainerConfigHelperPrivate; +class ModelConfig; +class DataConfig; + +/** + * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object, + * simplize the usage for TrainerConfig. + * + * The all operation to TrainerConfig object should use this object. It remove + * many copy & paste code in trainer. + * + * @TODO(yuyang18): Make cmake check compiler support keyword 'final' or not. + * Define a macro to unify 'final' keyword + */ +class TrainerConfigHelper /*final*/ { + public: + DISABLE_COPY(TrainerConfigHelper); + + /** + * @brief Ctor, Create a TrainerConfig from config file + * @param configFilePath Config file path. 
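+ * The file is parsed by calling paddle.trainer.config_parser
+ * parse_config_and_serialize through the embedded Python interpreter, with
+ * trainer_id/local/use_gpu and any --config_args passed as key=value pairs.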
+ */ + explicit TrainerConfigHelper(const std::string& configFilePath); + explicit TrainerConfigHelper(const TrainerConfig& config); + + /** + * Dtor + * @warning this class is a final class. Should not be inherited. + */ + ~TrainerConfigHelper(); + + /** + * @brief Get Trainer Config itself. + */ + const TrainerConfig& getConfig() const; + + TrainerConfig& getMutableConfig(); + + /** + * @brief Get Optimizer Config. + */ + const OptimizationConfig& getOptConfig() const; + + /** + * @brief Get Model Config. + */ + const ModelConfig& getModelConfig() const; + + /** + * @brief Get Train Data Config Pointer. + * @return nullptr if there is no train data. Else will return pointer + */ + const DataConfig* getDataConfigPtr() const; + + /** + * @brief Get Tain Data Config. + * @warning Core when there is no train data. + */ + const DataConfig& getDataConfig() const { + CHECK(this->hasDataConfig()); + auto conf = this->getDataConfigPtr(); + return *conf; + } + + /** + * @brief Get test data config + * @warning Core when there is no test data. + */ + const DataConfig& getTestDataConfig() const; + + /** + * @brief Has train data config or not. + * @return true if has train data. + */ + bool hasDataConfig() const; + + /** + * @brief Has test data config or not. + * @return true if has test data. + */ + bool hasTestDataConfig() const; + + /** + * @brief Update trainer config from command line flags. + * Override config's (save_dir, init_model_path, start_pass) if command + * flags is existed. + */ + void updateConfigFromFlags(); + + /** + * @brief Disable optimization's sparse remote update. + */ + void disableRemoteSparseUpdater(); + + /** + * @brief Disable optimization and each parameter's sparse remote update. + */ + void disableRemoteSparseUpdaterForEachParams(); + + /** + * @brief implicit conversion. + */ + inline operator const TrainerConfig&() const { return this->getConfig(); } + + /** + * @brief implicit conversion. + */ + inline operator const OptimizationConfig&() const { + return this->getOptConfig(); + } + + /** + * @brief implicit conversion. + */ + inline operator const DataConfig&() const { return this->getDataConfig(); } + + /** + * @brief implicit conversion. + */ + inline operator const ModelConfig&() const { return this->getModelConfig(); } + + /** + * @brief Get mutable optimization config. + */ + OptimizationConfig& getOptConfig(); + + /** + * @brief set model save directory. + * @param saveDir Directory path. + */ + void setSaveDir(const std::string& saveDir); + + /** + * @brief get model save directory. + * @return save directory path. + */ + const std::string& getSaveDir() const; + + /** + * @brief Get config file name from model path. + * + * Paddle save model to a directory, and write a file 'path.txt' which save + * config filename. + * + * @param modelPath model saved directory. + * @return config file name. + */ + static std::string getConfigNameFromPath(const std::string& modelPath); + + /** + * @brief Get config file name from this config instance. + * @param[out] ok true if no error. + * @return config file name. + */ + std::string getConfigName(bool* ok = nullptr) const; + + /** + * @brief Try to create TrainerConfigHelper from all command line flags. + * Try to load from --config, --init_model_path, --start_pass one by + * one. Return nullptr if cannot load TrainerConfigHelper from all + * these place. + * @return nullptr if cannot load, otherwise return a TrainerConfigHelper. 
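createFromFlags(), declared just below and implemented in the .cpp above, resolves the config file in a fixed order: an explicit --config flag first, then the path.txt file stored inside --init_model_path, then a pass-%05d directory derived from --start_pass. A rough standalone sketch of that fallback order; names ending in Stub are placeholders for this sketch, not Paddle APIs:

#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>

std::string readConfigNameStub(const std::string &modelDir) {
  // The real getConfigNameFromPath() reads <modelDir>/path.txt, which stores
  // the name of the config file that was used when the model was saved.
  std::ifstream in(modelDir + "/path.txt");
  std::string name;
  std::getline(in, name);
  return modelDir + "/" + name;
}

std::string resolveConfigStub(const std::string &flagConfig,
                              const std::string &flagInitModelPath,
                              int flagStartPass) {
  if (!flagConfig.empty()) return flagConfig;           // 1. --config wins
  if (!flagInitModelPath.empty())                       // 2. saved-model dir
    return readConfigNameStub(flagInitModelPath);
  if (flagStartPass >= 1) {                             // 3. resume directory
    char buf[32];
    std::snprintf(buf, sizeof(buf), "pass-%05d", flagStartPass - 1);
    return readConfigNameStub(buf);                     // <dir>/path.txt again
  }
  return "";                                            // nothing to load
}

int main() {
  std::cout << resolveConfigStub("my_conf.py", "", 0) << "\n";  // my_conf.py
  return 0;
}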
+ */ + static std::shared_ptr createFromFlags(); + + /** + * @brief Try to create TrainerConfigHelper only from '--config' flag. + * @return nullptr if cannot load, otherwise return a TrainerConfigHelper. + */ + static std::shared_ptr createFromFlagConfig(); + + private: + static std::string getConfigNameFromPassId(int passId, + const std::string& modelPath); + + TrainerConfigHelperPrivate* m; +}; + +typedef std::shared_ptr TrainerConfigHelperPtr; + +} // namespace paddle diff --git a/paddle/legacy/trainer/TrainerInternal.cpp b/paddle/legacy/trainer/TrainerInternal.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ee3dea6340167ab16d2bfefe3d757b10f5d90bb5 --- /dev/null +++ b/paddle/legacy/trainer/TrainerInternal.cpp @@ -0,0 +1,303 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "TrainerInternal.h" + +#include +#include + +#include +#include +#include +#include + +#include + +#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h" +#include "paddle/legacy/gserver/layers/ValidationLayer.h" +#include "paddle/legacy/utils/GlobalConstants.h" +#include "paddle/legacy/utils/PythonUtil.h" +#include "paddle/legacy/utils/Stat.h" +#include "paddle/legacy/utils/Util.h" + +#include "RemoteParameterUpdater.h" +#include "ThreadParameterUpdater.h" + +namespace paddle { + +void TrainerInternal::init(const std::shared_ptr& config, + const GradientMachinePtr& gradientMachine, + std::unique_ptr&& intconfig, + const std::shared_ptr& stats, + bool testing) { + config_ = config; + intconfig_ = std::move(intconfig); + stats_ = stats; + + //! in training will use parameter updater definitly. + //! But only use parameter in testing mode when some parameter in pserver. 
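+  //! In other words: an updater is always created for training; for testing it
+  //! is only created when sparse parameters live on the parameter server and
+  //! have to be fetched remotely before evaluation.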
+ if (!testing || (config_->getOptConfig().use_sparse_remote_updater() && + intconfig_->loadsave_parameters_in_pserver)) { + createParameterUpdater(testing); + } + + gradientMachine_ = gradientMachine; + if (!gradientMachine) { + CHECK(config_->getConfig().has_model_config()) + << "Missing model_config in trainer_config"; + gradientMachine_.reset( + GradientMachine::create(config_->getConfig().model_config(), + intconfig_->mode, + parameterUpdater_->getParameterTypes())); + } +} + +void TrainerInternal::trainOneBatch(int64_t batchId, + const DataBatch& dataBatch, + std::vector* outArgs) { + // true means updating parameter whenever gradient is ready during backward() + bool doPipelineUpdate = + (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) && + (intconfig_->local || intconfig_->use_gpu || + intconfig_->trainer_count <= 1); + + int64_t actualBatchSize = dataBatch.getSize(); + if (actualBatchSize == 0) { + return; + } + + bool showStats = intconfig_->show_param_stats_period > 0 && + (batchId + 1) % intconfig_->show_param_stats_period == 0 && + intconfig_->trainer_id == 0; + + std::vector paraStats; + if (showStats) { + paraStats.resize(gradientMachine_->getParameters().size()); + } + + const std::vector& inArgs = dataBatch.getStreams(); + + PassType passType = parameterUpdater_->startBatch(actualBatchSize); + + if (config_->getOptConfig().use_sparse_remote_updater()) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(); + } + + UpdateCallback updateCallback = [this, showStats, ¶Stats]( + Parameter* para) { + if (showStats) { + //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor + // it + //! to ParameterHook. + auto& grad = para->getBuf(PARAMETER_GRADIENT); + SetDevice device(para->getDeviceId()); + paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize(); + paraStats[para->getID()].maxAbsGrad = grad->getAbsMax(); + } + parameterUpdater_->update(para); + }; + + { +#ifndef PADDLE_DISABLE_TIMER + Timer timer; + timer.start(); +#endif + REGISTER_TIMER("forwardBackward"); + forwardBackwardBatch( + inArgs, *outArgs, passType, updateCallback, doPipelineUpdate); +#ifndef PADDLE_DISABLE_TIMER + timer.stop(); + parameterUpdater_->setForwardbackwardTime(timer.get()); +#endif + } + + if (!doPipelineUpdate) { + auto& parameters = gradientMachine_->getNonStaticParameters(); + for (auto& para : parameters) { + updateCallback(para.get()); + } + } + + real cost = 0; + { + REGISTER_TIMER("sumCost"); + cost = Argument::sum(*outArgs); + } + + if (batchId % intconfig_->log_period == 0) { + currentEvaluator_->start(); + stats_->resetCurrentStat(); + } + { + REGISTER_TIMER("eval"); + gradientMachine_->eval(currentEvaluator_); + gradientMachine_->eval(evaluator_); + } + + *stats_ += {actualBatchSize, cost}; + { + REGISTER_TIMER("finishBatch"); + parameterUpdater_->finishBatch(cost); + } + + if (showStats) { + showParameterStats(paraStats); + } + if ((batchId + 1) % intconfig_->log_period == 0) { + currentEvaluator_->finish(); + + if (intconfig_->dot_period > 0) { + std::cerr << std::endl; + } + LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_ + << " Eval: " << *evaluator_ + << " CurrentEval: " << *currentEvaluator_; + } else if (intconfig_->dot_period > 0 && + (batchId + 1) % intconfig_->dot_period == 0) { + std::cerr << "."; + } +} + +/** + * finish train pass + */ +void TrainerInternal::finishTrainPass(int passId, int batchId) { + gradientMachine_->onPassEnd(); + parameterUpdater_->finishPass(); + 
evaluator_->finish(); + LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " " + << stats_->getStats(false /*without current cost*/) + << " Eval: " << *evaluator_; +} + +void TrainerInternal::showParameterStats( + const std::vector& paraStats) { + std::vector& parameters = gradientMachine_->getParameters(); + for (auto& parameter : parameters) { + SetDevice device(parameter->getDeviceId()); + real sum = parameter->getBuf(PARAMETER_VALUE)->getAbsSum(); + const auto& lr = parameter->getBuf(PARAMETER_LEARNING_RATE); + std::ostringstream osLrHistogram; + if (lr) { + if (VLOG_IS_ON(2)) { + osLrHistogram << " lr_histogram: "; + lr->histogram(osLrHistogram); + } else { + osLrHistogram << " max_lr=" << std::setw(11) << lr->getMax() + << " min_lr=" << std::setw(11) << lr->getMin() + << " avg_lr=" << std::setw(11) + << lr->getSum() / parameter->getSize(); + } + } + int pid = parameter->getID(); + LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') + << std::setw(20) << parameter->getName() + << " avg_abs_val=" << std::setw(11) << sum / parameter->getSize() + << " max_val=" << std::setw(11) + << parameter->getBuf(PARAMETER_VALUE)->getAbsMax() + << " avg_abs_grad=" << std::setw(11) << paraStats[pid].avgAbsGrad + << " max_grad=" << std::setw(11) << paraStats[pid].maxAbsGrad + << osLrHistogram.str(); + } +} + +void TrainerInternal::createParameterUpdater(bool testing) { + const std::string& alg = config_->getOptConfig().algorithm(); + parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater( + alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes)); + if (parameterUpdater_) { + return; + } + + if (!intconfig_->local) { + if (testing && config_->getOptConfig().use_sparse_remote_updater()) { + std::unique_ptr localUpdater; + localUpdater.reset( + new SgdLocalUpdater(config_->getOptConfig())); // do nothing + parameterUpdater_.reset( + new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(), + intconfig_->num_passes, + testing, + std::move(localUpdater))); + } else { + if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode && + !intconfig_->use_old_updater) { + intconfig_->use_old_updater = true; + LOG(INFO) << "Sgd sparse training can not work with" + << " ConcurrentRemoteParameterUpdater," + << " automatically reset --use_old_updater=true"; + } + + std::unique_ptr localUpdater; + if (config_->getOptConfig().num_batches_per_send_parameter() > 1) { + CHECK(alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) + << "Unsupported algorithm in remote-local mode: " << alg; + if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) { + localUpdater.reset(new SgdThreadUpdater(*config_)); + } else { + localUpdater.reset(new SgdLocalUpdater(*config_)); + } + } + + localUpdater.reset( + intconfig_->use_old_updater + ? 
new RemoteParameterUpdater( + *config_, intconfig_->num_passes, std::move(localUpdater)) + : new ConcurrentRemoteParameterUpdater( + *config_, intconfig_->num_passes, std::move(localUpdater))); + + if (config_->getOptConfig().use_sparse_remote_updater()) { + localUpdater.reset( + new SparseRemoteParameterUpdaterComposite(*config_, + intconfig_->num_passes, + testing, + std::move(localUpdater))); + } + + this->parameterUpdater_ = std::move(localUpdater); + } + } else { + CHECK_EQ(config_->getOptConfig().num_batches_per_send_parameter(), 1) + << "num_batches_per_send_parameter should be one in local mode!"; + + if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) { + parameterUpdater_.reset(new SgdThreadUpdater(*config_)); + } else if (alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) { + if (config_->getModelConfig().type() == "recursive_nn") { + parameterUpdater_.reset(new SgdCpuUpdater(*config_)); + } else if (intconfig_->use_gpu && + config_->getOptConfig().do_average_in_cpu() && + config_->getOptConfig().average_window() > 0) { + parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_)); + } else { + parameterUpdater_.reset(new SgdLocalUpdater(*config_)); + } + } else { + LOG(FATAL) << "Unsupported algorithm in local mode: " << alg; + } + } +} + +void TrainerInternal::forwardBackwardBatch(const std::vector& inArgs, + std::vector& outArgs, + PassType& passType, + UpdateCallback updateCallback, + bool doPipelineUpdate) { + gradientMachine_->forwardBackward( + inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr); +} + +} // namespace paddle diff --git a/paddle/legacy/trainer/TrainerInternal.h b/paddle/legacy/trainer/TrainerInternal.h new file mode 100644 index 0000000000000000000000000000000000000000..93919a68fca2930cdf106f45d112e2a459fe695a --- /dev/null +++ b/paddle/legacy/trainer/TrainerInternal.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
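trainOneBatch() above follows a fixed choreography: the updater is told a batch is starting, forward/backward runs (with pipelined updates the per-parameter callback fires as soon as a gradient is ready), otherwise an explicit update loop runs afterwards over the non-static parameters, the batch cost is summed, and finishBatch() closes the step. A condensed, framework-free sketch of that control flow; every type below is a stand-in used only to make the sketch compile, not a Paddle class:

#include <iostream>
#include <vector>

struct Param { double grad = 0.0; };

struct Updater {
  void startBatch(long n)    { std::cout << "startBatch(" << n << ")\n"; }
  void update(Param*)        { std::cout << "update one parameter\n"; }
  void finishBatch(double c) { std::cout << "finishBatch cost=" << c << "\n"; }
};

int main() {
  Updater updater;
  std::vector<Param> params(3);
  bool doPipelineUpdate = false;  // true: update inside backward, per parameter

  updater.startBatch(/*actualBatchSize=*/100);

  // forwardBackward(): with pipelined updates the callback fires per parameter
  // as soon as its gradient is ready; otherwise the update is deferred.
  auto updateCallback = [&](Param *p) { updater.update(p); };
  for (auto &p : params) {        // stands in for the backward pass
    p.grad = 1.0;
    if (doPipelineUpdate) updateCallback(&p);
  }

  if (!doPipelineUpdate) {        // deferred update over non-static parameters
    for (auto &p : params) updateCallback(&p);
  }

  double cost = 0.123;            // Argument::sum(outArgs) in the real code
  updater.finishBatch(cost);
  return 0;
}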
*/ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include +#include +#include + +#include "ParameterUpdater.h" +#include "TrainerConfig.pb.h" +#include "TrainerConfigHelper.h" +#include "TrainerInternalConfig.h" +#include "hl_gpu.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" + +namespace paddle { + +/** + * TrainerInteral + * the core training class for driving training logic + */ +class TrainerInternal { + public: + struct ParaStat { + real maxAbsGrad; + real avgAbsGrad; + ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {} + }; + + TrainerInternal() {} + + /** + * Intializes trainer internal class + * @param config network config + * @param machine gradient machine + * @param intconfig training config + * @param stats training stats + * @param testing if it is in testing phase + */ + void init(const std::shared_ptr& config, + const GradientMachinePtr& machine, + std::unique_ptr&& intconfig, + const std::shared_ptr& stats, + bool testing); + + virtual ~TrainerInternal() {} + + /** + * CreateParameterUpdater + * @param testing if it is in testing phase + */ + void createParameterUpdater(bool testing); + + /** + * FinishTrainPass + * @param passId current pass id + * @param batchId current batch id, starts from 0 + */ + void finishTrainPass(int passId, int batchId); + + /** + * trainOneBatch + * @param batchId current batch id + * @param dataBatch data for the batch + */ + void trainOneBatch(int64_t batchId, + const DataBatch& dataBatch, + std::vector* outArgs); + + /** + * showParameterStats + * @param paraStats training stats + */ + void showParameterStats(const std::vector& paraStats); + + /** + * getGradientMachine + */ + inline const GradientMachinePtr& getGradientMachine() const { + return gradientMachine_; + } + + /** + * getParameterUpdater + */ + inline const std::shared_ptr& getParameterUpdater() { + return parameterUpdater_; + } + + /** + * setCurrentEvaluator + * @param eval evaluator to set + */ + inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; } + + /** + * setEvaluator + * @param eval evaluator to set + */ + inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; } + + /** + * forwardBackwardBatch + * @param inArgs input argument for data batch + * @param outArgs output argument from neural network + * @param updateCallback layerwise parameter gradient statistics + * @param doPipelineUpdate whether to do pipeline update + */ + virtual void forwardBackwardBatch(const std::vector& inArgs, + std::vector& outArgs, + PassType& passType, + UpdateCallback updateCallback, + bool doPipelineUpdate); + + protected: + std::shared_ptr parameterUpdater_; + GradientMachinePtr gradientMachine_; + std::shared_ptr config_; + std::unique_ptr intconfig_; + std::shared_ptr stats_; + Evaluator* currentEvaluator_; + Evaluator* evaluator_; +}; + +} // namespace paddle diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/legacy/trainer/TrainerInternalConfig.cpp similarity index 100% rename from paddle/trainer/TrainerInternalConfig.cpp rename to paddle/legacy/trainer/TrainerInternalConfig.cpp diff --git a/paddle/legacy/trainer/TrainerInternalConfig.h b/paddle/legacy/trainer/TrainerInternalConfig.h new file mode 100644 index 0000000000000000000000000000000000000000..b91b53932381a8698b331a2989b5f16829c06a25 --- /dev/null +++ b/paddle/legacy/trainer/TrainerInternalConfig.h @@ -0,0 +1,233 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/legacy/utils/Util.h" + +#include + +#include "hl_gpu.h" +#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h" + +#include "TrainerConfig.pb.h" + +#include +#include +#include +#include "ParameterUpdater.h" + +namespace paddle { +/** + * @brief TrainerStats object will statistics sample processed and total cost. + * + * There are two stats in it, the 'AvgCost' and 'CurrentAvgCost'. 'AvgCost' + * means cost through one pass(all mini-batches). 'CurrentAvgCost' means cost + * through one mini-batch. + */ +class TrainerStats { + public: + /** + * @brief reset all stats. + * + * often used before pass start. + */ + inline void reset() { + numProcessed_ = 0; + totalCost_ = .0; + this->resetCurrentStat(); + } + + /** + * @brief reset current stat. + * + * 'current' means the most recent --log_period mini-batches + */ + inline void resetCurrentStat() { + currentCost_ = .0; + currentSamples_ = 0; + } + + /** + * @brief add cost to stat. + * @param numProcessed current mini-batch size + * @param cost current mini-batch cost + */ + inline void addCost(int64_t numProcessed, real cost) { + this->numProcessed_ += numProcessed; + this->totalCost_ += cost; + this->currentSamples_ += numProcessed; + this->currentCost_ += cost; + } + + /** + * @brief get average cost through on pass(all processed mini-batches) + * @return pass average cost + */ + inline real getAvgCost() const { + CHECK_NE(this->numProcessed_, 0); + return this->totalCost_ / this->numProcessed_; + } + + /** + * @brief get current mini-batch's average cost. + * @return mini-batch average cost + */ + inline real getCurrentAvgCost() const { + CHECK_NE(this->currentSamples_, 0); + return this->currentCost_ / this->currentSamples_; + } + + /** + * @brief get all processed samples' number + * @return all processed samples' number + */ + inline int64_t getNumProcessed() const { return this->numProcessed_; } + + /** + * @brief same function as addCost. But it is simple to invoke. + * For example: + * + * @code{.cpp} + * TrainerStats stat; + * cost = neuralNetwork.forward(batchSize); + * stat += {batchSize, cost}; + * @endcode + * + * @param p a pair of parameter, first is numProcessed, second is cost. + * @return *this + */ + inline TrainerStats& operator+=(const std::pair& p) { + this->addCost(p.first, p.second); + return *this; + } + + /** + * @brief TrainerStats Constructor. + * + * reset stat when constructed. + */ + inline TrainerStats() { this->reset(); } + + /** + * @brief show stats to ostream. + * + * If there is no need to print current cost, set withCurrentCost to False. + * + * @param os output stream. + * @param withCurrentCost print current cost or not. 
+ */ + void showStats(std::ostream& os, bool withCurrentCost = true) const { + os << "samples=" << this->getNumProcessed() + << " AvgCost=" << this->getAvgCost(); + if (withCurrentCost) { + os << " CurrentCost=" << this->getCurrentAvgCost(); + } + } + + /** + * @brief get stats to std::string + * @param withCurrentCost return current cost or not + * @return stats string + */ + std::string getStats(bool withCurrentCost = true) const { + std::ostringstream os; + this->showStats(os, withCurrentCost); + return os.str(); + } + + private: + int64_t numProcessed_; + real totalCost_; + real currentCost_; + int64_t currentSamples_; +}; + +inline std::ostream& operator<<(std::ostream& os, const TrainerStats& stats) { + stats.showStats(os); + return os; +} + +/** + * TrainerInternalConfig + * general configs for training + */ +struct TrainerInternalConfig { + /** + * @brief Create TrainerInternalConfig from GradientMachine::CreateMode and + * command line arguments. + * @param mode + * @return + */ + static std::unique_ptr createFromMode( + GradientMachine::CreateMode mode); + + /** + * indicate whether the training is local + * if local, no parameter server is used + */ + bool local; + + /** + * indicate whether training uses GPU + */ + bool use_gpu; + + /** + * indicate number of trainer + */ + int trainer_count; + + /** + * how frequently to show param stats + */ + int show_param_stats_period; + + /** + * current trainer id + */ + int trainer_id; + + /** + * frequency to dump log + */ + int log_period; + + /** + * dot period + */ + int dot_period; + + /** + * num passes for training + */ + int num_passes; + + /** + * use old updater + */ + bool use_old_updater; + + /** + * whether to load and save parameter in pserver + */ + bool loadsave_parameters_in_pserver; + + /** + * training mode + */ + GradientMachine::CreateMode mode; +}; + +} // namespace paddle diff --git a/paddle/legacy/trainer/TrainerMain.cpp b/paddle/legacy/trainer/TrainerMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..911aeba1928f7208aecb92910dac981f00fc6db5 --- /dev/null +++ b/paddle/legacy/trainer/TrainerMain.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
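TrainerStats above maintains two running averages: AvgCost over every sample processed in the pass, and CurrentAvgCost over only the samples accumulated since the last resetCurrentStat() call, i.e. the current --log_period window. A small self-contained imitation of that bookkeeping, deliberately independent of the real class:

#include <cassert>
#include <iostream>
#include <utility>

class MiniStats {
 public:
  MiniStats() { reset(); }
  void reset() { total_ = 0; totalCost_ = 0; resetCurrent(); }
  void resetCurrent() { cur_ = 0; curCost_ = 0; }
  MiniStats& operator+=(const std::pair<long, double>& p) {
    total_ += p.first;  totalCost_ += p.second;   // whole-pass accumulators
    cur_   += p.first;  curCost_   += p.second;   // current log window
    return *this;
  }
  double avgCost() const { assert(total_ > 0); return totalCost_ / total_; }
  double currentAvgCost() const { assert(cur_ > 0); return curCost_ / cur_; }
 private:
  long total_, cur_;
  double totalCost_, curCost_;
};

int main() {
  MiniStats stats;
  stats += {100, 70.0};      // batch 1: 100 samples, summed cost 70
  stats += {100, 50.0};      // batch 2
  std::cout << stats.avgCost() << "\n";         // 0.6 over the whole pass
  stats.resetCurrent();                         // a --log_period boundary
  stats += {100, 30.0};      // batch 3
  std::cout << stats.currentAvgCost() << "\n";  // 0.3 over the current window
  std::cout << stats.avgCost() << "\n";         // 0.5 over all three batches
  return 0;
}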
*/ + +#include +#include "paddle/legacy/pserver/ParameterServerController.h" +#include "paddle/legacy/utils/PythonUtil.h" + +#include "ParamUtil.h" +#include "Trainer.h" + +DEFINE_bool(start_pserver, false, "Whether to start pserver"); +DECLARE_int32(gpu_id); +DEFINE_string(job, "train", "one of (train, test, checkgrad)"); +DECLARE_int32(start_pass); +DECLARE_string(config); +DECLARE_string(init_model_path); +DECLARE_string(rdma_tcp); + +using namespace paddle; // NOLINT + +int main(int argc, char** argv) { + // write logs instantly (never buffer log messages) + FLAGS_logbuflevel = -1; + + initMain(argc, argv); + initPython(argc, argv); + + std::unique_ptr parameterServerPtr(nullptr); + if (FLAGS_start_pserver) { + parameterServerPtr.reset( + paddle::ParameterServerController::createFromGflags()); + parameterServerPtr->start(); + } + Trainer trainer; + auto config = TrainerConfigHelper::createFromFlags(); + CHECK(config != nullptr) << "no valid config"; + + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); + trainer.init(config, FLAGS_job == "test"); + + if (FLAGS_job == "train") { + trainer.train(); + } else if (FLAGS_job == "checkgrad") { + trainer.checkGradient(); + } else if (FLAGS_job == "test") { + trainer.test(); + } else if (FLAGS_job == "time") { + trainer.time(); + } else { + LOG(FATAL) << "Unknown job type: " << FLAGS_job; + } + + return 0; +} diff --git a/paddle/trainer/tests/.gitignore b/paddle/legacy/trainer/tests/.gitignore similarity index 100% rename from paddle/trainer/tests/.gitignore rename to paddle/legacy/trainer/tests/.gitignore diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..08548bea4c4a7fc4fa99d9305208abd4ee442572 --- /dev/null +++ b/paddle/legacy/trainer/tests/CMakeLists.txt @@ -0,0 +1,37 @@ +add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf + COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR} +) +add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf) + +set(PYTHON_PATH + ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/trainer/tests) +function(trainer_test TARGET) + add_unittest_without_exec(${TARGET} ${TARGET}.cpp) + add_test(NAME ${TARGET} + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) +endfunction() + +trainer_test(test_Compare) +trainer_test(test_PyDataProviderWrapper) +trainer_test(test_recurrent_machine_generation) +trainer_test(test_Trainer) + +############### test_TrainerOnePass ########################## +if(WITH_PYTHON) + # only run test_TrainerOnePass when PYTHON is enabled, because train one pass + # is using PyDataProvider2. 
+ add_unittest_without_exec(test_TrainerOnePass + test_TrainerOnePass.cpp) + add_test(NAME test_TrainerOnePass + COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port + ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) +endif() + +#################### test_config_parser ######################### +add_test(NAME test_config_parser + COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} + ${PADDLE_SOURCE_DIR}/paddle/legacy/trainer/tests/config_parser_test.py + WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) diff --git a/paddle/trainer/tests/__init__.py b/paddle/legacy/trainer/tests/__init__.py similarity index 100% rename from paddle/trainer/tests/__init__.py rename to paddle/legacy/trainer/tests/__init__.py diff --git a/paddle/legacy/trainer/tests/config_parser_test.py b/paddle/legacy/trainer/tests/config_parser_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0d3d82cbdafcf85d42247e810fe7caa685a86e4d --- /dev/null +++ b/paddle/legacy/trainer/tests/config_parser_test.py @@ -0,0 +1,23 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.config_parser import parse_config_and_serialize + +if __name__ == '__main__': + parse_config_and_serialize('legacy/trainer/tests/test_config.conf', '') + parse_config_and_serialize( + 'legacy/trainer/tests/sample_trainer_config.conf', + 'extension_module_name=paddle.trainer.config_parser_extension') + parse_config_and_serialize( + 'legacy/gserver/tests/pyDataProvider/trainer.conf', '') diff --git a/paddle/trainer/tests/fake_file_list.list b/paddle/legacy/trainer/tests/fake_file_list.list similarity index 100% rename from paddle/trainer/tests/fake_file_list.list rename to paddle/legacy/trainer/tests/fake_file_list.list diff --git a/paddle/trainer/tests/picojson.h b/paddle/legacy/trainer/tests/picojson.h similarity index 100% rename from paddle/trainer/tests/picojson.h rename to paddle/legacy/trainer/tests/picojson.h diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data similarity index 100% rename from paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data rename to paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data diff --git a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list new file mode 100644 index 0000000000000000000000000000000000000000..11c1b1b38b9edacc4953fdf526906d28bcc2d720 --- /dev/null +++ b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list @@ -0,0 +1 @@ +legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.beam 
b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam similarity index 100% rename from paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.beam rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest similarity index 100% rename from paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam similarity index 100% rename from paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/t1/transtable b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable similarity index 100% rename from paddle/trainer/tests/rnn_gen_test_model_dir/t1/transtable rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/t1/wordvec b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec similarity index 100% rename from paddle/trainer/tests/rnn_gen_test_model_dir/t1/wordvec rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec diff --git a/paddle/trainer/tests/sample_data.txt b/paddle/legacy/trainer/tests/sample_data.txt similarity index 100% rename from paddle/trainer/tests/sample_data.txt rename to paddle/legacy/trainer/tests/sample_data.txt diff --git a/paddle/legacy/trainer/tests/sample_filelist.txt b/paddle/legacy/trainer/tests/sample_filelist.txt new file mode 100644 index 0000000000000000000000000000000000000000..8573f9e1795edd37cfa0d21f0effc08a80d38e29 --- /dev/null +++ b/paddle/legacy/trainer/tests/sample_filelist.txt @@ -0,0 +1 @@ +legacy/trainer/tests/sample_data.txt diff --git a/paddle/legacy/trainer/tests/sample_trainer_config.conf b/paddle/legacy/trainer/tests/sample_trainer_config.conf new file mode 100644 index 0000000000000000000000000000000000000000..5800b3625661efac80b84b19c2a5cedc34718488 --- /dev/null +++ b/paddle/legacy/trainer/tests/sample_trainer_config.conf @@ -0,0 +1,87 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +TrainData(SimpleData( + files = "legacy/trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000)) + +TestData(SimpleData( + files = "legacy/trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000)) + +settings(batch_size = 100) + +data = data_layer(name='input', size=3) + +fc1 = fc_layer(input=data, size=5, + bias_attr=False, + act=SigmoidActivation()) + +fc2 = fc_layer(input=data, size=9, + bias_attr=False, + act=LinearActivation()) + +fc3 = fc_layer(input=data, size=3, + bias_attr=False, + act=TanhActivation()) + +fc4 = fc_layer(input=data, size=5, + bias_attr=False, + act=LinearActivation(), + param_attr=ParamAttr(name='sharew')) + +fc5 = fc_layer(input=data, size=5, + bias_attr=False, + act=BReluActivation()) + +fc6 = fc_layer(input=data, size=5, + bias_attr=False, + act=SoftReluActivation()) + +fc7 = fc_layer(input=data, size=3, + bias_attr=False, + act=SquareActivation()) + +fc8 = fc_layer(input=data, size=5, + bias_attr=True, + act=SquareActivation()) + +with mixed_layer(size=3, act=SoftmaxActivation()) as layer9: + layer9 += full_matrix_projection(input=fc1) + layer9 += full_matrix_projection(input=fc2) + layer9 += full_matrix_projection(input=fc3) + layer9 += trans_full_matrix_projection(input=fc4, + param_attr=ParamAttr(name='sharew')) + layer9 += full_matrix_projection(input=fc5) + layer9 += full_matrix_projection(input=fc6) + layer9 += full_matrix_projection(input=fc7) + layer9 += full_matrix_projection(input=fc8) + +if get_config_arg('with_cost', bool, True): + # This is for training the neural network. + # We need to have another data layer for label + # and a layer for calculating cost + lbl = data_layer(name='label', size=1) + outputs(classification_cost(input=layer9, label=lbl)) +else: + # This is for prediction where we don't have label + # and don't need to calculate cost + outputs(layer9) diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf new file mode 100644 index 0000000000000000000000000000000000000000..155c40b31f30c40e1ddeb65500f55162beb9a0ee --- /dev/null +++ b/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf @@ -0,0 +1,53 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from paddle.trainer_config_helpers import * + +TrainData(SimpleData( + files = "legacy/trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000, +)) + +settings(batch_size = 100) + +data = data_layer(name='input', size=3) + +fc1 = fc_layer(input=data, size=12, + bias_attr=False, + act=SigmoidActivation()) + +fc2 = fc_layer(input=data, size=19, + bias_attr=False, + act=LinearActivation()) + +fc3 = fc_layer(input=data, size=5, + bias_attr=False, + act=TanhActivation()) + +fc4 = fc_layer(input=data, size=5, + bias_attr=False, + act=LinearActivation()) + +# This is for training the neural network. +# We need to have another data layer for label +# and a layer for calculating cost +lbl = data_layer(name='label', size=1) + +outputs(hsigmoid(input=[fc1, fc2, fc3, fc4], + label=lbl, + num_classes=3)) diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf b/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf new file mode 100644 index 0000000000000000000000000000000000000000..49cdde7fa2c55e6536a49633f959af6a888ec463 --- /dev/null +++ b/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf @@ -0,0 +1,86 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +TrainData(SimpleData( + files = "legacy/trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000)) + +TestData(SimpleData( + files = "legacy/trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000)) + +settings(batch_size = 100) + +# Output layer, label layer, cost layer, preferably set to the same environment. +output_device = 0 + +# Input Layer does not need to specify the device number. +data = data_layer(name='input', size=3) + +# Calculate in the CPU. +fc1 = fc_layer(input=data, size=5, + bias_attr=True, + layer_attr=ExtraAttr(device=-1), + act=SigmoidActivation()) + +# Calculate in the GPU 0. +fc2 = fc_layer(input=fc1, size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=0), + act=SigmoidActivation()) + +# Calculate in the GPU 1. +fc3 = fc_layer(input=fc1, size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=1), + act=SigmoidActivation()) + +# Calculate in the GPU 0. +fc4 = fc_layer(input=[fc2,fc3], size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=0), + act=SigmoidActivation()) + +# Calculate in the GPU 1. +fc5 = fc_layer(input=[fc2,fc3], size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=1), + act=SigmoidActivation()) + +output = fc_layer(input=[fc4,fc5], size=10, + bias_attr=True, + layer_attr=ExtraAttr(device=output_device), + act=SoftmaxActivation()) + +if get_config_arg('with_cost', bool, True): + # This is for training the neural network. 
+ # We need to have another data layer for label + # and a layer for calculating cost + lbl = data_layer(name='label', size=1, + layer_attr=ExtraAttr(device=output_device)) + + outputs(classification_cost(input=output, + label=lbl, + layer_attr=ExtraAttr(device=output_device))) +else: + # This is for prediction where we don't have label + # and don't need to calculate cost + outputs(output) diff --git a/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf new file mode 100644 index 0000000000000000000000000000000000000000..51ef905a5a182464f69a1629e51bf8180eadb3fb --- /dev/null +++ b/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf @@ -0,0 +1,73 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.trainer_config_helpers import * + +settings(batch_size=15, learning_rate=0) + +num_words = 5 +beam_flag = get_config_arg('beam_search', bool, False) + +sent_id = data_layer(name="sent_id", size=1) + +# This layer has no actual use, but only to decide batch_size in generation. +# When generating, at least one Memory in RecurrentLayer MUST have a boot layer. +dummy_data = data_layer(name="dummy_data_input", size=2) + +def outer_step(dummy_data): + + gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True), + GeneratedInput(size=num_words, + embedding_name="wordvec", + embedding_size=num_words)] + + def inner_step(dummy_memory, predict_word): + + # simplified RNN for testing + with mixed_layer(size=num_words) as layer: + layer += full_matrix_projection(input=predict_word, + param_attr=ParamAttr(name="transtable")) + + with mixed_layer(size=num_words, act=ExpActivation()) as out: + out += trans_full_matrix_projection(input=layer, + param_attr=ParamAttr(name="wordvec")) + + return out + + beam_gen = beam_search(name="rnn_gen", + step=inner_step, + input=gen_inputs, + bos_id=0, + eos_id=num_words-1, + beam_size=2 if beam_flag else 1, + num_results_per_sample=1, + max_length=10) + return beam_gen + +beam_gen_concat = recurrent_group(name="rnn_gen_concat", + step=outer_step, + input=[SubsequenceInput(dummy_data)]) + +seqtext_printer_evaluator(input=beam_gen_concat, + id_input=sent_id, + dict_file="./legacy/trainer/tests/test_gen_dict.txt", + result_file="./legacy/trainer/tests/dump_text.test") +#outputs(beam_gen_concat) +# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory +# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs +# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. 
+Inputs("sent_id","dummy_data_input") +Outputs("__beam_search_predict__") diff --git a/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf new file mode 100644 index 0000000000000000000000000000000000000000..35c7f0fcd91f9b534a4f535387af720659d7f9b8 --- /dev/null +++ b/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf @@ -0,0 +1,66 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.trainer_config_helpers import * + +settings(batch_size=15, learning_rate=0) + +num_words = 5 +beam_flag = get_config_arg('beam_search', bool, False) + +sent_id = data_layer(name="sent_id", size=1) + +# This layer has no actual use, but only to decide batch_size in generation. +# When generating, at least one Memory in RecurrentLayer MUST have a boot layer. +dummy_data = data_layer(name="dummy_data_input", size=2) + +gen_inputs = [StaticInput(input=dummy_data, size=2), + GeneratedInput(size=num_words, + embedding_name="wordvec", + embedding_size=num_words)] + +def step(dummy_memory, predict_word): + + # simplified RNN for testing + with mixed_layer(size=num_words) as layer: + layer += full_matrix_projection(input=predict_word, + param_attr=ParamAttr(name="transtable")) + + with mixed_layer(size=num_words, act=ExpActivation()) as out: + out += trans_full_matrix_projection(input=layer, + param_attr=ParamAttr(name="wordvec")) + + return out + +beam_gen = beam_search(name="rnn_gen", + step=step, + input=gen_inputs, + bos_id=0, + eos_id=num_words-1, + beam_size=2 if beam_flag else 1, + num_results_per_sample=2 if beam_flag else 1, + max_length=10) + +seqtext_printer_evaluator(input=beam_gen, + id_input=sent_id, + dict_file="./legacy/trainer/tests/test_gen_dict.txt", + result_file="./legacy/trainer/tests/dump_text.test") +#outputs(beam_gen) +# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory +# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs +# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. +Inputs("sent_id","dummy_data_input") +Outputs("__beam_search_predict__") diff --git a/paddle/legacy/trainer/tests/simple_sparse_neural_network.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network.py new file mode 100644 index 0000000000000000000000000000000000000000..9419f4d903b1de205a6c549c7dcd9bb85ed7396b --- /dev/null +++ b/paddle/legacy/trainer/tests/simple_sparse_neural_network.py @@ -0,0 +1,37 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4) + +file_list = 'legacy/trainer/tests/fake_file_list.list' + +define_py_data_sources2( + train_list=file_list, + test_list=file_list, + module="simple_sparse_neural_network_dp", + obj="process") + +embedding = embedding_layer( + input=data_layer( + name="word_ids", size=8191), + size=128, + param_attr=ParamAttr(sparse_update=True)) +prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation()) + +outputs( + classification_cost( + input=prediction, label=data_layer( + name='label', size=10))) diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py similarity index 100% rename from paddle/trainer/tests/simple_sparse_neural_network_dp.py rename to paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/legacy/trainer/tests/testPyDataWrapper.py similarity index 100% rename from paddle/trainer/tests/testPyDataWrapper.py rename to paddle/legacy/trainer/tests/testPyDataWrapper.py diff --git a/paddle/legacy/trainer/tests/test_Compare.cpp b/paddle/legacy/trainer/tests/test_Compare.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e37e546be8513b1cc7438810a01641859a4bad18 --- /dev/null +++ b/paddle/legacy/trainer/tests/test_Compare.cpp @@ -0,0 +1,158 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/legacy/trainer/Trainer.h" + +#include +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +static const string& configFile = + "legacy/trainer/tests/sample_trainer_config.conf"; + +DECLARE_int32(gpu_id); +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_string(config_args); + +struct comData { + vector outArgs; + vector parameters; +}; + +void calcGradient(bool useGpu, comData& Data) { + FLAGS_use_gpu = useGpu; + FLAGS_config = configFile; + + *ThreadLocalRand::getSeed() = 0; + srand(0); + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlagConfig()); + + Data.parameters = trainer.getGradientMachine()->getParameters(); + DataBatch dataBatch; + int32_t batchSize = trainer.getConfig().opt_config().batch_size(); + trainer.getDataProvider()->setSkipShuffle(); + trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); + CHECK(dataBatch.getSize()) << "No data from data provider"; + vector& inArgs = dataBatch.getStreams(); + trainer.getGradientMachine()->start(); + for (int i = 0; i < 2; ++i) { + trainer.getGradientMachine()->forwardBackward( + inArgs, &Data.outArgs, PASS_TRAIN); + } + trainer.getGradientMachine()->finish(); +} + +void compareGradient(comData& comDataCpu, comData& comDataGpu); + +TEST(Trainer, create) { + int devCount = 0; + devCount = hl_get_device_count(); + FLAGS_config_args = "drop_rate=0"; + + comData comDataCpu; + calcGradient(false, comDataCpu); + LOG(INFO) << "Cpu is completed"; + + { + LOG(INFO) << "Test GPU"; + comData comData; + calcGradient(true, comData); + compareGradient(comDataCpu, comData); + LOG(INFO) << "Gpu is completed"; + } + + { + LOG(INFO) << "Test test multi gpu"; + comData comData; + FLAGS_trainer_count = devCount; + calcGradient(true, comData); + compareGradient(comDataCpu, comData); + LOG(INFO) << "Gpu4 is completed"; + } + + { + LOG(INFO) << "Test use_sparse_update=true"; + comData comData; + calcGradient(false, comData); + compareGradient(comDataCpu, comData); + LOG(INFO) << "Cpu4 is completed"; + } +} + +double checkBuffer(real* A, real* B, size_t len) { +#ifdef PADDLE_TYPE_DOUBLE + double precision = 1e-7; +#else + double precision = 2e-3; +#endif + int nNum = 0; + double maxE = 0; + for (size_t i = 0; i < len; ++i) { + double e = fabs(A[i] - B[i]); + maxE = std::max(e, maxE); + nNum += e > precision * fabs(A[i]); + } + EXPECT_EQ(0, nNum); + return maxE; +} + +void compareGradient(comData& comDataCpu, comData& comDataGpu) { + /*compare outArgs*/ + vector outArgs1 = comDataCpu.outArgs; + vector outArgs2 = comDataGpu.outArgs; + CpuMatrix out1(outArgs1[0].value->getHeight(), outArgs1[0].value->getWidth()); + CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth()); + out1.copyFrom(*outArgs1[0].value); + out2.copyFrom(*outArgs2[0].value); + checkBuffer(out1.getData(), out2.getData(), out1.getElementCnt()); + + /*compare parameters*/ + vector& parameters1 = comDataCpu.parameters; + vector& parameters2 = comDataGpu.parameters; + for (size_t i = 0; i < parameters1.size(); ++i) { + ParameterPtr parameter1, parameter2; + parameter1 = parameters1[i]; + parameter2 = parameters2[i]; + /*compare parameters value*/ + CpuVector para1(parameter1->getSize()); + CpuVector para2(parameter2->getSize()); + para1.copyFrom(*parameter1->getBuf(PARAMETER_VALUE)); + para2.copyFrom(*parameter2->getBuf(PARAMETER_VALUE)); + checkBuffer(para1.getData(), para2.getData(), para1.getSize()); + + /*compare parameters grad*/ + CpuVector 
cpuGrad1(*parameter1->getBuf(PARAMETER_GRADIENT)); + CpuVector cpuGrad2(*parameter2->getBuf(PARAMETER_GRADIENT)); + double e = + checkBuffer(cpuGrad1.getData(), cpuGrad2.getData(), cpuGrad1.getSize()); + LOG(INFO) << parameter1->getName() << " max error=" << e; + } +} + +int main(int argc, char** argv) { +#ifndef PADDLE_WITH_CUDA + exit(0); +#endif + paddle::initMain(argc, argv); + testing::InitGoogleTest(&argc, argv); + initPython(argc, argv); + int ret = RUN_ALL_TESTS(); + exit(ret); +} diff --git a/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..847adcfabada18e11203d3f18fb6dc355c670afb --- /dev/null +++ b/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_NO_PYTHON +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "picojson.h" + +void checkValue(std::vector& arguments, picojson::array& arr); +const std::string kDir = "./legacy/trainer/tests/pydata_provider_wrapper_dir/"; + +TEST(PyDataProviderWrapper, SequenceData) { + paddle::DataConfig conf; + conf.set_type("py"); + conf.set_load_data_module("testPyDataWrapper"); + conf.set_load_data_object("processSeqAndGenerateData"); + conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json"); + conf.clear_files(); + conf.set_files(kDir + "test_pydata_provider_wrapper.list"); + paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); + provider->setSkipShuffle(); + provider->reset(); + paddle::DataBatch batchFromPy; + provider->getNextBatch(100, &batchFromPy); + + picojson::value val; + std::fstream fin; + fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in); + EXPECT_TRUE(fin.is_open()); + if (fin.is_open()) { + std::string err = picojson::parse(val, fin); + EXPECT_TRUE(err.empty()); + EXPECT_TRUE(val.is()); + picojson::array& arr = val.get(); + std::vector& arguments = batchFromPy.getStreams(); + // CHECK Value + checkValue(arguments, arr); + // CHECK sequenceStartPositions + for (size_t i = 0; i < arr.size(); i++) { + int row_id = arr[i].get().size(); + EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]); + EXPECT_EQ((int)row_id, + arguments[i].sequenceStartPositions->getData(false)[1]); + } + fin.close(); + } +} + +TEST(PyDataProviderWrapper, HasSubSequenceData) { + paddle::DataConfig conf; + conf.set_type("py"); + conf.set_load_data_module("testPyDataWrapper"); + conf.set_load_data_object("processSubSeqAndGenerateData"); + conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json"); + conf.clear_files(); + conf.set_files(kDir + "test_pydata_provider_wrapper.list"); + paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); + provider->setSkipShuffle(); + provider->reset(); + paddle::DataBatch batchFromPy; + 
provider->getNextBatch(1, &batchFromPy); + + picojson::value val; + std::fstream fin; + fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in); + EXPECT_TRUE(fin.is_open()); + if (fin.is_open()) { + std::string err = picojson::parse(val, fin); + EXPECT_TRUE(err.empty()); + EXPECT_TRUE(val.is()); + picojson::array& arr = val.get(); + std::vector& arguments = batchFromPy.getStreams(); + // CHECK Value + checkValue(arguments, arr); + // CHECK sequenceStartPositions and subSequenceStartPositions + for (size_t i = 0; i < arr.size(); i++) { + int row_id = arr[i].get().size(); + EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]); + EXPECT_EQ((int)row_id, + arguments[i].sequenceStartPositions->getData(false)[1]); + EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]); + EXPECT_EQ((int)row_id, + arguments[i].subSequenceStartPositions->getData(false)[1]); + } + fin.close(); + } +} + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + paddle::initPython(argc, argv); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +void checkValue(std::vector& arguments, + picojson::array& arr) { + // CHECK SLOT 0, Sparse Value. + paddle::Argument& sparse_values_seq = arguments[0]; + paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value; + EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr); + paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat = + dynamic_cast(sparse_values_seq_rawmatrix.get()); + EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr); + EXPECT_EQ(arr.size(), arguments.size()); + EXPECT_TRUE(arr[0].is()); + size_t row_id = 0; + for (picojson::value& sparse_val_seq : arr[0].get()) { + std::unordered_map cols; + for (picojson::value& kv : sparse_val_seq.get()) { + EXPECT_TRUE(kv.get(0).is()); + EXPECT_TRUE(kv.get(1).is()); + int col = (int)(kv.get(0).get()); + real val = (real)(kv.get(1).get()); + cols.insert({col, val}); + } + size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id); + EXPECT_EQ(cols.size(), colNum); + int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id); + real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id); + for (size_t i = 0; i < colNum; ++i) { + int id = rowIds[i]; + auto it = cols.find(id); + EXPECT_NE(cols.end(), it); + real expect = it->second; + EXPECT_NEAR(expect, *rowBuf, 1e-5); + ++rowBuf; + } + ++row_id; + } + + // CHECK SLOT 1, Dense Value. + paddle::Argument& dense_arg = arguments[1]; + paddle::MatrixPtr& dense_mat = dense_arg.value; + EXPECT_NE(nullptr, dense_mat); + EXPECT_TRUE(arr[1].is()); + row_id = 0; + for (picojson::value& dense_seq : arr[1].get()) { + EXPECT_TRUE(dense_seq.is()); + picojson::array& row = dense_seq.get(); + EXPECT_EQ(row.size(), dense_mat->getWidth()); + real* rowBuf = dense_mat->getRowBuf(row_id++); + + for (picojson::value& val : row) { + EXPECT_TRUE(val.is()); + real expect = val.get(); + EXPECT_NEAR(expect, *rowBuf++, 1e-5); + } + } + + // CHECK SLOT 2, Sparse Non Value. 
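+  // (A "non value" sparse slot stores only column ids per row, with no values
+  //  attached, so the check below only compares the set of ids.)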
+ paddle::Argument& sparse_non_val_arg = arguments[2]; + paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value; + EXPECT_NE(nullptr, sparse_non_val_rawm); + paddle::CpuSparseMatrix* sparse_non_val_m = + dynamic_cast(sparse_non_val_rawm.get()); + EXPECT_NE(nullptr, sparse_non_val_m); + row_id = 0; + for (picojson::value& row : arr[2].get()) { + EXPECT_TRUE(row.is()); + std::unordered_set ids; + for (picojson::value& id : row.get()) { + EXPECT_TRUE(id.is()); + ids.insert((int)(id.get())); + } + size_t colNum = sparse_non_val_m->getColNum(row_id); + EXPECT_EQ(ids.size(), colNum); + for (size_t i = 0; i < colNum; ++i) { + int col = sparse_non_val_m->getRowCols(row_id)[i]; + EXPECT_TRUE(ids.find(col) != ids.end()); + } + ++row_id; + } + + // CHECK SLOT 3, Index. + paddle::Argument& index_arg = arguments[3]; + paddle::IVectorPtr indices = index_arg.ids; + EXPECT_NE(nullptr, indices); + int* idPtr = indices->getData(); + for (picojson::value& id : arr[3].get()) { + EXPECT_TRUE(id.is()); + int _id = (int)(id.get()); + EXPECT_EQ(_id, *idPtr++); + } + + // CHECK SLOT 4, String. + paddle::Argument& strArg = arguments[4]; + std::vector* strPtr = strArg.strs.get(); + EXPECT_NE(nullptr, strPtr); + size_t vecIndex = 0; + for (picojson::value& str : arr[4].get()) { + EXPECT_TRUE(str.is()); + std::string _str = str.get(); + EXPECT_EQ(_str, (*strPtr)[vecIndex++]); + } +} + +#else +int main() { return 0; } + +#endif diff --git a/paddle/legacy/trainer/tests/test_Trainer.cpp b/paddle/legacy/trainer/tests/test_Trainer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..14ad0a265281a8df20a70b0da2873ea451338ddb --- /dev/null +++ b/paddle/legacy/trainer/tests/test_Trainer.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/legacy/trainer/Trainer.h" + +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +static const string& configFile1 = + "legacy/trainer/tests/sample_trainer_config.conf"; +static const string& configFile2 = + "legacy/trainer/tests/sample_trainer_config_hsigmoid.conf"; +static const string& configFile4 = + "legacy/trainer/tests/sample_trainer_config_parallel.conf"; + +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_bool(allow_only_one_model_on_one_gpu); + +void checkGradientTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1) { + FLAGS_use_gpu = useGpu; + FLAGS_parallel_nn = parallel; + FLAGS_config = configFile; + FLAGS_trainer_count = trainerCount; + LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount + << " configFile=" << configFile; + + Trainer trainer; + trainer.init(TrainerConfigHelper::createFromFlagConfig()); + EXPECT_LE(fabs(trainer.checkGradient()), 0.02); +} + +TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); } + +#ifdef PADDLE_WITH_CUDA +TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); } + +TEST(checkGradient, multiGpu) { + int numGpu; + numGpu = hl_get_device_count(); + for (auto count : {2, 4}) { + if (count <= numGpu) { + checkGradientTest(configFile1, true, false, count); + } + } +} + +TEST(checkGradient, parallel) { + if (hl_get_device_count() >= 2) { + checkGradientTest(configFile4, true, true); + } +} + +TEST(checkGradient, multiParallel) { + FLAGS_allow_only_one_model_on_one_gpu = false; + checkGradientTest(configFile4, true, true, 2); + FLAGS_allow_only_one_model_on_one_gpu = true; +} + +#endif + +TEST(checkGradient, multi) { + int numGpu; + if (version::isWithGpu()) { + numGpu = hl_get_device_count(); + } else { + numGpu = 0; + } + for (bool useGpu : {false, true}) { + for (auto count : {2, 4}) { + if (useGpu && count > numGpu) continue; + checkGradientTest(configFile1, useGpu, false, count); + } + } +} + +TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } + +TEST(checkGradient, non_parallel) { + checkGradientTest(configFile4, false, false); +} + +int main(int argc, char** argv) { + initMain(argc, argv); + initPython(argc, argv); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp b/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e5c5ea723f3fd80316ee826fe9c6566e7049b7b --- /dev/null +++ b/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp @@ -0,0 +1,318 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/legacy/trainer/Trainer.h" +#include "paddle/legacy/trainer/TrainerInternal.h" + +#include +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +static const string& configFile1 = + "legacy/trainer/tests/sample_trainer_config.conf"; +static const string& configFile2 = + "legacy/trainer/tests/sample_trainer_config_parallel.conf"; + +static const string& configFileSimpleSparse = + "legacy/trainer/tests/simple_sparse_neural_network.py"; + +DECLARE_bool(use_gpu); +DECLARE_string(config); +DECLARE_int32(gpu_id); +DECLARE_int32(seed); +DECLARE_int32(num_passes); +DECLARE_int32(saving_period); + +class TrainerForTest : public paddle::Trainer { + public: + inline const std::shared_ptr& getParameterUpdaterForTest() { + return this->trainerInternal_.getParameterUpdater(); + } +}; + +int gNumDevices = 0; + +void trainerOnePassTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1, + double averageWindow = 0.0f, + bool doAverageInCpu = false) { + FLAGS_use_gpu = useGpu; + FLAGS_parallel_nn = parallel; + FLAGS_config = configFile; + FLAGS_trainer_count = trainerCount; + LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount + << " configFile=" << configFile; + srand(FLAGS_seed); + + if (useGpu) { + if (gNumDevices < trainerCount) { + return; + } + } + + Trainer trainer; + auto config = TrainerConfigHelper::createFromFlagConfig(); + if (averageWindow > 0) { + config->getOptConfig().set_average_window(averageWindow); + config->getOptConfig().set_do_average_in_cpu(doAverageInCpu); + } + trainer.init(config); + trainer.train(); +} + +// 1. test trainer (cpu, gpu). +TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); } + +#ifdef PADDLE_WITH_CUDA +TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); } + +TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); } + +TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); } + +TEST(trainerOnePass, parallel) { + if (hl_get_device_count() >= 2) { + trainerOnePassTest(configFile2, true, true); + } +} +#endif + +// 2. test average_window. +#ifdef PADDLE_WITH_CUDA +TEST(average_window, gpu) { + trainerOnePassTest(configFile1, true, false, 4, 0.01); +} + +TEST(average_window, gpu2) { + FLAGS_num_passes = 20; + trainerOnePassTest(configFile1, true, false, 2, 0.01); + FLAGS_num_passes = 1; +} + +TEST(average_window, gpu4) { + FLAGS_num_passes = 20; + trainerOnePassTest(configFile1, true, false, 4, 0.01); + FLAGS_num_passes = 1; +} + +TEST(average_window_cpu, gpu2) { + FLAGS_num_passes = 20; + trainerOnePassTest(configFile1, true, false, 2, 0.01, true); + FLAGS_num_passes = 1; +} + +TEST(average_window_cpu, gpu4) { + FLAGS_num_passes = 20; + trainerOnePassTest(configFile1, true, false, 4, 0.01, true); + FLAGS_num_passes = 1; +} +#endif + +// 3. test trainer + pserver. 
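[Editor's note – illustrative sketch, not part of this patch] The tests above and below flip process-wide gflags (FLAGS_num_passes, FLAGS_local, FLAGS_use_old_updater) and restore them by hand. A small RAII guard expresses the same save/restore pattern with less room for error; the FlagGuard name is hypothetical and the sketch uses only standard C++:

    // Saves a flag's current value and restores it when the guard leaves scope.
    template <typename T>
    class FlagGuard {
     public:
      FlagGuard(T* flag, T newValue) : flag_(flag), saved_(*flag) {
        *flag_ = newValue;  // apply the temporary test value
      }
      ~FlagGuard() { *flag_ = saved_; }  // restore the original value

     private:
      T* flag_;
      T saved_;
    };

    // Usage inside a test body, e.g.:
    //   FlagGuard<int32_t> passes(&FLAGS_num_passes, 20);
    //   trainerOnePassTest(configFile1, true, false, 2, 0.01);
    //   // FLAGS_num_passes reverts automatically when `passes` goes out of scope.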
+DECLARE_int32(num_gradient_servers); +DECLARE_int32(port); +DECLARE_bool(local); +DECLARE_bool(use_old_updater); + +double checkRemoteParameterUpdater(TrainerForTest& trainer) { + auto gradientMachine = trainer.getGradientMachine(); + auto parameterUpdater = trainer.getParameterUpdaterForTest(); + auto dataProvider = trainer.getDataProvider(); + auto& parameters = gradientMachine->getParameters(); + const TrainerConfig& config = trainer.getConfig(); + const string& alg = config.opt_config().algorithm(); + + vector parameterCheck; + for (auto& parameter : parameters) { + parameterCheck.emplace_back( + new Parameter(parameter->getConfig(), /* useGpu= */ false)); + parameterCheck.back() + ->getBuf(PARAMETER_VALUE) + ->copyFrom(*parameter->getBuf(PARAMETER_VALUE)); + parameterCheck.back() + ->getBuf(PARAMETER_GRADIENT) + ->copyFrom(*parameter->getBuf(PARAMETER_GRADIENT)); + } + + std::unique_ptr parameterUpdaterCheck; + if (alg == TrainAlgorithm::SGD) { + parameterUpdaterCheck.reset(new SgdLocalUpdater(config.opt_config())); + } else { + LOG(INFO) << "unsupported algorithm in remote parameter check: " << alg; + return -1.0; + } + parameterUpdaterCheck->init(parameterCheck); + + // gradientMachine->start(config, *dataProvider); + DataBatch dataBatch; + int32_t batchSize = config.opt_config().batch_size(); + dataProvider->getNextBatch(batchSize, &dataBatch); + CHECK(dataBatch.getSize()) << "No data from data provider"; + int64_t actualBatchSize = dataBatch.getSize(); + const vector& inArgs = dataBatch.getStreams(); + vector outArgs; + + UpdateCallback updateCallback = [parameterUpdater, + parameterCheck](Parameter* para) { + parameterCheck[para->getID()] + ->getBuf(PARAMETER_GRADIENT) + ->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); + parameterUpdater->update(para); + }; + + parameterUpdater->startPass(); + parameterUpdaterCheck->startPass(); + + for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2; + ++i) { + PassType passType = parameterUpdater->startBatch(actualBatchSize); + gradientMachine->forwardBackward( + inArgs, &outArgs, passType, updateCallback); + parameterUpdater->finishBatch(0); + + parameterUpdaterCheck->startBatch(actualBatchSize); + for (auto& para : parameterCheck) { + parameterUpdaterCheck->update(para.get()); + } + parameterUpdaterCheck->finishBatch(0); + } + + double sum = 0.0f; + for (size_t i = 0; i != parameters.size(); ++i) { + real *v1, *v2; + CpuVector trainerPara(parameters[i]->getSize()); + trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); + if (!FLAGS_use_gpu) { + v1 = parameters[i]->getBuf(PARAMETER_VALUE)->getData(); + } else { + v1 = trainerPara.getData(); + } + v2 = parameterCheck[i]->getBuf(PARAMETER_VALUE)->getData(); + + size_t size = parameters[i]->getSize(); + double diff = 0; + for (size_t j = 0; j < size; ++j) { + diff += fabs(v1[j] - v2[j]); + } + sum += diff; + LOG(INFO) << setiosflags(ios::left) << setfill(' ') << setw(20) + << parameters[i]->getName() << "diff=" << setw(15) << diff; + } + + parameterUpdater->finishPass(); + parameterUpdaterCheck->finishPass(); + gradientMachine->finish(); + return sum; +} + +void checkRemoteParameterUpdaterTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1, + bool useOldUpdater = false, + int num_batches_per_get_parameter = 1) { + FLAGS_use_gpu = useGpu; + FLAGS_parallel_nn = parallel; + FLAGS_config = configFile; + FLAGS_trainer_count = trainerCount; + FLAGS_use_old_updater = useOldUpdater; + LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" 
<< trainerCount + << " configFile=" << configFile; + srand(FLAGS_seed); + + if (useGpu) { + if (gNumDevices < trainerCount) { + return; + } + } + + FLAGS_local = 0; + std::shared_ptr pserver; + pserver.reset(new ParameterServer2(std::string(), FLAGS_port)); + pserver->init(); + pserver->start(); + + TrainerForTest trainer; + auto config = TrainerConfigHelper::createFromFlagConfig(); + config->getOptConfig().set_num_batches_per_get_parameter( + num_batches_per_get_parameter); + trainer.init(config); + EXPECT_EQ(checkRemoteParameterUpdater(trainer), 0); + + FLAGS_local = 1; +} + +TEST(checkRemoteUpdater, cpuTrainer) { + checkRemoteParameterUpdaterTest(configFile1, false, false); +} + +TEST(checkRemoteUpdater, cpuTrainerOldUpdater) { + checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true); +} + +#ifdef PADDLE_WITH_CUDA +TEST(checkRemoteUpdater, gpuTrainer) { + checkRemoteParameterUpdaterTest(configFile1, true, false); +} + +TEST(checkRemoteUpdater, gpu2Trainer) { + checkRemoteParameterUpdaterTest(configFile1, true, false, 2); +} + +TEST(checkRemoteUpdater, gpu4Trainer) { + checkRemoteParameterUpdaterTest(configFile1, true, false, 4); +} + +TEST(checkRemoteUpdater, gpuTrainerOldUpdater) { + checkRemoteParameterUpdaterTest(configFile1, true, false, 1, true); +} + +TEST(checkRemoteUpdater, gpu2TrainerOldUpdater) { + checkRemoteParameterUpdaterTest(configFile1, true, false, 2, true); +} + +TEST(checkRemoteUpdater, gpu4TrainerOldUpdater) { + checkRemoteParameterUpdaterTest(configFile1, true, false, 4, true); +} + +#endif + +TEST(checkRemoteUpdater, cpuDeltaTrainer) { + checkRemoteParameterUpdaterTest(configFile1, false, false, 1, false, 10); +} + +TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) { + checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10); +} + +TEST(SgdThreadUpdater, simpleSparseNN) { + trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + initPython(argc, argv); + gNumDevices = hl_get_device_count(); + + FLAGS_num_passes = 1; // train one pass + FLAGS_saving_period = 100000; // do not save parameteres + return RUN_ALL_TESTS(); +} diff --git a/paddle/legacy/trainer/tests/test_config.conf b/paddle/legacy/trainer/tests/test_config.conf new file mode 100644 index 0000000000000000000000000000000000000000..bce687ad83686d465987d72defd37db2b50953a1 --- /dev/null +++ b/paddle/legacy/trainer/tests/test_config.conf @@ -0,0 +1,77 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer_config_helpers import * + +TrainData(SimpleData( + files = "legacy/trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000, + async_load_data = False)) + +settings(batch_size = 100) + +data = data_layer(name='input', size=3) + +wt = data_layer(name='weight', size=1) + +fc1 = fc_layer(input=data, size=5, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=data, size=12, + bias_attr=True, + param_attr=ParamAttr(name='sharew'), + act=LinearActivation()) + +fc3 = fc_layer(input=data, size=3, + bias_attr=True, + act=TanhActivation()) + +fc4 = fc_layer(input=data, size=5, + bias_attr=True, + layer_attr=ExtraAttr(drop_rate=0.5), + act=SquareActivation()) + +pool = img_pool_layer(input=fc2, + pool_size=2, + pool_size_y=3, + num_channels=1, + padding=1, + padding_y=2, + stride=2, + stride_y=3, + pool_type=CudnnAvgPooling()) + +concat = concat_layer(input=[fc3, fc4]) + +with mixed_layer(size=3, act=SoftmaxActivation()) as output: + output += full_matrix_projection(input=fc1) + output += trans_full_matrix_projection(input=fc2, + param_attr=ParamAttr(name='sharew')) + output += full_matrix_projection(input=concat) + output += identity_projection(input=fc3) + +lbl = data_layer(name='label', size=1) + +cost = classification_cost(input=output, label=lbl, weight=wt, + layer_attr=ExtraAttr(device=-1)) + +nce = nce_layer(input=fc2, label=lbl, weight=wt, + num_classes=3, + neg_distribution=[0.1, 0.3, 0.6]) + +outputs(cost, nce) diff --git a/paddle/trainer/tests/test_gen_dict.txt b/paddle/legacy/trainer/tests/test_gen_dict.txt similarity index 100% rename from paddle/trainer/tests/test_gen_dict.txt rename to paddle/legacy/trainer/tests/test_gen_dict.txt diff --git a/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..47b4e82cd32917fcf32dbb5ffabb47330dab93d9 --- /dev/null +++ b/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include + +#include + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +static const string& CONFIG_FILE = + "legacy/trainer/tests/sample_trainer_rnn_gen.conf"; +static const string& NEST_CONFIG_FILE = + "legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf"; +static const string& OUTPUT_DIR = "legacy/trainer/tests/dump_text.test"; +static string modelDir = + "legacy/trainer/tests/rnn_gen_test_model_dir/t1"; // NOLINT +static string expectFile = // NOLINT + "legacy/trainer/tests/rnn_gen_test_model_dir/r1.test"; // NOLINT + +DECLARE_string(config_args); + +vector readRetFile(const string& fname) { + ifstream inFile(fname); + float ret; + vector nums; + while (inFile >> ret) { + nums.push_back(ret); + } + return nums; +} + +void checkOutput(const string& expRetFile) { + vector rets = readRetFile(OUTPUT_DIR); + vector expRets = readRetFile(expRetFile); + EXPECT_EQ(rets.size(), expRets.size()); + for (size_t i = 0; i < rets.size(); i++) { + EXPECT_FLOAT_EQ(rets[i], expRets[i]); + } +} + +void prepareInArgs(vector& inArgs, + const size_t batchSize, + bool useGpu, + bool hasSubseq) { + inArgs.clear(); + // sentence id + Argument sentId; + sentId.value = nullptr; + if (hasSubseq) { + // as there is only one sequence, there is only one label. + IVector::resizeOrCreate(sentId.ids, 1, useGpu); + sentId.ids->setElement(0, 0); + } else { + // as there is batchSize word, there is batchSize label. + IVector::resizeOrCreate(sentId.ids, batchSize, useGpu); + for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i); + } + inArgs.emplace_back(sentId); + + // a dummy layer to decide batch size + Argument dummyInput; + dummyInput.value = Matrix::create(batchSize, 2, false, useGpu); + dummyInput.value->randomizeUniform(); + if (hasSubseq) { + // generate one sequence with batchSize subsequence, + // and each subsequence has only one word. + dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false); + int* buf = dummyInput.sequenceStartPositions->getMutableData(false); + dummyInput.subSequenceStartPositions = + ICpuGpuVector::create(batchSize + 1, false); + int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false); + buf[0] = 0; + buf[1] = batchSize; + for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i; + } + inArgs.emplace_back(dummyInput); +} + +void testGeneration(const string& configFile, + bool useGpu, + bool hasSubseq, + const string& expRetFile) { + FLAGS_use_gpu = useGpu; + auto config = std::make_shared(configFile); + unique_ptr gradientMachine(GradientMachine::create(*config)); + gradientMachine->loadParameters(modelDir); + vector inArgs(2); + + const size_t batchSize = 15; + prepareInArgs(inArgs, batchSize, useGpu, hasSubseq); + vector outArgs; + unique_ptr testEvaluator(gradientMachine->makeEvaluator()); + testEvaluator->start(); + gradientMachine->forward(inArgs, &outArgs, PASS_TEST); + gradientMachine->eval(testEvaluator.get()); + testEvaluator->finish(); + checkOutput(expRetFile); +} + +#ifndef PADDLE_TYPE_DOUBLE + +TEST(RecurrentGradientMachine, test_generation) { +#ifndef PADDLE_WITH_CUDA + const auto useGpuConfs = {false}; +#else + const auto useGpuConfs = {true, false}; +#endif + auto testGen = [&](const string& configFile, + bool hasSubseq, + const string& expRetFile, + bool beam_search) { + FLAGS_config_args = beam_search ? 
"beam_search=1" : "beam_search=0"; + for (auto useGpu : useGpuConfs) { + LOG(INFO) << configFile << " useGpu=" << useGpu + << " beam_search=" << beam_search; + testGeneration(configFile, useGpu, hasSubseq, expRetFile); + } + }; + testGen(CONFIG_FILE, false, expectFile + ".nobeam", false); // no beam search + testGen(CONFIG_FILE, false, expectFile + ".beam", true); // beam search + // In hierarchical RNN, beam search and one way search are only in inner-RNN, + // outer-RNN will concat the generated inner-results (first for beam search) + // from inner-RNN. Thus, they have the same outer-results. + testGen(NEST_CONFIG_FILE, + true, + expectFile + ".nest", + false); // no beam search + testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true); // beam search +} +#endif + +int main(int argc, char** argv) { + initMain(argc, argv); + initPython(argc, argv); + CHECK(argc == 1 || argc == 3); + if (argc == 3) { + modelDir = argv[1]; + expectFile = argv[2]; + } + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/utils/.gitignore b/paddle/legacy/utils/.gitignore similarity index 100% rename from paddle/utils/.gitignore rename to paddle/legacy/utils/.gitignore diff --git a/paddle/utils/Any.h b/paddle/legacy/utils/Any.h similarity index 100% rename from paddle/utils/Any.h rename to paddle/legacy/utils/Any.h diff --git a/paddle/utils/CMakeLists.txt b/paddle/legacy/utils/CMakeLists.txt similarity index 100% rename from paddle/utils/CMakeLists.txt rename to paddle/legacy/utils/CMakeLists.txt diff --git a/paddle/utils/ClassRegistrar.h b/paddle/legacy/utils/ClassRegistrar.h similarity index 100% rename from paddle/utils/ClassRegistrar.h rename to paddle/legacy/utils/ClassRegistrar.h diff --git a/paddle/utils/Common.h b/paddle/legacy/utils/Common.h similarity index 100% rename from paddle/utils/Common.h rename to paddle/legacy/utils/Common.h diff --git a/paddle/legacy/utils/CpuId.cpp b/paddle/legacy/utils/CpuId.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66e7c6606f070aef4fd954b8f4ada994b2f4fb96 --- /dev/null +++ b/paddle/legacy/utils/CpuId.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/CpuId.h" +#include "paddle/legacy/utils/Util.h" + +#ifdef _WIN32 + +#include + +/// for MSVC +#define CPUID(info, x) __cpuidex(info, x, 0) + +#else + +#if !defined(__arm__) && !defined(__aarch64__) +#include +/// for GCC/Clang +#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3]) +#endif + +#endif + +namespace paddle { + +SIMDFlags::SIMDFlags() { +#if defined(__arm__) || defined(__aarch64__) + simd_flags_ = SIMD_NEON; +#else + unsigned int cpuInfo[4]; + // CPUID: https://en.wikipedia.org/wiki/CPUID + // clang-format off + CPUID(cpuInfo, 0x00000001); + simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE : SIMD_NONE; + simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2 : SIMD_NONE; + simd_flags_ |= cpuInfo[2] & (1 << 0) ? 
SIMD_SSE3 : SIMD_NONE; + simd_flags_ |= cpuInfo[2] & (1 << 9) ? SIMD_SSSE3 : SIMD_NONE; + simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE; + simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE; + simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3 : SIMD_NONE; + simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX : SIMD_NONE; + + CPUID(cpuInfo, 0x00000007); + simd_flags_ |= cpuInfo[1] & (1 << 5) ? SIMD_AVX2 : SIMD_NONE; + simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE; + + CPUID(cpuInfo, 0x80000001); + simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4 : SIMD_NONE; + // clang-fotmat on +#endif +} + +SIMDFlags const* SIMDFlags::instance() { + static SIMDFlags instance; + return &instance; +} + +} // namespace paddle diff --git a/paddle/utils/CpuId.h b/paddle/legacy/utils/CpuId.h similarity index 100% rename from paddle/utils/CpuId.h rename to paddle/legacy/utils/CpuId.h diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/legacy/utils/CustomStackTrace.cpp similarity index 100% rename from paddle/utils/CustomStackTrace.cpp rename to paddle/legacy/utils/CustomStackTrace.cpp diff --git a/paddle/utils/CustomStackTrace.h b/paddle/legacy/utils/CustomStackTrace.h similarity index 100% rename from paddle/utils/CustomStackTrace.h rename to paddle/legacy/utils/CustomStackTrace.h diff --git a/paddle/utils/DynamicLoader.cpp b/paddle/legacy/utils/DynamicLoader.cpp similarity index 100% rename from paddle/utils/DynamicLoader.cpp rename to paddle/legacy/utils/DynamicLoader.cpp diff --git a/paddle/utils/DynamicLoader.h b/paddle/legacy/utils/DynamicLoader.h similarity index 100% rename from paddle/utils/DynamicLoader.h rename to paddle/legacy/utils/DynamicLoader.h diff --git a/paddle/utils/Error.h b/paddle/legacy/utils/Error.h similarity index 100% rename from paddle/utils/Error.h rename to paddle/legacy/utils/Error.h diff --git a/paddle/utils/Excepts.h b/paddle/legacy/utils/Excepts.h similarity index 100% rename from paddle/utils/Excepts.h rename to paddle/legacy/utils/Excepts.h diff --git a/paddle/utils/Flags.cpp b/paddle/legacy/utils/Flags.cpp similarity index 100% rename from paddle/utils/Flags.cpp rename to paddle/legacy/utils/Flags.cpp diff --git a/paddle/utils/Flags.h b/paddle/legacy/utils/Flags.h similarity index 100% rename from paddle/utils/Flags.h rename to paddle/legacy/utils/Flags.h diff --git a/paddle/utils/GlobalConstants.cpp b/paddle/legacy/utils/GlobalConstants.cpp similarity index 100% rename from paddle/utils/GlobalConstants.cpp rename to paddle/legacy/utils/GlobalConstants.cpp diff --git a/paddle/utils/GlobalConstants.h b/paddle/legacy/utils/GlobalConstants.h similarity index 100% rename from paddle/utils/GlobalConstants.h rename to paddle/legacy/utils/GlobalConstants.h diff --git a/paddle/utils/Locks.h b/paddle/legacy/utils/Locks.h similarity index 100% rename from paddle/utils/Locks.h rename to paddle/legacy/utils/Locks.h diff --git a/paddle/utils/Logging.cpp b/paddle/legacy/utils/Logging.cpp similarity index 100% rename from paddle/utils/Logging.cpp rename to paddle/legacy/utils/Logging.cpp diff --git a/paddle/utils/Logging.h b/paddle/legacy/utils/Logging.h similarity index 100% rename from paddle/utils/Logging.h rename to paddle/legacy/utils/Logging.h diff --git a/paddle/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp similarity index 100% rename from paddle/utils/PythonUtil.cpp rename to paddle/legacy/utils/PythonUtil.cpp diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h new file mode 100644 index 
0000000000000000000000000000000000000000..b0c8612c378fbe12cdf24e51a5b6546740b2d4c8 --- /dev/null +++ b/paddle/legacy/utils/PythonUtil.h @@ -0,0 +1,353 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +// clang-format off +#include "paddle/legacy/utils/Util.h" + +#ifndef PADDLE_NO_PYTHON +// must include the following two blocks, otherwise, +// gcc compiler may produce warning +#ifdef __APPLE__ +#define _POSIX_SOURCE +#define _POSIX_C_SOURCE 200809L +#define _XOPEN_SOURCE 700 +#endif + +#ifdef _POSIX_C_SOURCE +#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif +#ifdef _XOPEN_SOURCE +#define __TEMP_XOPEN_SOURCE _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif +#include +#include +#endif + +#include +#include +#include +// clang-format on + +namespace paddle { + +std::string callPythonFunc(const std::string& moduleName, + const std::string& funcName, + const std::vector& args); + +#ifndef PADDLE_NO_PYTHON + +/** + * Global lock guard of python C-api invokes. + * NOTE: the lock of this guard is reentrant or recursive. + */ +class PyGuard { + public: + PyGuard(); + PyGuard(const PyGuard& other) = delete; + PyGuard& operator=(const PyGuard& other) = delete; + + private: + std::lock_guard guard_; +}; + +struct PyObjectDeleter { + void operator()(PyObject* obj) { + if (obj) { + Py_DECREF(obj); + } + } +}; + +typedef std::unique_ptr PyObjectPtr; + +PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName, + const std::string& funcName, + const std::vector& args); + +PyObjectPtr createPythonClass(const std::string& moduleName, + const std::string& className, + const std::vector& args, + const std::map& kwargs); + +#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() + +namespace py { +PyObjectPtr import(const std::string& moduleName); + +/** + * Cast a PyLong or PyInt to int type T. + * @tparam T return type. + * @param [in] obj PyLong or PyInt object. + * @param [out] ok status for casting. False if error occured. nullptr if user + * don't care is ok or not. + * @return The value of python object, or 0 if not ok. + */ +template +T castInt(PyObject* obj, bool* ok = nullptr) { + if (PyLong_Check(obj)) { + if (ok) *ok = true; + return (T)PyLong_AsUnsignedLong(obj); + } else if (PyInt_Check(obj)) { + if (ok) *ok = true; + return (T)PyInt_AsLong(obj); + } else { + if (ok) *ok = false; + return (T)0; + } +} + +/** + * Invoke repr of python object. + * + * Just like toString method in java. + */ +char* repr(PyObject* obj); + +/** + * Invoke repr of python object. + */ +inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); } + +/** + * Get Python Error Stack String. + */ +std::string getPyCallStack(); + +/** + * Object Helper for PyObjectPtr. + * + * Implements getAttr method for object. 
+ */ +class ObjectHelper { + public: + explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {} + + /** + * get attribute + */ + inline PyObject* getAttr(const std::string& field) const { + auto obj = PyObject_GetAttrString(obj_.get(), field.c_str()); + CHECK_PY(obj) << "Cannot get attribute on python object " << obj_.get(); + return obj; + } + + /** + * Get Int attribute + * @param [in] field attribute name. + * @param [out] ok true if this attribute is int. + * @tparam T int type. + * @return int value. + */ + template + T getIntAttr(const std::string& field, bool* ok = nullptr) const { + PyObjectPtr tmp(getAttr(field)); + return castInt(tmp.get(), ok); + } + + /** + * Get int attribute. Log(Fatal) when not ok + * @param field attribute name. + * @return int value. + */ + template + T getIntAttrWithError(const std::string& field) const { + bool ok; + T tmp = getIntAttr(field, &ok); + CHECK(ok) << "Cannot get integer attribute on object " << obj_.get(); + return tmp; + } + + /** + * Get bool attribute. + * @param field + * @param [out] isBoolType return true if attribute is bool type. If the + * attribute is not bool type, then an implicit + * conversion will happens, and will return the + * conversion result. + * + * Such as, if the attribute is 1, then the return + * value of function will be true, but the isBoolType + * will return false. + * @return + */ + bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const { + PyObjectPtr tmp(getAttr(field)); + if (isBoolType) { + *isBoolType = PyBool_Check(tmp.get()); + } + return PyObject_IsTrue(tmp.get()); + } + + private: + const PyObjectPtr& obj_; +}; + +/** + * Python Sequence Helper + * + * The python sequence means list or tuple. + */ +class SequenceHelper { + public: + explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) { + CHECK(PySequence_Check(seq_)); + } + + explicit SequenceHelper(PyObject* seq) : seq_(seq) { + CHECK(PySequence_Check(seq_)); + } + + inline size_t size() const { return (size_t)PySequence_Size(seq_); } + + inline PyObject* operator[](size_t i) const { + return PySequence_Fast_GET_ITEM(seq_, i); + } + + inline double getDouble(size_t i) const { + auto* ptr = (*this)[i]; + return PyFloat_AsDouble(ptr); + } + + /** + * Set a sequence item o[i] = obj; + * @param i index + * @param obj setted item. + * @param steal if steal = true, sequence will move object in iteself, + * just like std::move. Otherwise, it will increase reference + * count. Default is false. 
+ */ + inline void set(size_t i, const PyObjectPtr& obj, bool steal = false) { + this->set(i, obj.get(), steal); + } + + /** + * Set a sequence item o[i] = obj; + */ + inline void set(size_t i, PyObject* obj, bool steal = false) { + if (!steal) { + Py_XINCREF(obj); + } + if (PyTuple_Check(seq_)) { + CHECK_NE(PyTuple_SetItem(seq_, i, obj), -1) << getPyCallStack(); + } else { + CHECK_NE(PySequence_SetItem(seq_, i, obj), -1) << getPyCallStack(); + } + } + + private: + PyObject* seq_; +}; + +class DictHelper { + public: + explicit DictHelper(PyObject* d) : dict_(d) {} + + explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {} + + void set(const std::string& key, PyObject* item) { + PyDict_SetItemString(dict_, key.c_str(), item); + } + + void setBool(const std::string& key, bool b) { + this->set(key, PyBool_FromLong(b)); + } + + void setStringList(const std::string& key, + const std::vector& items) { + auto* list = PyList_New(items.size()); + for (size_t i = 0; i < items.size(); ++i) { + PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); + } + this->set(key, list); + } + + private: + inline void checkDict() { CHECK(PyDict_Check(this->dict_)); } + + PyObject* dict_; +}; + +inline static bool isCallable(const PyObjectPtr& obj) { + return PyCallable_Check(obj.get()); +} + +/** + * Wrap a callable object. + */ +class CallableHelper { + public: + explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) { + CHECK(py::isCallable(obj_)); + } + + ~CallableHelper() {} + + /** + * reset args, and create new tuple. + * @param sz args size. + */ + void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); } + + /** + * Get args sequence. User can set/get by SequenceHelper. + */ + SequenceHelper getArgs() { return SequenceHelper(args); } + + /** + * Call python method, return an object. + */ + PyObject* operator()() { + PyGuard guard; + return PyObject_Call(obj_.get(), args.get(), kwargs.get()); + } + + private: + const PyObjectPtr& obj_; + PyObjectPtr args; + PyObjectPtr kwargs; +}; + +inline static PyObject* iterNext(const PyObjectPtr& context, bool* atEnd) { + PyGuard g; + PyObject* data = PyIter_Next(context.get()); + if (data == nullptr) { + if (PyErr_ExceptionMatches(PyExc_StopIteration)) { + PyErr_Clear(); + *atEnd = true; + return nullptr; + } else if (PyErr_Occurred()) { + CHECK_PY(data) << "Calling iterator next error"; + return nullptr; + } else { + *atEnd = false; + return data; // just return none in iterator. + } + } else { + *atEnd = false; + return data; + } +} +} // namespace py + +#endif + +/** + * Initialize python. 
+ */ +void initPython(int argc, char** argv); + +} // namespace paddle diff --git a/paddle/utils/Queue.h b/paddle/legacy/utils/Queue.h similarity index 100% rename from paddle/utils/Queue.h rename to paddle/legacy/utils/Queue.h diff --git a/paddle/utils/Stat.cpp b/paddle/legacy/utils/Stat.cpp similarity index 100% rename from paddle/utils/Stat.cpp rename to paddle/legacy/utils/Stat.cpp diff --git a/paddle/utils/Stat.h b/paddle/legacy/utils/Stat.h similarity index 100% rename from paddle/utils/Stat.h rename to paddle/legacy/utils/Stat.h diff --git a/paddle/utils/StringUtil.cpp b/paddle/legacy/utils/StringUtil.cpp similarity index 100% rename from paddle/utils/StringUtil.cpp rename to paddle/legacy/utils/StringUtil.cpp diff --git a/paddle/utils/StringUtil.h b/paddle/legacy/utils/StringUtil.h similarity index 100% rename from paddle/utils/StringUtil.h rename to paddle/legacy/utils/StringUtil.h diff --git a/paddle/utils/Thread.h b/paddle/legacy/utils/Thread.h similarity index 100% rename from paddle/utils/Thread.h rename to paddle/legacy/utils/Thread.h diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/legacy/utils/ThreadLocal.cpp similarity index 100% rename from paddle/utils/ThreadLocal.cpp rename to paddle/legacy/utils/ThreadLocal.cpp diff --git a/paddle/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h similarity index 100% rename from paddle/utils/ThreadLocal.h rename to paddle/legacy/utils/ThreadLocal.h diff --git a/paddle/utils/Util.cpp b/paddle/legacy/utils/Util.cpp similarity index 100% rename from paddle/utils/Util.cpp rename to paddle/legacy/utils/Util.cpp diff --git a/paddle/utils/Util.h b/paddle/legacy/utils/Util.h similarity index 100% rename from paddle/utils/Util.h rename to paddle/legacy/utils/Util.h diff --git a/paddle/utils/Version.cpp b/paddle/legacy/utils/Version.cpp similarity index 100% rename from paddle/utils/Version.cpp rename to paddle/legacy/utils/Version.cpp diff --git a/paddle/utils/Version.h b/paddle/legacy/utils/Version.h similarity index 100% rename from paddle/utils/Version.h rename to paddle/legacy/utils/Version.h diff --git a/paddle/legacy/utils/arch/linux/Locks.cpp b/paddle/legacy/utils/arch/linux/Locks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..32d351e3328afd79007aea7a51e59cbfc941eeeb --- /dev/null +++ b/paddle/legacy/utils/arch/linux/Locks.cpp @@ -0,0 +1,149 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/utils/Locks.h" +#include +#include +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { +class SemaphorePrivate { + public: + sem_t sem; +}; + +Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { + sem_init(&m->sem, 0, initValue); +} + +Semaphore::~Semaphore() { + sem_destroy(&m->sem); + delete m; +} + +bool Semaphore::timeWait(struct timespec* ts) { + return (0 == sem_timedwait(&m->sem, ts)); +} + +void Semaphore::wait() { sem_wait(&m->sem); } + +void Semaphore::post() { sem_post(&m->sem); } + +/// SpinLockPrivate + +#ifdef PADDLE_USE_PTHREAD_SPINLOCK + +class SpinLockPrivate { + public: + inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } + inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + + inline void lock() { pthread_spin_lock(&lock_); } + inline void unlock() { pthread_spin_unlock(&lock_); } + + pthread_spinlock_t lock_; + char padding_[64 - sizeof(pthread_spinlock_t)]; +}; + +#else +// clang-format off +#include +#include +// clang-format on + +class SpinLockPrivate { + public: + inline void lock() { + while (lock_.test_and_set(std::memory_order_acquire)) { + } + } + inline void unlock() { lock_.clear(std::memory_order_release); } + + std::atomic_flag lock_ = ATOMIC_FLAG_INIT; + char padding_[64 - sizeof(lock_)]; // Padding to cache line size +}; + +#endif + +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } +void SpinLock::lock() { m->lock(); } +void SpinLock::unlock() { m->unlock(); } + +/// ThreadBarrierPrivate + +#ifdef PADDLE_USE_PTHREAD_BARRIER + +class ThreadBarrierPrivate { + public: + pthread_barrier_t barrier_; + + inline explicit ThreadBarrierPrivate(int count) { + pthread_barrier_init(&barrier_, nullptr, count); + } + + inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); } + + inline void wait() { pthread_barrier_wait(&barrier_); } +}; + +#else + +class ThreadBarrierPrivate { + public: + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; + + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +#endif + +/// ThreadBarrier + +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } + +} // namespace paddle diff --git a/paddle/legacy/utils/arch/osx/Excepts.cpp b/paddle/legacy/utils/arch/osx/Excepts.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2b7d6dca8454417fd78f6da7f906785d24a6219b --- /dev/null +++ b/paddle/legacy/utils/arch/osx/Excepts.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/Excepts.h" + +#if defined(__APPLE__) || defined(__OSX__) +#if defined(__arm__) || defined(__arm64__) +// TODO(liuyiqun): implement the arm version +int fegetexcept(void) { return -1; } +int feenableexcept(unsigned int excepts) { return -1; } +int fedisableexcept(unsigned int excepts) { return -1; } +#else +int fegetexcept(void) { + static fenv_t fenv; + return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT); +} + +int feenableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; + + if (fegetenv(&fenv)) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // unmask + fenv.__control &= ~new_excepts; + fenv.__mxcsr &= ~(new_excepts << 7); + + return (fesetenv(&fenv) ? -1 : old_excepts); +} + +int fedisableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; + + if (fegetenv(&fenv)) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // mask + fenv.__control |= new_excepts; + fenv.__mxcsr |= new_excepts << 7; + + return (fesetenv(&fenv) ? -1 : old_excepts); +} +#endif +#endif diff --git a/paddle/legacy/utils/arch/osx/Locks.cpp b/paddle/legacy/utils/arch/osx/Locks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b68c48f0c31aa928a634e0369295ec084b9ccd8e --- /dev/null +++ b/paddle/legacy/utils/arch/osx/Locks.cpp @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/legacy/utils/Locks.h" +#include +#include +#include +#include "paddle/legacy/utils/Logging.h" + +namespace paddle { + +class SemaphorePrivate { + public: + ~SemaphorePrivate() { dispatch_release(sem); } + + dispatch_semaphore_t sem; +}; + +Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { + m->sem = dispatch_semaphore_create(initValue); +} + +Semaphore::~Semaphore() { delete m; } + +bool Semaphore::timeWait(timespec *ts) { + dispatch_time_t tm = dispatch_walltime(ts, 0); + return (0 == dispatch_semaphore_wait(m->sem, tm)); +} + +void Semaphore::wait() { + dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER); +} + +void Semaphore::post() { dispatch_semaphore_signal(m->sem); } + +class SpinLockPrivate { + public: + std::atomic_flag lock_ = ATOMIC_FLAG_INIT; + char padding_[64 - sizeof(lock_)]; // Padding to cache line size +}; + +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } + +void SpinLock::lock() { + while (m->lock_.test_and_set(std::memory_order_acquire)) { + } +} + +void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); } + +class ThreadBarrierPrivate { + public: + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; + + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); + return true; + } else { + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); + return false; + } + } +}; + +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } + +} // namespace paddle diff --git a/paddle/utils/enable_virtualenv.py b/paddle/legacy/utils/enable_virtualenv.py similarity index 100% rename from paddle/utils/enable_virtualenv.py rename to paddle/legacy/utils/enable_virtualenv.py diff --git a/paddle/legacy/utils/tests/CMakeLists.txt b/paddle/legacy/utils/tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4af01db5c84cb497b756027cbb6ad06c081a8899 --- /dev/null +++ b/paddle/legacy/utils/tests/CMakeLists.txt @@ -0,0 +1,18 @@ +add_simple_unittest(test_Thread) +add_simple_unittest(test_StringUtils) +add_simple_unittest(test_CustomStackTrace) +add_simple_unittest(test_ThreadBarrier) +add_simple_unittest(test_SpinLock) +add_simple_unittest(test_SIMDFlags) +add_simple_unittest(test_Error) + +add_executable( + test_CustomStackTracePrint + test_CustomStackTracePrint.cpp +) +link_paddle_exe(test_CustomStackTracePrint) +if(NOT APPLE) + add_test(NAME test_CustomStackTracePrint + COMMAND ${PADDLE_SOURCE_DIR}/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() diff --git a/paddle/legacy/utils/tests/test_CustomStackTrace.cpp b/paddle/legacy/utils/tests/test_CustomStackTrace.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a418e3ae2277fc5dc6856d131dafa9daf0bad47 --- /dev/null +++ b/paddle/legacy/utils/tests/test_CustomStackTrace.cpp @@ -0,0 +1,92 @@ +/* Copyright (c) 
2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include // NOLINT + +#include "paddle/legacy/utils/CustomStackTrace.h" +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/StringUtil.h" +#include "paddle/legacy/utils/Util.h" + +DEFINE_int32(test_thread_num, 10, "testing thread number"); + +void testNormalImpl( + const std::function&, + size_t, + size_t, + paddle::ThreadBarrier&, + paddle::ThreadBarrier&)>& callback) { + paddle::CustomStackTrace tracer; + paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1); + paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1); + constexpr size_t countDown = 10; + constexpr size_t layerSize = 1000; + std::vector> threads; + threads.reserve(FLAGS_test_thread_num); + + for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) { + threads.emplace_back( + new std::thread([&tracer, &startBarrier, &doneBarrier, &callback] { + callback(tracer, countDown, layerSize, startBarrier, doneBarrier); + })); + } + size_t cntDown = countDown; + while (cntDown-- > 0) { + startBarrier.wait(); + sleep(1); + doneBarrier.wait(); + ASSERT_TRUE(tracer.empty()); + } + + for (auto& thread : threads) { + thread->join(); + } +} + +TEST(CustomStackTrace, normalTrain) { + testNormalImpl([](paddle::CustomStackTrace& tracer, + size_t countDown, + size_t layerSize, + paddle::ThreadBarrier& start, + paddle::ThreadBarrier& finish) { + while (countDown-- > 0) { + start.wait(); + for (size_t i = 0; i < layerSize; ++i) { + tracer.push("layer_" + paddle::str::to_string(i)); + } + for (size_t i = 0; i < layerSize; ++i) { + tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); + } + finish.wait(); + } + }); +} + +TEST(CustomStackTrace, normalTest) { + testNormalImpl([](paddle::CustomStackTrace& tracer, + size_t countDown, + size_t layerSize, + paddle::ThreadBarrier& start, + paddle::ThreadBarrier& finish) { + while (countDown-- > 0) { + start.wait(); + for (size_t i = 0; i < layerSize; ++i) { + tracer.push("layer_" + paddle::str::to_string(i)); + } + tracer.clear(); // in forward test, tracer will clear after forward. + finish.wait(); + } + }); +} diff --git a/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp b/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp new file mode 100644 index 0000000000000000000000000000000000000000..78886a3ed9f237a39079bbf604a376f98bd86b59 --- /dev/null +++ b/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/CustomStackTrace.h" +#include "paddle/legacy/utils/StringUtil.h" +#include "paddle/legacy/utils/Util.h" + +int main(int argc, char** argv) { + paddle::initMain(argc, argv); + + for (size_t i = 0; i < 1000; ++i) { + paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i)); + if (i == 998) { + throw "Unhandle exception"; + } + } + + return 0; +} diff --git a/paddle/utils/tests/test_CustomStackTracePrint.sh b/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh similarity index 100% rename from paddle/utils/tests/test_CustomStackTracePrint.sh rename to paddle/legacy/utils/tests/test_CustomStackTracePrint.sh diff --git a/paddle/legacy/utils/tests/test_Error.cpp b/paddle/legacy/utils/tests/test_Error.cpp new file mode 100644 index 0000000000000000000000000000000000000000..250c4d58a64a0d284a15418e40264f1857d30050 --- /dev/null +++ b/paddle/legacy/utils/tests/test_Error.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/Error.h" + +#include + +TEST(Error, testAll) { + paddle::Error error; + ASSERT_TRUE(error.isOK()); + error = paddle::Error("I'm the error"); + ASSERT_FALSE(error.isOK()); + ASSERT_STREQ("I'm the error", error.msg()); + + error = paddle::Error("error2"); + ASSERT_FALSE(error.isOK()); + ASSERT_STREQ("error2", error.msg()); + + int i = 3; + auto error3 = paddle::Error("error%d", i); + ASSERT_FALSE(error3.isOK()); + ASSERT_STREQ("error3", error3.msg()); +} diff --git a/paddle/legacy/utils/tests/test_SIMDFlags.cpp b/paddle/legacy/utils/tests/test_SIMDFlags.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6362210acdaf26a26a2548ddaf8ed455b9c76618 --- /dev/null +++ b/paddle/legacy/utils/tests/test_SIMDFlags.cpp @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/legacy/utils/CpuId.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Util.h" + +using namespace paddle; // NOLINT + +TEST(SIMDFlags, gccTest) { +#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \ + !defined(__arm__) && !defined(__aarch64__) + // clang-format off + CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); + CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); + CHECK(!__builtin_cpu_supports("sse3") != HAS_SSE3); + CHECK(!__builtin_cpu_supports("ssse3") != HAS_SSSE3); + CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41); + CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42); + CHECK(!__builtin_cpu_supports("avx") != HAS_AVX); + CHECK(!__builtin_cpu_supports("avx2") != HAS_AVX2); +// clang-format on +#endif +} + +TEST(SIMDFlags, normalPrint) { + LOG(INFO) << "Has SSE: " << std::boolalpha << HAS_SSE; + LOG(INFO) << "Has SSE2: " << std::boolalpha << HAS_SSE2; + LOG(INFO) << "Has SSE3: " << std::boolalpha << HAS_SSE3; + LOG(INFO) << "Has SSSE3: " << std::boolalpha << HAS_SSSE3; + LOG(INFO) << "Has SSE4: " << std::boolalpha << HAS_SSE41 || HAS_SSE42; + LOG(INFO) << "Has FMA3: " << std::boolalpha << HAS_FMA3; + LOG(INFO) << "Has FMA4: " << std::boolalpha << HAS_FMA4; + LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; + LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; + LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; + LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON; +} diff --git a/paddle/legacy/utils/tests/test_SpinLock.cpp b/paddle/legacy/utils/tests/test_SpinLock.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4cd7836d6af251b48925de95c2811361313d7b41 --- /dev/null +++ b/paddle/legacy/utils/tests/test_SpinLock.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include + +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Util.h" + +DEFINE_int32(test_thread_num, 100, "testing thread number"); + +void testNormalImpl( + size_t thread_num, + const std::function& callback) { + paddle::SpinLock mutex; + std::vector threads; + threads.reserve(thread_num); + + size_t count = 0; + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&thread_num, &count, &mutex, &callback] { + callback(thread_num, count, mutex); + }); + } + for (auto& thread : threads) { + thread.join(); + } + // Check whether all threads reach this point or not + CHECK_EQ(count, thread_num); +} + +TEST(ThreadSpinLock, normalTest) { + for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { + testNormalImpl( + thread_num, + [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) { + std::lock_guard lock(mutex); + ++count; + }); + } +} diff --git a/paddle/legacy/utils/tests/test_StringUtils.cpp b/paddle/legacy/utils/tests/test_StringUtils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..61d2815f097af7125bfefdc4909509564300d6aa --- /dev/null +++ b/paddle/legacy/utils/tests/test_StringUtils.cpp @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/legacy/utils/StringUtil.h" + +#include + +TEST(StringUtil, to) { + ASSERT_NEAR(paddle::str::to("12.45"), 12.45, 1e-5); + ASSERT_DEATH_IF_SUPPORTED(paddle::str::to("12.45x23"), ".*"); + ASSERT_DEATH_IF_SUPPORTED(paddle::str::to(""), ".*"); +} diff --git a/paddle/legacy/utils/tests/test_Thread.cpp b/paddle/legacy/utils/tests/test_Thread.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e07da3236862c5f585671d9bb8e3fbbd1c5b5fc --- /dev/null +++ b/paddle/legacy/utils/tests/test_Thread.cpp @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
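// Illustrative sketch, not from the Paddle sources: the test_SpinLock.cpp above only
// needs a BasicLockable spin lock. A minimal stand-in built on std::atomic_flag (the
// real paddle::SpinLock implementation is not shown in this hunk):
#include <atomic>

class SpinLockSketch {
 public:
  void lock() {
    while (flag_.test_and_set(std::memory_order_acquire)) {
      // busy-wait until the current holder calls unlock()
    }
  }
  void unlock() { flag_.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
};
// Works with the same guard pattern as in the test: std::lock_guard<SpinLockSketch> lock(l);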
*/ + +#include +#include +#include + +using paddle::AsyncThreadPool; // NOLINT + +TEST(AsyncThreadPool, addJob) { + AsyncThreadPool pool(8); + auto a = pool.addJob([] { return 1; }); + auto b = pool.addJob([] { return true; }); + auto c = pool.addJob([] { return false; }); + + ASSERT_EQ(a.get(), 1); + ASSERT_TRUE(b.get()); + ASSERT_FALSE(c.get()); +} + +TEST(AsyncThreadPool, addBatchJob) { + AsyncThreadPool pool(8); + std::atomic counter{0}; + + std::vector jobs; + + for (int i = 0; i < 10000; i++) { + jobs.emplace_back([&] { counter++; }); + } + + pool.addBatchJobs(jobs); + + ASSERT_EQ(counter, 10000); +} + +TEST(AsyncThreadPool, multiThreadAddBatchJob) { + AsyncThreadPool levelOnePool(200); + AsyncThreadPool levelTwoPool(200); + + std::shared_ptr mut = std::make_shared(); + int counter = 0; + const int numMonitors = 300; + const int numSlaves = 300; + std::vector moniterJobs(numMonitors, [&] { + std::vector slaveJobs(numSlaves, [mut, &counter] { + std::lock_guard lk(*mut); + counter++; + }); + levelTwoPool.addBatchJobs(slaveJobs); + }); + levelOnePool.addBatchJobs(moniterJobs); + ASSERT_EQ(counter, numMonitors * numSlaves); +} + +TEST(AsyncThreadPool, addBatchJobWithResults) { + AsyncThreadPool pool(100); + + std::vector> jobs; + const int numJobs = 100; + for (int i = 0; i < numJobs; i++) { + jobs.emplace_back([i] { return i; }); + } + + std::vector res; + pool.addBatchJobs(jobs, res); + + for (int i = 0; i < numJobs; i++) { + ASSERT_EQ(res[i], i); + } +} diff --git a/paddle/legacy/utils/tests/test_ThreadBarrier.cpp b/paddle/legacy/utils/tests/test_ThreadBarrier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c8851ae2112320c89aa3e7ed6e850d00cc14006 --- /dev/null +++ b/paddle/legacy/utils/tests/test_ThreadBarrier.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
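// Illustrative sketch, not from the Paddle sources: addJob() in test_Thread.cpp above
// hands back a future for the callable's result. The same contract expressed with the
// standard library only (std::async here is a stand-in, not how AsyncThreadPool works):
#include <cassert>
#include <future>

int main() {
  auto a = std::async(std::launch::async, [] { return 1; });
  auto b = std::async(std::launch::async, [] { return true; });
  assert(a.get() == 1);
  assert(b.get());
  return 0;
}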
*/ + +#include +#include + +#include +#include + +#include "paddle/legacy/utils/Locks.h" +#include "paddle/legacy/utils/Logging.h" +#include "paddle/legacy/utils/Util.h" + +DEFINE_int32(test_thread_num, 100, "testing thread number"); + +void testNormalImpl( + size_t thread_num, + const std::function&, + paddle::ThreadBarrier&)>& callback) { + std::mutex mutex; + std::set tids; + paddle::ThreadBarrier barrier(thread_num); + + std::vector threads; + threads.reserve(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] { + callback(thread_num, mutex, tids, barrier); + }); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +TEST(ThreadBarrier, normalTest) { + for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { + testNormalImpl(thread_num, + [](size_t thread_num, + std::mutex& mutex, + std::set& tids, + paddle::ThreadBarrier& barrier) { + { + std::lock_guard guard(mutex); + tids.insert(std::this_thread::get_id()); + } + barrier.wait(); + // Check whether all threads reach this point or not + CHECK_EQ(tids.size(), thread_num); + }); + } +} diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h deleted file mode 100644 index c43a83891eb6b7eae278169736149ad1d89e950e..0000000000000000000000000000000000000000 --- a/paddle/math/Allocator.h +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "hl_gpu.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -/** - * @brief Allocator base class. - * - * This is the base class of all Allocator class. - */ -class Allocator { - public: - virtual ~Allocator() {} - virtual void* alloc(size_t size) = 0; - virtual void free(void* ptr) = 0; - virtual std::string getName() = 0; -}; - -/** - * @brief CPU allocator implementation. - */ -class CpuAllocator : public Allocator { - public: - ~CpuAllocator() {} - - /** - * @brief Aligned allocation on CPU. - * @param size Size to be allocated. - * @return Pointer to the allocated memory - */ - virtual void* alloc(size_t size) { - void* ptr; -#ifdef PADDLE_WITH_MKLDNN - // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp - // memory alignment - CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); -#else - CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); -#endif - CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; - return ptr; - } - - /** - * @brief Free the memory space. - * @param ptr Pointer to be free. - */ - virtual void free(void* ptr) { - if (ptr) { - ::free(ptr); - } - } - - virtual std::string getName() { return "cpu_alloc"; } -}; - -/** - * @brief GPU allocator implementation. - */ -class GpuAllocator : public Allocator { - public: - ~GpuAllocator() {} - - /** - * @brief Allocate GPU memory. - * @param size Size to be allocated. 
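// Illustrative sketch, not from the Paddle sources: the barrier in test_ThreadBarrier.cpp
// above releases every thread only once thread_num of them have called wait(). A minimal
// single-use barrier built on a mutex and a condition variable (paddle::ThreadBarrier's
// real implementation is not shown in this hunk):
#include <condition_variable>
#include <cstddef>
#include <mutex>

class BarrierSketch {
 public:
  explicit BarrierSketch(std::size_t count) : remaining_(count) {}

  void wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    if (--remaining_ == 0) {
      cv_.notify_all();  // last arrival wakes everyone
    } else {
      cv_.wait(lock, [this] { return remaining_ == 0; });
    }
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  std::size_t remaining_;
};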
- * @return Pointer to the allocated memory - */ - virtual void* alloc(size_t size) { - void* ptr = hl_malloc_device(size); - CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes"; - return ptr; - } - - /** - * @brief Free the GPU memory. - * @param ptr Pointer to be free. - */ - virtual void free(void* ptr) { - if (ptr) { - hl_free_mem_device(ptr); - } - } - - virtual std::string getName() { return "gpu_alloc"; } -}; - -/** - * @brief CPU pinned memory allocator implementation. - */ -class CudaHostAllocator : public Allocator { - public: - ~CudaHostAllocator() {} - - /** - * @brief Allocate pinned memory. - * @param size Size to be allocated. - * @return Pointer to the allocated memory - */ - virtual void* alloc(size_t size) { - void* ptr = hl_malloc_host(size); - CHECK(ptr) << "Fail to allocate pinned memory " << size << " bytes"; - return ptr; - } - - /** - * @brief Free the pinned memory. - * @param ptr Pointer to be free. - */ - virtual void free(void* ptr) { - if (ptr) { - hl_free_mem_host(ptr); - } - } - - virtual std::string getName() { return "cuda_host_alloc"; } -}; - -} // namespace paddle diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu deleted file mode 100644 index 7b57419e5a510ba50aff0b47681d1294607e31f9..0000000000000000000000000000000000000000 --- a/paddle/math/BaseMatrix.cu +++ /dev/null @@ -1,1953 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
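// Illustrative sketch, not from the Paddle sources: the removed CpuAllocator above pairs
// posix_memalign() with free(). The same idea as a free-standing helper (POSIX only;
// the 64-byte default alignment is an arbitrary illustrative choice):
#include <cstdlib>

inline void* alignedAlloc(std::size_t size, std::size_t alignment = 64) {
  void* ptr = nullptr;
  // posix_memalign returns 0 on success; alignment must be a power of two
  // and a multiple of sizeof(void*).
  return posix_memalign(&ptr, alignment, size) == 0 ? ptr : nullptr;
}

inline void alignedFree(void* ptr) { std::free(ptr); }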
*/ - -#include -#include -#include -#include "BaseMatrix.h" -#include "MathFunctions.h" -#include "NEONFunctions.h" -#include "SIMDFunctions.h" -#include "hl_matrix_apply.cuh" -#include "hl_matrix_base.cuh" -#include "hl_matrix_ops.cuh" - -namespace paddle { - -const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported."; - -template -template -int BaseMatrixT::applyUnary(Op op) { - MatrixOffset offset(0, 0); - applyUnary(op, height_, width_, offset); - return 0; -} - -template -template -int BaseMatrixT::applyUnary(Op op, - int numRows, - int numCols, - MatrixOffset& offset) { - CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; - int dimM = numRows; - int dimN = numCols; - int lda = stride_; - - T* A = data_; - CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - - CHECK_LE(dimM + offset.aRow_, this->height_); - CHECK_LE(dimN + offset.aCol_, this->width_); - if (true == useGpu_) { - hl_gpu_apply_unary_op(op, A, dimM, dimN, lda); - } else { - hl_cpu_apply_unary_op(op, A, dimM, dimN, lda); - } - return 0; -} - -template -template -int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) { - CHECK(height_ == b.height_ && width_ == b.width_) - << "Matrix dimensions are not equal"; - - MatrixOffset offset(0, 0, 0, 0); - applyBinary(op, b, height_, width_, offset); - return 0; -} - -template -template -int BaseMatrixT::applyBinary( - Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) { - applyBinary(op, b, numRows, numCols, offset, false_type(), false_type()); - return 0; -} - -template -template -int BaseMatrixT::applyBinary(Op op, - BaseMatrixT& b, - int numRows, - int numCols, - MatrixOffset& offset, - bAsRowVector, - bAsColVector) { - CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch"; - - int dimM = numRows; - int dimN = numCols; - int lda = stride_; - int ldb = b.stride_; - - T* A = data_; - T* B = b.data_; - CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CHECK_LE(dimM + offset.aRow_, this->height_); - CHECK_LE(dimN + offset.aCol_, this->width_); - if (!bAsRowVector::value && !bAsColVector::value) { - CHECK_LE(dimM + offset.bRow_, b.height_); - CHECK_LE(dimN + offset.bCol_, b.width_); - } else if (bAsRowVector::value && !bAsColVector::value) { - CHECK_LE(dimN + offset.bCol_, b.width_); - } else if (!bAsRowVector::value && bAsColVector::value) { - CHECK_LE(dimM + offset.bRow_, b.height_); - } else { - } - if (true == useGpu_) { - hl_gpu_apply_binary_op( - op, A, B, dimM, dimN, lda, ldb); - } else { - hl_cpu_apply_binary_op( - op, A, B, dimM, dimN, lda, ldb); - } - - return 0; -} - -template -template -int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) { - CHECK_EQ(height_, b.height_); - CHECK_EQ(width_, b.width_); - CHECK_EQ(height_, c.height_); - CHECK_EQ(width_, c.width_); - - MatrixOffset offset(0, 0, 0, 0, 0, 0); - applyTernary(op, b, c, height_, width_, offset); - - return 0; -} - -template -template -int BaseMatrixT::applyTernary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset) { - applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type()); - - return 0; -} - -template -template -int BaseMatrixT::applyTernary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset, - cAsRowVector, - 
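// Illustrative sketch, not from the Paddle sources: applyUnary()/applyBinary() above walk
// a numRows x numCols window of a row-major matrix whose physical row length is the
// leading dimension (stride/lda), starting at the (row, col) given by MatrixOffset.
// The CPU path is essentially this loop (names here are illustrative):
#include <cstddef>

template <typename T, typename Op>
void applyUnarySketch(T* data, std::size_t lda, std::size_t startRow,
                      std::size_t startCol, std::size_t numRows,
                      std::size_t numCols, Op op) {
  T* a = data + startRow * lda + startCol;  // what CAL_MATRIX_START_ADDRESS computes
  for (std::size_t i = 0; i < numRows; ++i, a += lda) {
    for (std::size_t j = 0; j < numCols; ++j) {
      op(a[j]);  // element-wise, e.g. [](float& x) { x = -x; } for the Neg op
    }
  }
}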
cAsColVector) { - CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK_EQ(useGpu_, b.useGpu_); - CHECK_EQ(useGpu_, c.useGpu_); - - int dimM = numRows; - int dimN = numCols; - int lda = stride_; - int ldb = b.stride_; - int ldc = c.stride_; - - T* A = data_; - T* B = b.data_; - T* C = c.data_; - CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CAL_MATRIX_START_ADDRESS( - C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); - - CHECK_LE(dimM + offset.aRow_, this->height_); - CHECK_LE(dimN + offset.aCol_, this->width_); - CHECK_LE(dimM + offset.bRow_, b.height_); - CHECK_LE(dimN + offset.bCol_, b.width_); - if (!cAsRowVector::value && !cAsColVector::value) { - CHECK_LE(dimM + offset.cRow_, c.height_); - CHECK_LE(dimN + offset.cCol_, c.width_); - } else if (cAsRowVector::value && !cAsColVector::value) { - CHECK_LE(dimN + offset.cCol_, c.width_); - } else if (!cAsRowVector::value && cAsColVector::value) { - CHECK_LE(dimM + offset.cRow_, c.height_); - } else { - } - - if (true == useGpu_) { - hl_gpu_apply_ternary_op( - op, A, B, C, dimM, dimN, lda, ldb, ldc); - } else { - hl_cpu_apply_ternary_op( - op, A, B, C, dimM, dimN, lda, ldb, ldc); - } - - return 0; -} - -template -template -int BaseMatrixT::applyQuaternary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d) { - CHECK_EQ(height_, b.height_); - CHECK_EQ(width_, b.width_); - CHECK_EQ(height_, c.height_); - CHECK_EQ(width_, c.width_); - CHECK_EQ(height_, d.height_); - CHECK_EQ(width_, d.width_); - - MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0); - applyQuaternary(op, b, c, d, height_, width_, offset); - - return 0; -} - -template -template -int BaseMatrixT::applyQuaternary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d, - int numRows, - int numCols, - MatrixOffset& offset) { - CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR; - CHECK_EQ(useGpu_, b.useGpu_); - CHECK_EQ(useGpu_, c.useGpu_); - CHECK_EQ(useGpu_, d.useGpu_); - - int dimM = numRows; - int dimN = numCols; - int lda = stride_; - int ldb = b.stride_; - int ldc = c.stride_; - int ldd = d.stride_; - - T* A = data_; - T* B = b.data_; - T* C = c.data_; - T* D = d.data_; - CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CAL_MATRIX_START_ADDRESS( - C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); - CAL_MATRIX_START_ADDRESS( - D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_); - - CHECK_LE(dimM + offset.aRow_, this->height_); - CHECK_LE(dimN + offset.aCol_, this->width_); - CHECK_LE(dimM + offset.bRow_, b.height_); - CHECK_LE(dimN + offset.bCol_, b.width_); - CHECK_LE(dimM + offset.cRow_, c.height_); - CHECK_LE(dimN + offset.cCol_, c.width_); - CHECK_LE(dimM + offset.dRow_, d.height_); - CHECK_LE(dimN + offset.dCol_, d.width_); - if (true == useGpu_) { - hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); - } else { - hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd); - } - - return 0; -} - -template -template -int BaseMatrixT::aggregate(Agg agg, - Op op, - Saver sv, - BaseMatrixT& b, - int numRows, - int numCols, - 
MatrixOffset& offset, - aAsRowVector, - aAsColVector) { - CHECK_EQ(useGpu_, b.useGpu_); - - int ld = stride_; - int ldb = b.stride_; - - T* dst = data_; - T* B = b.data_; - CAL_MATRIX_START_ADDRESS( - dst, height_, width_, ld, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - - if (aAsRowVector::value && !aAsColVector::value) { - if (useGpu_) { - hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb); - } else { - hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb); - } - } else if (!aAsRowVector::value && aAsColVector::value) { - if (useGpu_) { - hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb); - } else { - hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb); - } - } else { - LOG(FATAL) << "not supported"; - } - - return 0; -} - -template -template -int BaseMatrixT::aggregate(Agg agg, - Op op, - Saver sv, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset, - aAsRowVector, - aAsColVector) { - CHECK_EQ(useGpu_, b.useGpu_); - CHECK_EQ(useGpu_, c.useGpu_); - - int ld = stride_; - int ldb = b.stride_; - int ldc = c.stride_; - - T* dst = data_; - T* B = b.data_; - T* C = c.data_; - CAL_MATRIX_START_ADDRESS( - dst, height_, width_, ld, offset.aCol_, offset.aRow_); - CAL_MATRIX_START_ADDRESS( - B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_); - CAL_MATRIX_START_ADDRESS( - C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_); - - if (aAsRowVector::value && !aAsColVector::value) { - if (useGpu_) { - hl_gpu_matrix_column_op( - agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); - } else { - hl_cpu_matrix_column_op( - agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc); - } - } else if (!aAsRowVector::value && aAsColVector::value) { - if (useGpu_) { - hl_gpu_matrix_row_op( - agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); - } else { - hl_cpu_matrix_row_op( - agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc); - } - } else { - LOG(FATAL) << "not supported"; - } - - return 0; -} - -/** - * @brief unary operator. - * - */ - -DEFINE_MATRIX_UNARY_OP(Neg, a = -a); -template -void BaseMatrixT::neg() { - applyUnary(unary::Neg()); -} - -DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); -template <> -void BaseMatrixT::exp2() { - applyUnary(unary::Exp()); -} - -DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); -template <> -void BaseMatrixT::log2() { - if (useGpu_) { - applyUnary(unary::Log()); - } else { - vLog(height_ * width_, data_, data_); - } -} - -DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); -template <> -void BaseMatrixT::sqrt2() { - applyUnary(unary::Sqrt()); -} - -DEFINE_MATRIX_UNARY_OP(Square, a = a * a); -template -void BaseMatrixT::square2() { - applyUnary(unary::Square()); -} - -DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); -template -void BaseMatrixT::reciprocal2() { - applyUnary(unary::Reciprocal()); -} - -DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? 
a : -a); -template -void BaseMatrixT::abs2() { - applyUnary(unary::Abs()); -} - -DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); -template -void BaseMatrixT::sign2() { - applyUnary(unary::Sign()); -} - -DEFINE_MATRIX_UNARY_OP(Zero, a = 0); -template -void BaseMatrixT::zero() { - applyUnary(unary::Zero()); -} - -template -void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) { - int numRows = height_; - int numCols = numColumns; - MatrixOffset offset(columnOffset, 0); - applyUnary(unary::Zero(), numRows, numCols, offset); -} - -DEFINE_MATRIX_UNARY_OP(One, a = 1); -template -void BaseMatrixT::one() { - applyUnary(unary::One()); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); -template <> -void BaseMatrixT::pow2(real p) { - if (useGpu_) { - applyUnary(unary::Pow(p)); - } else { - vPow(height_ * width_, data_, p, data_); - } -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p); -template -void BaseMatrixT::subScalar(T p) { - applyUnary(unary::SubScalar(p)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p); -template -void BaseMatrixT::mulScalar(T p) { - applyUnary(unary::MulScalar(p)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p); -template -void BaseMatrixT::divScalar(T p) { - applyUnary(unary::DivScalar(p)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p); -template -void BaseMatrixT::assign(T p) { - applyUnary(unary::Assign(p)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p); -template -void BaseMatrixT::add(T p) { - applyUnary(unary::Add(p)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2); -template -void BaseMatrixT::add(T p1, T p2) { - applyUnary(unary::Add2(p1, p2)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, - TWO_PARAMETER, - a = a < p1 ? p1 : (a > p2 ? p2 : a)); -template -void BaseMatrixT::clip(T p1, T p2) { - applyUnary(unary::Clip(p1, p2)); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, - TWO_PARAMETER, - a = b < p1 ? 0 : (b > p2 ? 0 : 1)); -template -void BaseMatrixT::clipDerivative(BaseMatrixT& b, T p1, T p2) { - applyBinary(binary::ClipDerivative(p1, p2), b); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, - ONE_PARAMETER, - a = a > p ? 1.0f : 0.0f); -template -void BaseMatrixT::biggerThanScalar(T p) { - applyUnary(unary::BiggerThanScalar(p)); -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p); -template -void BaseMatrixT::downClip(T p) { - applyUnary(unary::DownClip(p)); -} - -/** - * @brief binary operator. 
- * - */ - -DEFINE_MATRIX_BINARY_OP(Add, a += b); -template -void BaseMatrixT::add(BaseMatrixT& b) { - applyBinary(binary::Add(), b); -} - -template <> -void BaseMatrixT::add(BaseMatrixT& b) { - if (useGpu_) { - applyBinary(binary::Add(), b); - } else { // cpu branch - CHECK_EQ(height_, b.height_); - CHECK_EQ(width_, b.width_); - vAdd(height_ * width_, data_, b.data_, data_); - } -} - -template -void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) { - if (columnOffset + b.width_ <= width_) { - int numRows = height_; - int numCols = b.width_; - MatrixOffset offset(columnOffset, 0, 0, 0); - applyBinary(binary::Add(), b, numRows, numCols, offset); - } else if (columnOffset + width_ <= b.width_) { - int numRows = height_; - int numCols = width_; - MatrixOffset offset(0, 0, columnOffset, 0); - applyBinary(binary::Add(), b, numRows, numCols, offset); - } else { - LOG(FATAL) << "Wrong argument " - << " a.width=" << width_ << " b.width=" << b.width_ - << " columnOffset=" << columnOffset; - } -} - -template -void BaseMatrixT::addP2P(BaseMatrixT& b) { - T* A = data_; - T* B = b.data_; - int dimM = height_; - int dimN = width_; - - hl_gpu_apply_binary_op, 0, 0>( - binary::Add(), A, B, dimM, dimN, dimN, dimN); -} - -template -void BaseMatrixT::addColVector(BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::Add(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /* bAsColVector */); -} - -template -void BaseMatrixT::addRowVector(BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::Add(), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p); -template -void BaseMatrixT::add(BaseMatrixT& b, T p) { - applyBinary(binary::Add1(p), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); -template <> -void BaseMatrixT::pow2(BaseMatrixT& b, real p) { - if (useGpu_) { - applyBinary(binary::Pow(p), b); - } else { - vPow(height_ * width_, b.data_, p, data_); - } -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); -template -void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { - applyBinary(binary::Add2(p1, p2), b); -} - -template -void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::Add1(scale), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); -} - -DEFINE_MATRIX_BINARY_OP(Sub, a -= b); -template -void BaseMatrixT::sub(BaseMatrixT& b) { - applyBinary(binary::Sub(), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); -template -void BaseMatrixT::sub(BaseMatrixT& b, T p) { - applyBinary(binary::Sub1(p), b); -} - -DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); -template -void BaseMatrixT::relu(BaseMatrixT& b) { - applyBinary(binary::Relu(), b); -} - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -template <> -void BaseMatrixT::relu(BaseMatrixT& b) { - neon::relu(data_, b.data_, height_ * width_); -} -#endif - -DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); -template -void BaseMatrixT::reluDerivative(BaseMatrixT& b) { - applyBinary(binary::ReluDerivative(), b); -} - -DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; - b = log(1.0 + exp((a > THRESHOLD) - ? 
THRESHOLD - : ((a < -THRESHOLD) ? (-THRESHOLD) - : a)))); -template <> -void BaseMatrixT::softrelu(BaseMatrixT& b) { - applyBinary(binary::Softrelu(), b); -} - -DEFINE_MATRIX_BINARY_OP( - SoftreluDerivative, const T THRESHOLD = 40.0; - a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) - ? THRESHOLD - : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); -template <> -void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { - applyBinary(binary::SoftreluDerivative(), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; - b = b < p2 ? b : p2); -template -void BaseMatrixT::brelu(BaseMatrixT& b) { - int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. - applyBinary(binary::Brelu(p1, p2), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, - TWO_PARAMETER, - a *= (b > p1 && b < p2) ? 1.0 : 0.0); -template -void BaseMatrixT::breluDerivative(BaseMatrixT& b) { - int p1 = 0, p2 = 24; - applyBinary(binary::BreluDerivative(p1, p2), b); -} - -DEFINE_MATRIX_BINARY_OP(Square, b = a * a); -template -void BaseMatrixT::square2(BaseMatrixT& b) { - applyBinary(binary::Square(), b); -} - -DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); -template -void BaseMatrixT::squareDerivative(BaseMatrixT& b) { - applyBinary(binary::SquareDerivative(), b); -} - -DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template <> -void BaseMatrixT::tanh(BaseMatrixT& b) { - applyBinary(binary::Tanh(), b); -} - -DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); -template -void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { - applyBinary(binary::TanhDerivative(), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP( - ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); -template <> -void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { - applyBinary(binary::ScaledTanh(p1, p2), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, - TWO_PARAMETER, - a *= p2 * (p1 - b * b)); -template -void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { - applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); -} - -DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); -template -void BaseMatrixT::reciprocal2(BaseMatrixT& b) { - applyBinary(binary::Reciprocal(), b); -} - -DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); -template -void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { - applyBinary(binary::ReciprocalDerivative(), b); -} - -DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); -template -void BaseMatrixT::abs2(BaseMatrixT& b) { - applyBinary(binary::Abs(), b); -} - -DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); -template -void BaseMatrixT::absDerivative(BaseMatrixT& b) { - applyBinary(binary::AbsDerivative(), b); -} - -DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0; - const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) - ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? 
THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))); -template <> -void BaseMatrixT::sigmoid(BaseMatrixT& b) { - if (useGpu_) { - applyBinary(binary::Sigmoid(), b); - } else { // cpu versioni - size_t numSamples = this->height_; - size_t dim = this->width_; - CHECK_EQ(b.height_, numSamples); - CHECK_EQ(b.width_, dim); - const real* in = this->data_; - real* out = b.data_; - - // out = - in - const float THRESHOLD_MIN = -40.0; // make sure sigmoid(x) > 0 - const float THRESHOLD_MAX = 13.0; // make sure sigmoid(x) < 1 - for (size_t i = 0; i < numSamples * dim; ++i) { - real tmp = in[i]; - tmp = (tmp < THRESHOLD_MIN) - ? THRESHOLD_MIN - : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp); - out[i] = -tmp; - } - - // out = exp(out) - vExp(numSamples * dim, out, out); - - // out = 1 / (1 + out) - for (size_t i = 0; i < numSamples * dim; ++i) { - out[i] = 1 / (1 + out[i]); - } - } -} - -DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); -template -void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { - applyBinary(binary::SigmoidDerivative(), b); -} - -DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); -template -void BaseMatrixT::expDerivative(BaseMatrixT& b) { - applyBinary(binary::ExpDerivative(), b); -} - -DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); -template -void BaseMatrixT::sign2(BaseMatrixT& b) { - applyBinary(binary::Sign(), b); -} - -DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); -template <> -void BaseMatrixT::exp2(BaseMatrixT& b) { - applyBinary(binary::Exp(), b); -} - -DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); -template <> -void BaseMatrixT::log2(BaseMatrixT& b) { - if (useGpu_) { - applyBinary(binary::Log(), b); - } else { - vLog(height_ * width_, b.data_, data_); - } -} - -DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); -template <> -void BaseMatrixT::sqrt2(BaseMatrixT& b) { - applyBinary(binary::Sqrt(), b); -} - -DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); -template <> -void BaseMatrixT::invSqrt(BaseMatrixT& b) { - if (useGpu_) { - applyBinary(binary::InvSqrt(), b); - } else { // cpu branch - CHECK_EQ(height_, b.height_); - CHECK_EQ(width_, b.width_); - vInvSqrt(height_ * width_, b.data_, data_); - } -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); -template -void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { - applyBinary(binary::IsEqual(value), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); -template -void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { - applyBinary(binary::AddScalar(p), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); -template -void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { - applyBinary(binary::SubScalar(p), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); -template -void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { - applyBinary(binary::MulScalar(p), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); -template -void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { - applyBinary(binary::DivScalar(p), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); -template -void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { - applyBinary(binary::ScalarDiv(p), b); -} - -/** - * @brief ternary operator. 
- * - */ - -DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, - a = -c * log(b) - (1 - c) * log(1 - b)); -template <> -void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::SoftCrossEntropy(), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); -template -void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::SoftCrossEntropyBp(), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, - a = c > 0.5 ? -log(b) : -log(1.0 - b)); -template <> -void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, - BaseMatrixT& c) { - if (useGpu_) { - applyTernary(ternary::BinaryCrossEntropy(), b, c); - } else { - CHECK_EQ(height_, b.height_); - CHECK_EQ(height_, c.height_); - CHECK_EQ(width_, b.width_); - CHECK_EQ(width_, c.width_); - - size_t size = height_ * width_; - real* out = b.data_; - real* label = c.data_; - real* cost = data_; - - for (size_t i = 0; i < size; ++i) { - cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i]; - } - vLog(size, cost, cost); - for (size_t i = 0; i < size; ++i) { - cost[i] *= -1.0; - } - } -} - -DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, - a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)); -template -void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::BinaryCrossEntropyBp(), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); -template -void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::Add(), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); -template -void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { - applyTernary(ternary::Add1(p1, p2), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); -template -void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::Sub(), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); -template -void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { - applyTernary(ternary::Sub1(p1, p2), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); -template -void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::Add2(), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, - THREE_PARAMETER, - a = p1 * a + p2 * b + p3 * c); -template -void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { - applyTernary(ternary::Add3(p1, p2, p3), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, - THREE_PARAMETER, - c = p2 * c - p1 * (b + p3 * a); - a = a + c); -template -void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad - BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate - applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); -} - -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, - THREE_PARAMETER, - c = p2 * c - p1 * d * (b + p3 * a); - a += c); -template -void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, - BaseMatrixT& c, // mom, - BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3) { // decayRate - applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; - a = (a > lambda) - ? (a - lambda) - : (a < -lambda) ? 
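// Illustrative sketch, not from the Paddle sources: the SgdUpdate op above applies
// momentum SGD with weight decay element-wise, i.e. mom = p2*mom - p1*(grad + p3*value)
// followed by value += mom. Written out for a plain array:
#include <cstddef>

void sgdUpdateSketch(float* value, const float* grad, float* mom, std::size_t n,
                     float learningRate, float momentum, float decayRate) {
  for (std::size_t i = 0; i < n; ++i) {
    mom[i] = momentum * mom[i] - learningRate * (grad[i] + decayRate * value[i]);
    value[i] += mom[i];
  }
}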
(a + lambda) : 0); -template -void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { - applyBinary(binary::ApplyL1(learningRate * decayRate), lr); -} - -template <> -void BaseMatrixT::applyL1(BaseMatrixT& lr, - real learningRate, - real decayRate) { - if (useGpu_) { - applyBinary(binary::ApplyL1(learningRate * decayRate), lr); - } else { - simd::decayL1(this->data_, - this->data_, - lr.data_, - learningRate * decayRate, - height_ * width_); - } -} - -DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; - a = (a > lambda) - ? (a - lambda) - : (a < -lambda) ? (a + lambda) : 0); -template -void BaseMatrixT::applyL1(T learningRate, T decayRate) { - applyUnary(unary::ApplyL1(learningRate * decayRate)); -} - -template <> -void BaseMatrixT::applyL1(real learningRate, real decayRate) { - if (useGpu_) { - applyUnary(unary::ApplyL1(learningRate * decayRate)); - } else { - simd::decayL1( - this->data_, this->data_, learningRate * decayRate, height_ * width_); - } -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, - ONE_PARAMETER, - a *= (1.0f / (1.0f + p * b))); -template -void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { - if (useGpu_) { - applyBinary(binary::ApplyL2(learningRate * decayRate), lr); - } else { - size_t size = this->height_ * this->width_; - T decay = learningRate * decayRate; - for (size_t j = 0; j < size; ++j) { - this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]); - } - } -} - -template -void BaseMatrixT::applyL2(T learningRate, T decayRate) { - BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); -} - -DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); -template -void BaseMatrixT::dotMul(BaseMatrixT& b) { - applyBinary(binary::DotMul(), b); -} - -DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); -template -void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::DotMul(), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c); -template -void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::DotDiv(), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, - TWO_PARAMETER, - a = (b + p1) / (c + p2)); -template -void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { - applyTernary(ternary::DotDiv2P(p1, p2), b, c); -} - -DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; - a = (a > THRESHOLD) - ? THRESHOLD - : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = log(1 + exp(a)) - a * d); -template <> -void BaseMatrixT::rankLoss(BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d) { - applyQuaternary(quaternary::RankLoss(), b, c, d); -} - -DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; - a = (a > THRESHOLD) - ? THRESHOLD - : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - a = exp(a); - a = (a / (1 + a) - d)); -template <> -void BaseMatrixT::rankLossBp(BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d) { - applyQuaternary(quaternary::RankLossBp(), b, c, d); -} - -/* this = log(1 + exp(b)) - c * b */ -DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; - T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) - ? -THRESHOLD - : b; - a = log(1 + exp(x)) - c * x); -template <> -void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::LogisticRegressionLoss(), b, c); -} - -/* this = exp(b)/(1+exp(b)) - c */ -DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; - T x = (b > THRESHOLD) ? 
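// Illustrative sketch, not from the Paddle sources: the ApplyL1 op above (and the SIMD
// decayL1 path) shrinks each weight towards zero by lambda = learningRate * decayRate
// and snaps to zero anything that would cross it (soft-thresholding):
#include <cstddef>

void applyL1Sketch(float* w, std::size_t n, float lambda) {
  for (std::size_t i = 0; i < n; ++i) {
    if (w[i] > lambda) {
      w[i] -= lambda;
    } else if (w[i] < -lambda) {
      w[i] += lambda;
    } else {
      w[i] = 0.0f;
    }
  }
}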
THRESHOLD : (b < -THRESHOLD) - ? -THRESHOLD - : b; - x = exp(x); - a = x / (1 + x) - c); -template <> -void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, - BaseMatrixT& c) { - applyTernary(ternary::LogisticRegressionLossBp(), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); -template -void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::BiggerThan(), b, c); -} - -DEFINE_MATRIX_QUATERNARY_OP( - BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); -template -void BaseMatrixT::biggerThan(BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d) { - applyQuaternary(quaternary::BiggerThan(), b, c, d); -} - -DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); -template -void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::Max(), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, - ONE_PARAMETER, - c += ((a > p) == (b > p)) ? 0.0f : 1.0f); -template -void BaseMatrixT::binaryClassificationError2(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c, - T p) { - CHECK(!useGpu_) << "do not support gpu"; - MatrixOffset offset(0, 0, 0, 0, destCol, 0); - int numRows = b.height_; - int numCols = b.width_; - b.applyTernary(ternary::BinaryClassificationError(p), - c, - *this, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); -} - -template <> -void BaseMatrixT::binaryClassificationError(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c, - real p) { - MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - int numRows = b.height_; - int numCols = b.width_; - aggregate(aggregate::sum(), - base::binary::classificationError(p), - base::binary::add(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); -} - -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, - THREE_PARAMETER, - a = p1 * b + p2 * c + p3 * d); -template -void BaseMatrixT::add3( - BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { - applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); -} - -DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); -template -void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::DotMulSquare(), b, c); -} - -DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); -template -void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { - applyTernary(ternary::DotSquareSquare(), b, c); -} - -DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); -template -void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { - applyBinary(binary::DotMulSquare(), b); -} - -DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b); -template -void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { - applyBinary(binary::DotSquareMul(), b); -} - -DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, - THREE_PARAMETER, - T tmp = p1 * b + p2 * c + p3 * d; - a += tmp * tmp); -template -void BaseMatrixT::addSquareSum( - BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { - applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); -template -void BaseMatrixT::addSquare(BaseMatrixT& b, T p) { - applyBinary(binary::AddSquare(p), b); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, - TWO_PARAMETER, - a = p1 * a + p2 * b * b); -template -void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { - applyBinary(binary::DecayAddSquare(p1, p2), b); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, 
- TWO_PARAMETER, - a = p1 * a + p2 * b * b * c * c); -template -void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, - BaseMatrixT& c, - T p1, - T p2) { - applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, - THREE_PARAMETER, - a = 1 / (p1 * b + p2 * c + p3)); -template -void BaseMatrixT::reciprocalSum( - BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { - applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, - TWO_PARAMETER, - a = 1 / (p1 * b + p2)); -template -void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { - applyBinary(binary::Reciprocal2(p1, p2), b); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, - TWO_PARAMETER, - T tmp = p1 * b + p2 * c; - a *= tmp * tmp); -template -void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, - BaseMatrixT& c, - T p1, - T p2) { - applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, - TWO_PARAMETER, - T tmp = p1 * b + p2 * c; - a = tmp * tmp); -template -void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { - applyTernary(ternary::DotSquareSum(p1, p2), b, c); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, - TWO_PARAMETER, - a *= p1 * b + p2 * c); -template -void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { - applyTernary(ternary::DotMulSum(p1, p2), b, c); -} - -DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); -template -void BaseMatrixT::copyAndClear(BaseMatrixT& b) { - applyBinary(binary::CopyAndClear(), b); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, - TWO_PARAMETER, - a = p1 * a + p2 * b * c); -template -void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { - applyTernary(ternary::AddDotMul(p1, p2), b, c); -} - -DEFINE_MATRIX_BINARY_OP(Assign, a = b;); -template -void BaseMatrixT::assign(BaseMatrixT& b) { - if (useGpu_) { - applyBinary(binary::Assign(), b); - } else { // cpu version - CHECK_EQ(this->height_, b.height_); - CHECK_EQ(this->width_, b.width_); - memcpy(data_, b.data_, sizeof(T) * height_ * width_); - } -} - -template -void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { - if (columnOffset + b.width_ <= width_) { - int numRows = height_; - int numCols = b.width_; - MatrixOffset offset(columnOffset, 0, 0, 0); - applyBinary(binary::Assign(), b, numRows, numCols, offset); - } else if (columnOffset + width_ <= b.width_) { - int numRows = height_; - int numCols = width_; - MatrixOffset offset(0, 0, columnOffset, 0); - applyBinary(binary::Assign(), b, numRows, numCols, offset); - } else { - LOG(FATAL) << "Wrong argument " - << " a.width=" << width_ << " b.width=" << b.width_ - << " columnOffset=" << columnOffset; - } -} - -DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp); -template -void BaseMatrixT::deepSwap(BaseMatrixT& b) { - applyBinary(binary::DeepSwap(), b); -} - -template <> -void BaseMatrixT::rowDotMul(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c) { - int numRows = b.height_; - int numCols = b.width_; - MatrixOffset offset(destCol, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), - base::binary::mul(), - base::binary::add(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); -} - -template -void BaseMatrixT::rowDotMul2(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c) { - CHECK(!useGpu_) << "do not support gpu"; - - size_t height = this->height_; - CHECK_LT(destCol, this->width_); - 
CHECK_EQ(height, b.height_); - CHECK_EQ(height, c.height_); - CHECK_EQ(b.width_, c.width_); - size_t width = b.width_; - T* A = this->data_; - const T* B = b.data_; - const T* C = c.data_; - for (size_t i = 0; i < height; - ++i, A += this->width_, B += width, C += width) { - for (size_t j = 0; j < width; ++j) { - A[destCol] += B[j] * C[j]; - } - } -} - -template <> -void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - int numRows = b.height_; - int numCols = b.width_; - aggregate(aggregate::sum(), - base::binary::mul(), - base::binary::add(), - b, - c, - numRows, - numCols, - offset, - true_type() /*aAsRowVector*/, - false_type()); -} - -template -void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { - CHECK(!useGpu_) << "do not support gpu"; - - CHECK_EQ(height_, 1LU); - CHECK_EQ(b.height_, c.height_); - CHECK_EQ(width_, b.width_); - CHECK_EQ(width_, c.width_); - size_t height = b.height_; - size_t width = b.width_; - T* A = this->data_; - const T* B = b.data_; - const T* C = c.data_; - for (size_t i = 0; i < height; ++i, B += width, C += width) { - for (size_t j = 0; j < width; ++j) { - A[j] += B[j] * C[j]; - } - } -} - -DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); -template -void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::addDotMulMMV(), - b, - c, - numRows, - numCols, - offset, - true_type() /*cAsRowVector*/, - false_type()); -} - -template -void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { - CHECK(!useGpu_) << "do not support gpu"; - - CHECK_EQ(c.height_, 1LU); - CHECK_EQ(height_, b.height_); - CHECK_EQ(width_, b.width_); - CHECK_EQ(width_, c.width_); - size_t height = height_; - size_t width = width_; - T* A = this->data_; - const T* B = b.data_; - const T* C = c.data_; - for (size_t i = 0; i < height; ++i, A += width, B += width) { - for (size_t j = 0; j < width; ++j) { - A[j] += B[j] * C[j]; - } - } -} - -template -void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, cCol, 0); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::DotMul(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); -} - -template -void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { - CHECK(!useGpu_) << "do not support gpu"; - - size_t height = this->height_; - size_t width = this->width_; - CHECK_EQ(height, b.height_); - CHECK_EQ(width, b.width_); - CHECK_LT(cCol, c.width_); - CHECK_EQ(height, c.height_); - T* A = this->data_; - const T* B = b.data_; - const T* C = c.data_; - for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) { - for (size_t j = 0; j < width; ++j) { - A[j] = B[j] * C[cCol]; - } - } -} - -template -void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, 0, cRow); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::DotMul(), - b, - c, - numRows, - numCols, - offset, - true_type() /* cAsRowVector */, - false_type() /* cAsColVector */); -} - -template -void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, 0, cRow); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::addDotMulMMV(), - b, - c, - numRows, - numCols, - offset, - true_type() /* cAsRowVector */, - false_type() 
/* cAsColVector */); -} - -template -void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, cCol, 0); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::addDotMulMMV(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); -} - -DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); -template -void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { - MatrixOffset offset(0, 0, 0, 0, cCol, 0); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::RowAdd(p), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); -} - -DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c)); -template <> -void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { - if (useGpu_) { - MatrixOffset offset(0, 0, 0, 0, cCol, 0); - int numRows = height_; - int numCols = width_; - applyTernary(ternary::RowPow(), - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*cAsColVector*/); - } else { - size_t height = this->height_; - size_t width = this->width_; - CHECK_EQ(height, b.height_); - CHECK_EQ(width, b.width_); - CHECK_LT(cCol, c.width_); - CHECK_EQ(height, c.height_); - real* A = this->data_; - const real* B = b.data_; - const real* C = c.data_; - for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) { - vPow(width, B, C[cCol], A); - } - } -} - -template -void BaseMatrixT::mulRowVector(BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::DotMul(), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); -} - -DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); -template -void BaseMatrixT::divRowVector(BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::DotDiv(), - b, - numRows, - numCols, - offset, - true_type() /* bAsRowVector */, - false_type()); -} - -template -void BaseMatrixT::mulColVector(BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::DotMul(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /* bAsColVector */); -} - -template -void BaseMatrixT::divColVector(BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0); - int numRows = height_; - int numCols = width_; - applyBinary(binary::DotDiv(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /* bAsColVector */); -} - -template <> -template -int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - size_t numRows = b.height_; - size_t numCols = b.width_; - CHECK_EQ(height_, numRows); - CHECK_EQ(width_, 1UL); - aggregate(agg, - base::unary::identity(), - base::binary::second(), - b, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); - - return 0; -} - -template <> -template -int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - size_t numRows = b.height_; - size_t numCols = b.width_; - CHECK_EQ(height_, numRows); - CHECK_EQ(width_, 1UL); - aggregate(agg, - base::unary::identity(), - sv, - b, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); - - return 0; -} - -template <> -template -int BaseMatrixT::applyRow(Agg agg, - real scaleDest, - real scaleAgg, - BaseMatrixT& b) { - if (scaleDest != 0) 
{ - applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); - } else { - applyRow(agg, base::binary::second(), b); - if (scaleAgg != 1) { - mulScalar(scaleAgg); - } - } - return 0; -} - -template <> -template -int BaseMatrixT::applyRow( - Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - size_t numRows = b.height_; - size_t numCols = b.width_; - CHECK_EQ(height_, numRows); - CHECK_EQ(width_, 1UL); - CHECK_EQ(c.height_, numRows); - CHECK_EQ(c.width_, numCols); - aggregate(agg, - op, - sv, - b, - c, - numRows, - numCols, - offset, - false_type(), - true_type() /*aAsColVector*/); - return 0; -} - -template <> -template -int BaseMatrixT::applyRow(Agg agg, - Op op, - real scaleDest, - real scaleAgg, - BaseMatrixT& b, - BaseMatrixT& c) { - if (scaleDest != 0) { - applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); - } else { - applyRow(agg, op, base::binary::second(), b, c); - if (scaleAgg != 1) { - mulScalar(scaleAgg); - } - } - return 0; -} - -template <> -template -int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - size_t numRows = b.height_; - size_t numCols = b.width_; - CHECK_EQ(width_, numCols); - CHECK_EQ(height_, 1UL); - aggregate(agg, - base::unary::identity(), - base::binary::second(), - b, - numRows, - numCols, - offset, - true_type() /*aAsRowVector*/, - false_type()); - - return 0; -} - -template <> -template -int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { - MatrixOffset offset(0, 0, 0, 0, 0, 0); - size_t numRows = b.height_; - size_t numCols = b.width_; - CHECK_EQ(width_, numCols); - CHECK_EQ(height_, 1UL); - aggregate(agg, - base::unary::identity(), - sv, - b, - numRows, - numCols, - offset, - true_type() /*aAsRowVector*/, - false_type()); - - return 0; -} - -template <> -template -int BaseMatrixT::applyCol(Agg agg, - real scaleDest, - real scaleAgg, - BaseMatrixT& b) { - if (scaleDest != 0) { - applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); - } else { - applyCol(agg, base::binary::second(), b); - if (scaleAgg != 1) { - mulScalar(scaleAgg); - } - } - return 0; -} - -template <> -void BaseMatrixT::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) { - applyRow(aggregate::sum(), scaleDest, scaleSum, b); -} - -template <> -void BaseMatrixT::maxRows(BaseMatrixT& b) { - applyRow(aggregate::max(), b); -} - -template <> -void BaseMatrixT::minRows(BaseMatrixT& b) { - applyRow(aggregate::min(), b); -} - -template <> -void BaseMatrixT::maxCols(BaseMatrixT& b) { - applyCol(aggregate::max(), b); -} - -template <> -void BaseMatrixT::minCols(BaseMatrixT& b) { - applyCol(aggregate::min(), b); -} - -template <> -void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { - applyCol(aggregate::sum(), scaleDest, scaleSum, b); -} - -template <> -void BaseMatrixT::sumOfSquaredDiffs(BaseMatrixT& b, - BaseMatrixT& c, - real scaleSum, - real scaleDest) { - applyRow( - aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c); -} - -template <> -void BaseMatrixT::sumOfProducts(BaseMatrixT& b, - BaseMatrixT& c, - real scaleSum, - real scaleDest) { - applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c); -} - -template class BaseMatrixT; - -#ifndef PADDLE_MOBILE_INFERENCE - -template class BaseMatrixT; - -#else - -template <> -void BaseMatrixT::zero() { - applyUnary(unary::Zero()); -} - -template <> -void BaseMatrixT::assign(int p) { - applyUnary(unary::Assign(p)); -} - -template <> -void 
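// Illustrative sketch, not from the Paddle sources: sumRows(b, scaleSum, scaleDest) above
// reduces every row of b and blends the result into the destination column vector as
// dst[i] = scaleDest * dst[i] + scaleSum * sum(b[i][:]). For a dense row-major matrix:
#include <cstddef>

void sumRowsSketch(float* dst, const float* b, std::size_t height, std::size_t width,
                   float scaleSum, float scaleDest) {
  for (std::size_t i = 0; i < height; ++i) {
    float acc = 0.0f;
    for (std::size_t j = 0; j < width; ++j) {
      acc += b[i * width + j];
    }
    dst[i] = scaleDest * dst[i] + scaleSum * acc;
  }
}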
BaseMatrixT::isEqualTo(BaseMatrixT& b, int value) { - applyBinary(binary::IsEqual(value), b); -} - -template <> -void BaseMatrixT::neg() { - applyUnary(unary::Neg()); -} - -template <> -void BaseMatrixT::abs2() { - applyUnary(unary::Abs()); -} - -template <> -void BaseMatrixT::add(int p) { - applyUnary(unary::Add(p)); -} - -template <> -void BaseMatrixT::add(int p1, int p2) { - applyUnary(unary::Add2(p1, p2)); -} - -template <> -void BaseMatrixT::applyL1(int learningRate, int decayRate) { - applyUnary(unary::ApplyL1(learningRate * decayRate)); -} - -#endif -} // namespace paddle diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h deleted file mode 100644 index 1958629aa0354fcc332b1e5677a64c29397e0d26..0000000000000000000000000000000000000000 --- a/paddle/math/BaseMatrix.h +++ /dev/null @@ -1,1095 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "TensorExpression.h" -#include "paddle/utils/Common.h" - -namespace paddle { - -/* - * nvcc currently does not support C++11, - * so I realized false_type and true_type. - */ -template -struct bool_constant { - static const T value = v; -}; -typedef bool_constant false_type; -typedef bool_constant true_type; - -/** - * @brief Calculate matrix element address. - * - * For instance, address of A[i][j] = i * ld + j. - * - */ -#define CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row) \ - CHECK_LE(col, width); \ - CHECK_LE(row, height); \ - address += row * ld + col; - -class MatrixOffset { - public: - size_t aCol_; - size_t aRow_; - size_t bCol_; - size_t bRow_; - size_t cCol_; - size_t cRow_; - size_t dCol_; - size_t dRow_; - MatrixOffset(size_t aCol = 0, - size_t aRow = 0, - size_t bCol = 0, - size_t bRow = 0, - size_t cCol = 0, - size_t cRow = 0, - size_t dCol = 0, - size_t dRow = 0) - : aCol_(aCol), - aRow_(aRow), - bCol_(bCol), - bRow_(bRow), - cCol_(cCol), - cRow_(cRow), - dCol_(dCol), - dRow_(dRow) {} -}; - -template -class BaseMatrixT : public TensorExpression, T> { - public: - size_t height_, width_; - size_t stride_; - T* data_; - bool trans_; - bool useGpu_; - - public: - virtual ~BaseMatrixT() {} - BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu) - : height_(height), - width_(width), - stride_(width), - data_(data), - trans_(trans), - useGpu_(useGpu) {} - - /** - * @note This constructor is for temporarily making a matrix with different - * useGpu flag as the original matrix so that mixed gpu/cpu operations - * can be performed successfully. 
- */ - BaseMatrixT(BaseMatrixT& mat, bool useGpu) - : height_(mat.height_), - width_(mat.width_), - stride_(mat.stride_), - data_(mat.data_), - trans_(mat.trans_), - useGpu_(useGpu) {} - - BaseMatrixT(size_t height, - size_t width, - size_t stride, - T* data, - bool trans, - bool use_gpu) - : height_(height), - width_(width), - stride_(stride), - data_(data), - trans_(trans), - useGpu_(use_gpu) { - /* CHECK_LE(width_, stride_); */ - } - - /// caller should make sure that the size of data is at least height*width - void setData(T* data) { data_ = data; } - - /** - * unary operator: element wise op(a). - * - * @code - * for 0 <= i < this->height_ & for 0 <= j < this->width_. - * @endcode - */ - template - int applyUnary(Op op); - - /** - * unary operator: element wise op(a). - * - * @code - * for 0 <= i < numRows & for 0 <= j < numCols. - * While matrix start address is: - * A = this->data_ + offset.aRow_*ld + offset.aCol_; - * @endcode - */ - template - int applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset); - - /** - * binary operator: element wise op(a, b). - * - * @code - * for 0 <= i < this->height_ & for 0 <= j < this->width_. - * While this->height_ == b.height_ && this->width_ == b.width_. - * @endcode - */ - template - int applyBinary(Op op, BaseMatrixT& b); - - /** - * binary operator: element wise op(a, b) - * - * @code - * for 0 <= i < numRows & for 0 <= j < numCols. - * While matrix start address is: - * A = this->data_ + offset.aRow_*lda + offset.aCol_; - * B = b->data_ + offset.bRow_*ldb + offset.bCol_; - * - * if (bAsRowVector == false_type && bAsColVector == false_type) - * op(A[i * lda + j], B[i * ldb + j]) - * - * if (bAsRowVector == true_type && bAsColVector == false_type) - * op(A[i * lda + j], B[j]) - * - * if (bAsRowVector == false_type && bAsColVector == true_type) - * op(A[i * lda + j], B[i * ldb]) - * - * if (bAsRowVector == true_type && bAsColVector == true_type) - * op(A[i * lda + j], B[0]) - * @endcode - */ - template - int applyBinary(Op op, - BaseMatrixT& b, - int numRows, - int numCols, - MatrixOffset& offset, - bAsRowVector, - bAsColVector); - - template - int applyBinary( - Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset); - - /** - * ternary operator: element wise op(a, b, c). - * - * @code - * for 0 <= i < this->height_ & for 0 <= j < this->width_. - * - * While this->height_ == b.height_ && this->width_ == b.width_ - * && this->height_ == c.height_ && this->width_ == c.width_ - * @endcode - */ - template - int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c); - - /** - * ternary operator: element wise op(a, b, c). - * - * @code - * for 0 <= i < numRows & for 0 <= j < numCols. 
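[Editor's note] The bAsRowVector/bAsColVector tags documented for applyBinary() above select how the second operand B is broadcast against A. A hedged sketch of the four documented cases on raw pointers; applyBinaryBroadcast is a hypothetical helper, not the real dispatcher.

#include <cstddef>

// Sketch of the broadcast rules documented above for applyBinary():
// both flags false -> element-wise; bAsRowVector -> B indexed by column only;
// bAsColVector -> B indexed by row only; both set -> B[0] broadcast as a scalar.
template <class Op>
void applyBinaryBroadcast(Op op, float* A, const float* B,
                          std::size_t numRows, std::size_t numCols,
                          std::size_t lda, std::size_t ldb,
                          bool bAsRowVector, bool bAsColVector) {
  for (std::size_t i = 0; i < numRows; ++i) {
    for (std::size_t j = 0; j < numCols; ++j) {
      const float* b = &B[i * ldb + j];            // element-wise case
      if (bAsRowVector && bAsColVector) b = &B[0];        // scalar broadcast
      else if (bAsRowVector)            b = &B[j];        // one row reused for every row
      else if (bAsColVector)            b = &B[i * ldb];  // one column reused for every column
      op(A[i * lda + j], *b);
    }
  }
}

For instance, the removed mulRowVector()/divRowVector() correspond to op being a *= b or a /= b with the row-vector flag set.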
- * While matrix start address is: - * - * A = this->data_ + offset.aRow_*lda + offset.aCol_; - * B = b->data_ + offset.bRow_*ldb + offset.bCol_; - * C = c->data_ + offset.cRow_*ldc + offset.cCol_; - * - * if (cAsRowVector == false_type && cAsColVector == false_type) - * op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j]) - * - * if (cAsRowVector == true_type && cAsColVector == false_type) - * op(A[i*lda + j], B[i*ldb + j], C[j]) - * - * if (cAsRowVector == false_type && cAsColVector == true_type) - * op(A[i*lda + j], B[i*ldb + j], C[i*ldc]) - * - * if (cAsRowVector == 1 && cAsColVector == 1) - * op(A[i*lda + j], B[i*ldb + j], C[0]) - * @endcode - */ - template - int applyTernary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset, - cAsRowVector, - cAsColVector); - - template - int applyTernary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset); - - /** - * quaternary operator: element wise op(a, b, c, d). - * - * @code - * for 0 <= i < this->height_ & for 0 <= j < this->width_. - * - * While this->height_ == b.height_ && this->width_ == b.width_ - * && this->height_ == c.height_ && this->width_ == c.width_ - * && this->height_ == d.height_ && this->width_ == d.width_ - * @endcode - */ - template - int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d); - - /** - * quaternary operator: element wise op(a, b, c, d). - * - * @code - * for 0 <= i < numRows & for 0 <= j < numCols. - * While matrix start address is: - * A = this->data_ + offset.aRow_*lda + offset.aCol_; - * B = b->data_ + offset.bRow_*ldb + offset.bCol_; - * C = c->data_ + offset.cRow_*ldc + offset.cCol_; - * D = d->data_ + offset.dRow_*ldd + offset.dCol_; - * @endcode - */ - template - int applyQuaternary(Op op, - BaseMatrixT& b, - BaseMatrixT& c, - BaseMatrixT& d, - int numRows, - int numCols, - MatrixOffset& offset); - - /** - * a aggregate expression that apply each row(or column) of matrix b. - * op and sv is element wise operator. - * - * @code - * if (aAsRowVector == true_type && aAsColVector == false_type) - * for each column j & 0 <= i < numRows, do: - * dst = agg(op(b[i*ldb + j])) - * a[j] = sv(a[j], dst) - * - * if (aAsRowVector == false_type && aAsColVector == true_type) - * for each row i & 0 <= j < numCols, do: - * dst = agg(op(b[i*ldb + j])) - * a[i] = sv(a[i], dst) - * @endcode - */ - template - int aggregate(Agg agg, - Op op, - Saver sv, - BaseMatrixT& b, - int numRows, - int numCols, - MatrixOffset& offset, - aAsRowVector, - aAsColVector); - - /** - * a aggregate expression that apply each row(or column) of matrix b and c. - * - * op and sv is element wise operator. - * - * @code - * if (aAsRowVector == true_type && aAsColVector == false_type) - * for each column j & 0 <= i < numRows, do: - * dst = agg(op(b[i*ldb + j], c[i*ldc + j])) - * a[j] = sv(a[j], dst) - * - * if (aAsRowVector == false_type && aAsColVector == true_type) - * for each row i & 0 <= j < numCols, do: - * dst = agg(op(b[i*ldb + j], c[i*ldc + j])) - * a[i] = sv(a[i], dst) - * @endcode - */ - template - int aggregate(Agg agg, - Op op, - Saver sv, - BaseMatrixT& b, - BaseMatrixT& c, - int numRows, - int numCols, - MatrixOffset& offset, - aAsRowVector, - aAsColVector); - - /** - * a aggregate expression that apply each row of matrix b. 
- * - * @code - * for each row i & 0 <= j < b.width_, do: - * this[i] = agg(b[i*ldb + j]) - * @endcode - */ - template - int applyRow(Agg agg, BaseMatrixT& b); - - /** - * a aggregate expression that apply each row of matrix b. - * - * @code - * for each row i & 0 <= j < b.width_, do: - * dst = agg(op(b[i*ldb + j], c[i*ldc + j]) - * this[i] = sv(this[i], dst) - * @endcode - */ - template - int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c); - - // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) - template - int applyRow(Agg agg, - Op op, - real scaleDest, - real scaleAgg, - BaseMatrixT& b, - BaseMatrixT& c); - - /** - * a aggregate expression that apply each row of matrix b. - * - * @code - * for each row i & 0 <= j < b.width_, do: - * dst = agg(b[i*ldb + j]) - * this[i] = sv(this[i], dst) - * @endcode - */ - template - int applyRow(Agg agg, Saver sv, BaseMatrixT& b); - - // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) - template - int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); - - /** - * a aggregate expression that apply each column of matrix b. - * - * @code - * for each column j & 0 <= i < b.height_, do: - * this[j] = agg(b[i*ldb + j]) - * @endcode - */ - template - int applyCol(Agg agg, BaseMatrixT& b); - - /** - * a aggregate expression that apply each column of matrix b. - * - * @code - * for each column j & 0 <= i < b.height_, do: - * dst = agg(b[i*ldb + j]) - * this[j] = sv(this[j], dst) - * @endcode - */ - template - int applyCol(Agg agg, Saver sv, BaseMatrixT& b); - - // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) - template - int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); - - bool useGpu() const { return useGpu_; } - - const T* rowBuf(size_t row) const { return data_ + width_ * row; } - - T* rowBuf(size_t row) { return data_ + width_ * row; } - - /** - * @brief unary operator. - * - */ - void neg(); - void exp2(); - void pow2(T p); - void log2(); - void sqrt2(); - void square2(); - void reciprocal2(); - void abs2(); - void sign2(); - void zero(); - - /** - * @code - * this(row, col + columnOffset) = 0 for 0 <= col < numColumns - * @endcode - */ - void zeroAtOffset(int64_t columnOffset, int64_t numColumns); - void one(); - void subScalar(T p); - void mulScalar(T p); - void divScalar(T p); - - /** - * @code - * this = p - * @endcode - */ - void assign(T p); - - /** - * @code - * swap(this, b) - * example: swap two Matrices - * MatrixPtr cpuA = std::make_shared(height, width); - * MatrixPtr cpuB = std::make_shared(height, width); - * cpuA->deepSwap(*cpuB); - * @endcode - */ - void deepSwap(BaseMatrixT& b); - - /** - * @code - * this = this + p - * @endcode - */ - void add(T p); - - /** - * @code - * this = this*p1 + p2 - * @endcode - */ - void add(T p1, T p2); - - /** - * this = this < low ? low : this - * - * this = this > high ? high : this - */ - void clip(T p1, T p2); - - /** - * this = b < low ? 0 : 1 - * - * this = b > high ? 0 : 1 - */ - void clipDerivative(BaseMatrixT& b, T p1, T p2); - - /** - * @code - * a = a > p ? 1.0f : 0.0f - * @endcode - */ - void biggerThanScalar(T p); - - /** - * @code - * a = a > p ? 
a : p - * @endcode - */ - void downClip(T p); - - /** - * @code - * this = b - * @endcode - */ - void assign(BaseMatrixT& b); - - /** - * @code - * If b.width + columOffset <= this.width - * this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width - * - * If this.width + columnOffset <= b.width - * this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width - * - * Otherwise, FATAL - * @endcode - */ - void assignAtOffset(BaseMatrixT& b, int64_t columnOffset); - - /// this = this + b - void add(BaseMatrixT& b); - - /** - * @code - * If b.width + columOffset <= this.width - * this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width - * - * If this.width + columnOffset <= b.width - * this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width - * - * Otherwise, FATAL - * @endcode - */ - void addAtOffset(BaseMatrixT& b, int64_t columnOffset); - - void addColVector(BaseMatrixT& b); - void addRowVector(BaseMatrixT& b); - void addBias(BaseMatrixT& b, T scale); - - void mulRowVector(BaseMatrixT& b); - void divRowVector(BaseMatrixT& b); - - void mulColVector(BaseMatrixT& b); - void divColVector(BaseMatrixT& b); - - void addP2P(BaseMatrixT& b); - - /** - * @code - * this = this + b*p - * @endcode - */ - void add(BaseMatrixT& b, T p); - - /** - * @code - * this = p1*this + p2*b - * @endcode - */ - void add(BaseMatrixT& b, T p1, T p2); - - /** - * @code - * this = this - b - * @endcode - */ - void sub(BaseMatrixT& b); - - /** - * @code - * this = this - b*p - * @endcode - */ - void sub(BaseMatrixT& b, T p); - - /** - * @code - * b = max(0, this) - * @endcode - */ - void relu(BaseMatrixT& b); - void reluDerivative(BaseMatrixT& b); - - /** - * @code - * b = log(1.0 + exp(this)) - * @endcode - */ - void softrelu(BaseMatrixT& b); - void softreluDerivative(BaseMatrixT& b); - - /** - * @code - * b = min(max(this, p1), p2) - * @endcode - */ - void brelu(BaseMatrixT& b); - void breluDerivative(BaseMatrixT& b); - - /** - * @code - * b = this * this - * @endcode - */ - void square2(BaseMatrixT& b); - void squareDerivative(BaseMatrixT& b); - - /** - * @code - * b = tanh(this) - * @endcode - */ - void tanh(BaseMatrixT& b); - void tanhDerivative(BaseMatrixT& b); - - /** - * @code - * b = p1 * tanh(p2 * this) - * @endcode - */ - void scaledTanh(BaseMatrixT& b, T p1, T p2); - void scaledTanhDerivative(BaseMatrixT& b, T p1, T p2); - - /** - * @code - * b = 1.0f / this - * @endcode - */ - void reciprocal2(BaseMatrixT& b); - void reciprocalDerivative(BaseMatrixT& b); - - /** - * @code - * b = this > 0.0f ? this : -this - * @endcode - */ - void abs2(BaseMatrixT& b); - void absDerivative(BaseMatrixT& b); - - /** - * @code - * b = 1.0f / (1.0f + exp(-this)) - * @endcode - */ - void sigmoid(BaseMatrixT& b); - void sigmoidDerivative(BaseMatrixT& b); - - /** - * @code - * b = a - * @endcode - */ - void expDerivative(BaseMatrixT& b); - - void sign2(BaseMatrixT& b); - - void exp2(BaseMatrixT& b); - void pow2(BaseMatrixT& b, T p); - void log2(BaseMatrixT& b); - void sqrt2(BaseMatrixT& b); - void addScalar(BaseMatrixT& b, T p); - void subScalar(BaseMatrixT& b, T p); - void mulScalar(BaseMatrixT& b, T p); - void divScalar(BaseMatrixT& b, T p); - void scalarDiv(BaseMatrixT& b, T p); - - /** - * @code - * this = 1.0f / sqrt(b) - * @endcode - */ - void invSqrt(BaseMatrixT& b); - - /// this = (b == value) - void isEqualTo(BaseMatrixT& b, T value); - - /** - * @brief ternary operator. 
- */ - void softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c); - void softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c); - void binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c); - void binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = b + c - * @endcode - */ - void add(BaseMatrixT& b, BaseMatrixT& c); - /** - * @code - * this = b*p1 + c*p2 - * @endcode - */ - void add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2); - /** - * @code - * this = b - c - * @endcode - */ - void sub(BaseMatrixT& b, BaseMatrixT& c); - /** - * @code - * this = b*p1 - c*p2 - * @endcode - */ - void sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2); - - /** - * @code - * this = this + b + c - * @endcode - */ - void add2(BaseMatrixT& b, BaseMatrixT& c); - /** - * @code - * this = this*p1 + b*p2 + c*p3 - * @endcode - */ - void add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3); - - /** - * @code - * this = a*p1 + b*p2 + c*p3 - * @endcode - */ - void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3); - - /** - * @code - * c = p2 * c - p1 * (b + p3 * this) - * this += mom - * @endcode - */ - void sgdUpdate(BaseMatrixT& b, // grad - BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3); // decayRate - - /** - * @code - * c = p2 * c - p1 * d * (b + p3 * this) - * this += mom - * @endcode - */ - void sgdUpdate(BaseMatrixT& b, // grad, - BaseMatrixT& c, // mom, - BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3); // decayRate - - /// apply L1/L2 to *this* - virtual void applyL1(T learningRate, T decayRate); - void applyL1(BaseMatrixT& lr, T learningRate, T decayRate); - void applyL2(T learningRate, T decayRate); - void applyL2(BaseMatrixT& lr, T learningRate, T decayRate); - - /** - * @code - * this *= b - * @endcode - */ - void dotMul(BaseMatrixT& b); - - /** - * @code - * this = b * c - * @endcode - */ - void dotMul(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = b / c - * @endcode - */ - void dotDiv(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = (b + p1) / (c + p2) - * @endcode - */ - void dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); - - /** - * @code - * this = log(1 + exp(b - c)) - d * (b - c) - * @endcode - */ - void rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d); - void rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d); - - /** - * @code - * this = log(1 + exp(b)) - c * b - * @endcode - */ - void logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this += exp(b)/(1+exp(b)) - c - * @endcode - */ - void logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = b > c ? 1.0 : 0.0 - * @endcode - */ - void biggerThan(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = ((b>c && d>0.5) || (bc ? b : c - * @endcode - */ - void max2(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this[destCol] += (b>p1 == c>p1) ? 
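[Editor's note] The sgdUpdate() overloads above implement momentum SGD with weight decay; per their comments, the momentum buffer c is updated as p2*c - p1*(b + p3*this) and then added into the parameter. A per-element sketch with plain pointers, written as a standalone illustration rather than the Paddle entry point.

#include <cstddef>

// Illustrative per-element form of the update documented for sgdUpdate():
//   mom = momentum * mom - lr * (grad + decayRate * value);  value += mom;
void sgdUpdateSketch(float* value, const float* grad, float* mom, std::size_t n,
                     float lr, float momentum, float decayRate) {
  for (std::size_t i = 0; i < n; ++i) {
    mom[i] = momentum * mom[i] - lr * (grad[i] + decayRate * value[i]);
    value[i] += mom[i];
  }
}

The second overload documented above additionally multiplies the step by a per-element learning rate d, i.e. mom = momentum * mom - lr * d * (grad + decay * value).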
0 : 1) - * @endcode - */ - void binaryClassificationError(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c, - T p); - void binaryClassificationError2(size_t destCol, - BaseMatrixT& b, - BaseMatrixT& c, - T p); - - /** - * @code - * this = this * b * b - * @endcode - */ - void dotMulSquare(BaseMatrixT& b); - - /** - * @code - * this = this * this * b - * @endcode - */ - void dotSquareMul(BaseMatrixT& b); - - /** - * @code - * this = b * c * c - * @endcode - */ - void dotMulSquare(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = b * b * c * c - * @endcode - */ - void dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = this * (p1*b + p2*c)^2 - * @endcode - */ - void dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); - - /** - * @code - * this = (p1*b + p2*c)^2 - * @endcode - */ - void dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); - - /** - * @code - * this= this * (p1*b + p2*c) - * @endcode - */ - void dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); - - /** - * @code - * this += sqr(p1*b + p2*c + p3*d) - * @endcode - */ - void addSquareSum( - BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3); - - /** - * @code - * this += p * sqr(b) - * @endcode - */ - void addSquare(BaseMatrixT& b, T p); - - /** - * @code - * this = p1 * this + p2 * sqr(b) - * @endcode - */ - void decayAddSquare(BaseMatrixT& b, T p1, T p2); - - /** - * @code - * this = p1 * this + p2 * sqr(b * c) - * @endcode - */ - void decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); - - /** - * @code - * this = 1 / (p1 * b + p2) - * @endcode - */ - void reciprocal2(BaseMatrixT& b, T p1, T p2); - - /** - * @code - * this = 1 / (p1 * b + p2 * c + p3) - * @endcode - */ - void reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3); - - /** - * @code - * b = this; this = 0 - * @endcode - */ - void copyAndClear(BaseMatrixT& b); - - /** - * @code - * this_row[destCol] += dotprod(b_row, c_row) - * @endcode - */ - void rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c); - void rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c); - - /** - * this is vector (one row matrix) - * - * @code - * for each row i, do: - * this_row += dotmul(b_row_i, c_row_i) - * @endcode - */ - void addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c); - void addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c); - - /** - * c is vector (one row matrix) - * - * @code - * for each row i, do: - * this_row_i += dotmul(b_row_i, c_row) - * @endcode - */ - void addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c); - void addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this = p1 * this + p2 * b * c - * @endcode - */ - void addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2); - - /** - * @code - * this_row = b_row * c_row[cCol] - * @endcode - */ - void rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); - void rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this_col = b_col * c_col[cRow] - * @endcode - */ - void colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this_col += b_col * c_col[cRow] - * @endcode - */ - void addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c); - - /** - * @code - * this_row += b_row * c_row[cCol] - * @endcode - */ - void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); - - /// calculate the sum of each row of the matrix b. 
- /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} - void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest); - - /// calculate the maximum value of each row of the matrix b. - void maxRows(BaseMatrixT& b); - /// calculate the minimum value of each row of the matrix b. - void minRows(BaseMatrixT& b); - - /// calculate the maximum value of each column of the matrix b. - void maxCols(BaseMatrixT& b); - /// calculate the minimum value of each column of the matrix b. - void minCols(BaseMatrixT& b); - - /// calculate the sum of each column of the matrix b. - /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji} - void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest); - - /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2 - void sumOfSquaredDiffs(BaseMatrixT& b, - BaseMatrixT& c, - T scaleSum, - T scaleDest); - - /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij} - void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest); - - /** - * @code - * this_row = b_row + p * ones * c_row[cCol] - * @endcode - */ - void rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p); - /** - * @code - * this_row = pow(b_row, c_row[cCol]) - * @endcode - */ - void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); - - virtual bool isSparse() const { return false; } - - template - void operator=(const ExpressionType& expr) { - if (useGpu_) { - TensorGpuApply(*this, expr); - } else { - TensorCpuApply(*this, expr); - } - } - - template - void operator+=(const ExpressionType& expr) { - (*this) = (*this) + expr; - } - template - void operator-=(const ExpressionType& expr) { - (*this) = (*this) - expr; - } - template - void operator*=(const ExpressionType& expr) { - (*this) = (*this) * expr; - } - template - void operator/=(const ExpressionType& expr) { - (*this) = (*this) / expr; - } -}; - -typedef BaseMatrixT BaseMatrix; -typedef BaseMatrixT IBaseMatrix; - -} // namespace paddle diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt deleted file mode 100644 index 3c897b5f3e09cd53ddd5b767333ce4759250da71..0000000000000000000000000000000000000000 --- a/paddle/math/CMakeLists.txt +++ /dev/null @@ -1,57 +0,0 @@ -# common package contains: -# * the utilities: -# * Thread Libs -# * Memory Manage libs -# * CommandLine Parser -# * Logging -# * Timer/Stats -# * the math libraries: -# * Matrix/Vector -# * the parameter optimizers. -# * the parameter updater functions. -# -# TODO(yuyang18): separate libs. -# -file(GLOB MATH_HEADERS . *.h) -file(GLOB MATH_SOURCES . 
*.cpp) - -if(NOT WITH_MKLDNN) - set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h") - set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp") - list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}") - list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}") - message(STATUS "Skip compiling with MKLDNNMatrix") -else() - message(STATUS "Compile with MKLDNNMatrix") -endif() - -if(MOBILE_INFERENCE) - # Remove sparse - list(REMOVE_ITEM MATH_HEADERS - ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h - ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h - ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h) - list(REMOVE_ITEM MATH_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp) -endif() -set(MATH_SOURCES - "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" - "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" - ${MATH_SOURCES}) -if(NOT WITH_GPU) - # then compile BaseMatrix.cu as c++ file - compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu") - compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu") - add_library(paddle_math STATIC - ${MATH_SOURCES}) -else() - cuda_add_library(paddle_math ${MATH_SOURCES}) -endif() - - -add_dependencies(paddle_math paddle_proto ${external_project_dependencies}) # depends -if(WITH_TESTING) - add_subdirectory(tests) -endif() diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp deleted file mode 100644 index 023450ffb794086399d7131ba5faa4dbefeaaf7d..0000000000000000000000000000000000000000 --- a/paddle/math/CpuSparseMatrix.cpp +++ /dev/null @@ -1,787 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "CpuSparseMatrix.h" -#include "SparseMatrix.h" -#include "float.h" -#include "hl_gpu.h" -#include "paddle/math/MathUtils.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH; - -CpuSparseMatrix::CpuSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(NULL, height, width, trans, false) { - resize(height, width, nnz, valueType, format); -} - -CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(dataHandle, height, width, trans, false) { - resize(height, width, nnz, valueType, format); -} - -CpuSparseMatrix::CpuSparseMatrix(real* data, - int* rows, - int* cols, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(NULL, height, width, trans, false) { - cols_ = cols; - rows_ = rows; - value_ = data; - height_ = height; - width_ = width; - elementCnt_ = nnz; - valueType_ = valueType; - format_ = format; -} - -void CpuSparseMatrix::resize(size_t newHeight, - size_t newWidth, - size_t newNnz, - SparseValueType valueType, - SparseFormat format) { - CHECK_LE(newNnz, newHeight * newWidth); - size_t newSize = 0; - if (format == SPARSE_CSR) { - newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); - } else { - newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); - } - - if (NO_VALUE != valueType) { - newSize += newNnz * sizeof(real); - } - - if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { - memoryHandle_ = std::make_shared(newSize); - } - - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newNnz; - valueType_ = valueType; - format_ = format; - sparseResize(); -} -void CpuSparseMatrix::sparseResize() { - if (format_ == SPARSE_CSR) { - rows_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf())); - cols_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - } else { - cols_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf())); - rows_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(memoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - } -} - -void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { - resize(newHeight, - newWidth, - newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth), - valueType_, - format_); -} - -MatrixPtr CpuSparseMatrix::getTranspose() { - if (!memoryHandle_ && !value_) { - MatrixPtr dest(new CpuSparseMatrix( - height_, width_, elementCnt_, valueType_, format_, true)); - return dest; - } else if (memoryHandle_) { - MatrixPtr dest(new CpuSparseMatrix( - std::dynamic_pointer_cast(memoryHandle_), - height_, - width_, - elementCnt_, - valueType_, - format_, - true)); - return dest; - } else if (value_) { - MatrixPtr dest(new CpuSparseMatrix(value_, - rows_, - cols_, - height_, - width_, - elementCnt_, - valueType_, - format_, - true)); - return dest; - } else { - return NULL; - } -} - -SparseValueType 
CpuSparseMatrix::getValueType() { return valueType_; } - -void CpuSparseMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - - if (a_ptr && b_ptr) { - CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT); - } else { - LOG(FATAL) << "not supported"; - } -} - -void CpuSparseMatrix::add3(CpuMatrix* b) { - CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK(height_ == b->getHeight()); - CHECK(width_ == b->getWidth()); - real* A = getValue(); - real* B = b->getData(); - int* cols = getCols(); - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - A[j] = B[i * width_ + cols[j]]; - } - } -} - -void CpuSparseMatrix::add3(MatrixPtr b) { - if (dynamic_cast(b.get())) { - add3(dynamic_cast(b.get())); - } else { - LOG(FATAL) << "not supported"; - } -} - -void CpuSparseMatrix::addBias(Matrix& b, real scale) { - CHECK_EQ(b.getHeight(), (size_t)1); - CHECK_EQ(width_, b.getWidth()); - real* A = getValue(); - real* B = b.getData(); - int* cols = getCols(); - size_t nnz = getElementCnt(); - for (size_t i = 0; i < nnz; i++) { - A[i] += scale * B[cols[i]]; - } -} - -template -void printBuf(std::ostream& os, T* a, size_t len, const char* name) { - os << "\n: " << name << " ["; - for (size_t i = 0; i < len; i++) { - os << a[i] << " "; - } - os << "]\n"; -} - -void CpuSparseMatrix::print(std::ostream& os) const { - size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1; - size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_; - printBuf(os, rows_, rowSize, "row"); - printBuf(os, cols_, colSize, "col"); - if (valueType_ == FLOAT_VALUE) { - printBuf(os, value_, elementCnt_, "value"); - } - return; -} - -void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { - CHECK_LT(idx, height_); - if (format_ == SPARSE_CSC) { - LOG(FATAL) << "SPARSE_CSC not supported"; - return; - } - - const int* col = getRowCols(idx); - size_t num = getColNum(idx); - if (num > 0) { - if (valueType_ == FLOAT_VALUE) { - const real* data = getRowValues(idx); - os << col[0] << ":" << data[0]; - for (size_t i = 1; i < num; ++i) { - os << " " << col[i] << ":" << data[i]; - } - } else { - os << col[0]; - for (size_t i = 1; i < num; ++i) { - os << " " << col[i]; - } - } - } - os << ";"; -} - -void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { - CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK_EQ(height_, b.getHeight()); - CHECK_EQ(width_, b.getWidth()); - real* A = getValue(); - real* B = b.getValue(); - if (b.getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - CHECK_EQ(start, b.getRowStartIdx(i)); - CHECK_EQ(end, b.getRowStartIdx(i + 1)); - for (size_t j = start; j < end; j++) { - A[j] = B[j] * c.getElement(i, cCol); - } - } - } else if (b.getValueType() == NO_VALUE) { - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - CHECK_EQ(start, b.getRowStartIdx(i)); - CHECK_EQ(end, b.getRowStartIdx(i + 1)); - for (size_t j = start; j < end; j++) { - A[j] = c.getElement(i, cCol); - } - } - } -} - -void CpuSparseMatrix::randomizeUniform() { - CHECK_LE(elementCnt_, height_ * width_); - if (valueType_ == FLOAT_VALUE) { - real* data = 
getValue(); - for (size_t i = 0; i < elementCnt_; ++i) { - *data++ = rand() / static_cast(RAND_MAX); // NOLINT - } - } - if (format_ == SPARSE_CSR) { - sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false); - } else { - sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false); - } -} - -void CpuSparseMatrix::copyFrom(std::vector& rows, - std::vector& cols, - std::vector& values) { - size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size(); - resize(height_, width_, size, valueType_, format_); - if (valueType_ == FLOAT_VALUE) { - memcpy(&value_[0], &values[0], sizeof(real) * values.size()); - } - memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size()); - memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size()); -} - -// Copy from a CpuMatrix, only supported in sparse_float_value_t -// SparseMatrix. -void CpuSparseMatrix::copyFrom(const CpuMatrix& src) { - CHECK_EQ(getHeight(), src.getHeight()); - CHECK_EQ(getWidth(), src.getWidth()); - CHECK(!src.trans_ && !trans_); - if (format_ == SPARSE_CSR) { - std::vector rows(getHeight() + 1); - std::vector cols; - std::vector values; - rows[0] = 0; - for (size_t r = 0; r < getHeight(); ++r) { - for (size_t c = 0; c < getWidth(); ++c) { - real v = src.getElement(r, c); - if (fabs(v) > FLT_EPSILON) { - cols.push_back(c); - values.push_back(v); - } - } - rows[r + 1] = values.size(); - } - copyFrom(rows, cols, values); - } else { - std::vector cols(getWidth() + 1); - std::vector rows; - std::vector values; - cols[0] = 0; - for (size_t r = 0; r < getWidth(); ++r) { - for (size_t c = 0; c < getHeight(); ++c) { - real v = src.getElement(c, r); - if (fabs(v) > FLT_EPSILON) { - rows.push_back(c); - values.push_back(v); - } - } - cols[r + 1] = values.size(); - } - copyFrom(rows, cols, values); - } -} - -MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) { - if (height == 0 && width == 0) { - height = height_; - width = width_; - } - CHECK(width && height); - if (!useGpu) { - return std::make_shared( - height, width, 0, valueType_, format_); - } else { - return std::make_shared( - height, width, elementCnt_, valueType_, format_); - } -} - -MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) { - CHECK_LE(startRow + numRows, height_); - CHECK_EQ(format_, SPARSE_CSR); - if (valueType_ == NO_VALUE) { - return std::make_shared( - nullptr, - rows_ + startRow, - cols_, - numRows, - width_, - rows_[startRow + numRows] - rows_[startRow], - valueType_, - format_, - trans_); - } else { - return std::make_shared( - value_, - rows_ + startRow, - cols_, - numRows, - width_, - rows_[startRow + numRows] - rows_[startRow], - valueType_, - format_, - trans_); - } -} - -/* mem MUST be alloced outside (memAlloc=false) */ -void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - CHECK(!memAlloc); - CpuSparseMatrix* mat = dynamic_cast(matTrans.get()); - if (format_ == SPARSE_CSR) { - /*statistic element number in each col*/ - int* colCounters = mat->getRows() + 1; - memset(colCounters, 0, sizeof(int) * width_); - for (size_t i = 0; i < elementCnt_; ++i) { - int col = cols_[i]; - colCounters[col]++; - } - /*fill mat rows */ - mat->getRows()[0] = 0; - for (size_t i = 1; i < width_ + 1; i++) { - mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i]; - } - /*fill mat values and cols*/ - std::vector colNumVec(width_, 0); - if (valueType_ == FLOAT_VALUE) { - for (size_t i = 0; i < height_; i++) { - for (int j = rows_[i]; j < rows_[i + 1]; j++) { - int colIdx = cols_[j]; - int index = 
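[Editor's note] The CSR branch of copyFrom() above also shows the storage layout used throughout CpuSparseMatrix: rows_ holds height+1 cumulative offsets, cols_ the column index of each stored element, and value_ the optional values. A condensed dense-to-CSR sketch using std::vector; CsrSketch and denseToCsr are illustrative names only.

#include <cfloat>
#include <cmath>
#include <cstddef>
#include <vector>

struct CsrSketch {                 // rows.size() == height + 1; cols/values hold nnz entries
  std::vector<int> rows, cols;
  std::vector<float> values;
};

// Illustrative dense-to-CSR conversion, following the same rule as the removed
// copyFrom(const CpuMatrix&): keep entries whose magnitude exceeds FLT_EPSILON.
CsrSketch denseToCsr(const float* dense, std::size_t height, std::size_t width) {
  CsrSketch m;
  m.rows.assign(height + 1, 0);
  for (std::size_t r = 0; r < height; ++r) {
    for (std::size_t c = 0; c < width; ++c) {
      float v = dense[r * width + c];
      if (std::fabs(v) > FLT_EPSILON) {
        m.cols.push_back(static_cast<int>(c));
        m.values.push_back(v);
      }
    }
    m.rows[r + 1] = static_cast<int>(m.values.size());  // cumulative row offsets
  }
  return m;
}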
mat->getRows()[colIdx] + colNumVec[colIdx]; - mat->getCols()[index] = i; - mat->getValue()[index] = value_[j]; - colNumVec[colIdx]++; - } - } - } else { - for (size_t i = 0; i < height_; i++) { - for (int j = rows_[i]; j < rows_[i + 1]; j++) { - int colIdx = cols_[j]; - int index = mat->getRows()[colIdx] + colNumVec[colIdx]; - mat->getCols()[index] = i; - colNumVec[colIdx]++; - } - } - } - } else { - /*statistic element number in each row*/ - int* rowCounters = mat->getCols() + 1; - memset(rowCounters, 0, sizeof(int) * height_); - for (size_t i = 0; i < elementCnt_; ++i) { - int row = rows_[i]; - rowCounters[row]++; - } - - /*fill mat cols */ - mat->getCols()[0] = 0; - for (size_t i = 1; i < height_ + 1; i++) { - mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i]; - } - /*fill mat values and rows*/ - std::vector rowNumVec(height_, 0); - if (valueType_ == FLOAT_VALUE) { - for (size_t i = 0; i < width_; i++) { - for (int j = cols_[i]; j < cols_[i + 1]; j++) { - int rowIdx = rows_[j]; - int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; - mat->getRows()[index] = i; - mat->getValue()[index] = value_[j]; - rowNumVec[rowIdx]++; - } - } - } else { - for (size_t i = 0; i < width_; i++) { - for (int j = cols_[i]; j < cols_[i + 1]; j++) { - int rowIdx = rows_[j]; - int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx]; - mat->getRows()[index] = i; - rowNumVec[rowIdx]++; - } - } - } - } -} - -void CpuSparseMatrix::setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - if (format_ == SPARSE_CSR) { - CHECK_LT(row, height_); - CHECK(NULL != cols); - if (0 == row) { - rows_[row] = 0; - } - rows_[row + 1] = rows_[row] + colNum; - for (size_t i = 0; i < colNum; ++i) { - cols_[rows_[row] + i] = cols[i]; - } - if (valueType_ == NO_VALUE) { - CHECK(!values); - } else { - for (size_t i = 0; i < colNum; ++i) { - value_[rows_[row] + i] = values[i]; - } - } - } else { - LOG(FATAL) << "not supported"; - } -} - -void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const { - if (format_ == SPARSE_CSR) { - auto nnz = getElementCnt(); - IVector::resizeOrCreate(outVec, nnz, false); - auto out = outVec->getData(); - int* rows = getRows(); - for (size_t i = 0; i < height_; i++) { - for (int j = rows[i]; j < rows[i + 1]; j++) { - out[j] = i; - } - } - } else { - LOG(FATAL) << "SPARSE_CSC not supported"; - } -} - -ThreadLocal> CpuSparseMatrix::cpuLocalMats_; - -CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height, - size_t width) { - std::vector* localMats = cpuLocalMats_.get(); - auto it = localMats->begin(); - while (it != localMats->end()) { - if (it->unique()) { - (*it)->resize(height, width, elementCnt_, valueType_, format_); - return *it; - } - } - localMats->emplace_back(std::make_shared( - height, width, elementCnt_, valueType_, format_, false)); - return localMats->back(); -} - -void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc, stream); - } else if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else { - LOG(FATAL) << "not implemented"; - } -} - -void CpuSparseMatrix::copyFrom(const Matrix& src) { - if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else if (dynamic_cast(&src)) { - auto tmpSrc = dynamic_cast(&src); - copyFrom(*tmpSrc); - } else { - LOG(FATAL) << "not 
implemented"; - } -} - -void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) { - CHECK_EQ(height_, src.getHeight()); - CHECK_EQ(width_, src.getWidth()); - CHECK_EQ(size_t(elementCnt_), src.getElementCnt()); - size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_; - if (format_ == SPARSE_CSC) - hl_memcpy_from_csc_matrix(value_, - valSize, - rows_, - elementCnt_, - cols_, - width_ + 1, - src.sMatrix_.get(), - stream); - else - hl_memcpy_from_csr_matrix(value_, - valSize, - rows_, - height_ + 1, - cols_, - elementCnt_, - src.sMatrix_.get(), - stream); -} - -void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) { - CHECK_EQ(height_, src.getHeight()); - CHECK_EQ(width_, src.getWidth()); - CHECK_EQ(format_, src.getFormat()); - int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0]; - if (format_ == SPARSE_CSR) { - size_t totalColNum = 0; - for (size_t i = 0; i < height_; ++i) { - totalColNum += src.getColNum(i); - } - resize(height_, width_, totalColNum, valueType_, format_); - rows_[0] = 0; - for (size_t i = 0; i < height_; ++i) { - rows_[i + 1] = rows_[i] + src.getColNum(i); - } - memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int)); - } else { - size_t totalColNum = 0; - for (size_t i = 0; i < width_; ++i) { - totalColNum += src.getRowNum(i); - } - resize(height_, width_, totalColNum, valueType_, format_); - cols_[0] = 0; - for (size_t i = 0; i < width_; ++i) { - cols_[i + 1] = cols_[i] + src.getRowNum(i); - } - memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int)); - } - - // if have different value type, only copy rows and cols - if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { - memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real)); - } -} - -void CpuSparseMatrix::copyRow(int offsets, - size_t colNum, - const sparse_non_value_t* row) { - for (size_t j = 0; j < colNum; j++) { - cols_[offsets + j] = row[j].col; - } -} - -void CpuSparseMatrix::copyRow(int offsets, - size_t colNum, - const sparse_float_value_t* row) { - for (size_t j = 0; j < colNum; j++) { - cols_[offsets + j] = row[j].col; - value_[offsets + j] = row[j].value; - } -} - -template -void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) { - size_t totalColNum = 0; - for (size_t i = 0; i < height_; ++i) { - int64_t id = ids[i]; - totalColNum += indices[id + 1] - indices[id]; - } - valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE; - - resize(height_, width_, totalColNum, valueType_, format_); - - rows_[0] = 0; - for (size_t i = 0; i < height_; ++i) { - int64_t id = ids[i]; - T* row = data + indices[id]; - size_t colNum = indices[id + 1] - indices[id]; - rows_[i + 1] = rows_[i] + colNum; - copyRow(rows_[i], colNum, row); - } -} - -template -void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) { - CHECK(format_ == SPARSE_CSR); - size_t totalColNum = indices[height_] - indices[0]; - valueType_ = typeid(T) == typeid(sparse_non_value_t) ? 
NO_VALUE : FLOAT_VALUE; - resize(height_, width_, totalColNum, valueType_, format_); - - rows_[0] = 0; - for (size_t i = 0; i < height_; ++i) { - T* row = data + indices[i]; - size_t colNum = indices[i + 1] - indices[i]; - rows_[i + 1] = rows_[i] + colNum; - copyRow(rows_[i], colNum, row); - } -} - -void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { - CHECK_EQ(height_, src.getHeight()); - CHECK_LE(width_, src.getWidth()); - CHECK_EQ(format_, src.getFormat()); - CHECK_EQ(valueType_, src.getValueType()); - if (format_ == SPARSE_CSR) { - int* srcCols = src.getCols(); - size_t numLessWidth = - std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) { - return n < this->width_; - }); - resize(height_, width_, numLessWidth, valueType_, format_); - rows_[0] = 0; - size_t index = 0; - for (size_t r = 0; r < height_; ++r) { - for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) { - if (srcCols[i] < static_cast(width_)) { - cols_[index] = srcCols[i]; - if (valueType_ == FLOAT_VALUE) { - value_[index] = src.getValue()[i]; - } - ++index; - } - } - rows_[r + 1] = index; - } - CHECK_EQ(index, numLessWidth); - } else { - size_t numLessWidth = src.getCols()[width_] - src.getCols()[0]; - resize(height_, width_, numLessWidth, valueType_, format_); - cols_[0] = 0; - size_t index = 0; - // note: c < width_, not src.getWidth(); - for (size_t c = 0; c < width_; ++c) { - for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) { - rows_[index] = src.getRows()[i]; - if (valueType_ == FLOAT_VALUE) { - value_[index] = src.getValue()[i]; - } - ++index; - } - cols_[c + 1] = index; - } - CHECK_EQ(index, numLessWidth); - } -} - -void CpuSparseMatrix::zeroMem() { - CHECK(valueType_ == FLOAT_VALUE); - memset(value_, 0, elementCnt_ * sizeof(real)); -} - -template void CpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - sparse_non_value_t* data); - -template void CpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - sparse_float_value_t* data); - -template void CpuSparseMatrix::copyFrom(int64_t* indices, - sparse_non_value_t* data); - -template void CpuSparseMatrix::copyFrom(int64_t* indices, - sparse_float_value_t* data); - -void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - maxVal.zeroMem(); - int* outids = maxIds.getData(); - real* outvalues = maxVal.getData(); - - typedef std::pair valuepair; - std::vector vec; - for (size_t i = 0; i < numSamples; i++) { - vec.clear(); - - auto num = getColNum(i); - auto ids = getRowCols(i); - auto values = getRowValues(i); - for (size_t j = 0; j < num; j++) { - vec.push_back(std::make_pair(values[j], ids[j])); - } - - size_t outsize = std::min(num, beam); - std::partial_sort(vec.begin(), - vec.begin() + outsize, - vec.end(), - [](const valuepair& a, const valuepair& b) { - return a.first > b.first; - }); - for (size_t j = 0; j < outsize; j++) { - outids[i * beam + j] = vec[j].second; - outvalues[i * beam + j] = vec[j].first; - } - if (outsize < beam) { - // if the number of values to sort are less than the output size, - // use -1 to indicate the end of valid sorted values. 
- outids[i * beam + outsize] = -1; - } - } -} - -} // namespace paddle diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h deleted file mode 100644 index d4a78f3e54b73add3c00e17f13d91359839d3d14..0000000000000000000000000000000000000000 --- a/paddle/math/MKLDNNMatrix.h +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "Matrix.h" -#include "mkldnn.hpp" -#include "paddle/parameter/Parameter.h" - -namespace paddle { - -class MKLDNNMatrix; -typedef std::shared_ptr MKLDNNMatrixPtr; - -#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...) \ - CHECK(MAT) << " can not be empty."; \ - CHECK(MAT->getPrimitiveDesc() == PD) \ - << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \ - << "" __VA_ARGS__; - -/** - * @brief MKLDNN Matrix. - * - */ -class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory { - public: - MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd) - : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false), - mkldnn::memory(pd, m->getData()), - m_(m) {} - - ~MKLDNNMatrix() {} - - /** - * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc - */ - static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd, - MatrixPtr m = nullptr); - - /** - * Create MKLDNNMatrix from a MatrixPtr and memory details info - */ - static MKLDNNMatrixPtr create( - mkldnn::memory::dims dims, - mkldnn::memory::format fmt, - mkldnn::engine& eg, - MatrixPtr m = nullptr, - mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); - - /** - * Create primitive descriptor. - * default with f32 dtype - */ - static mkldnn::memory::primitive_desc createPrimitiveDesc( - const mkldnn::memory::dims dims, - const mkldnn::memory::format& fmt, - const mkldnn::engine& eg, - const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { - return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); - } - - /** - * Create Memory descriptor. - * default with any format and f32 dtype - */ - static mkldnn::memory::desc createMemoryDesc( - const mkldnn::memory::dims dims, - const mkldnn::memory::format& fmt = mkldnn::memory::format::any, - const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { - return mkldnn::memory::desc(dims, dtype, fmt); - } - - /** - * Create reorder primitive. - * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst. - * checkData: whether to check the data handle of src and dst. - * if true, it will check the data and do not allow them equal; - * otherwise, it will not check them, then the reorder created - * may have inplace buffer. - * Do not set false, if you can not guarantee the inplace logical - * would work with your reorder. 
- */ - static std::shared_ptr createReorder( - const MKLDNNMatrixPtr& src, - const MKLDNNMatrixPtr& dst, - bool checkData = true); - - void copyFrom(const Matrix& src) { - // TODO(TJ): reorder data if this format is not nchw or x - m_->copyFrom(src); - } - - void copyTo(Matrix& dst) { - // TODO(TJ): reorder data if this format is not nchw or x - dst.copyFrom(*m_); - } - - public: - /** - * Reorder this MKLDNNMatrix from other format. - * Support inplace reorder. - * @note: this function would only reorder the data layout. - * will NOT change this original dim or format info - */ - void reorderDataFrom(const MKLDNNMatrixPtr& m, - memory::format srcFmt, - memory::dims targetDim); - - /** - * Reorder this MKLDNNMatrix to other format. - * Support inplace reorder. - * @note: this function would only reorder the data layout. - * will NOT change the dst dim or format info - */ - void reorderDataTo(const MKLDNNMatrixPtr& m, - memory::format dstFmt, - memory::dims targetDim); - - /** - * Dimensionality reduction. - * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1 - */ - void downSpatial(); - - /** - * set the memory data handle. - * Caution: This will not check the buffer size of the data, - * it should be coverd by user. - */ - void setData(real* data) { - set_data_handle(data); - CpuMatrix::setData(data); - m_.reset(); - } - - /** - * override the CpuMatrix::resize - */ - void resize(size_t newHeight, size_t newWidth) override { - m_->resize(newHeight, newWidth); - if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) { - return; - } - CpuMatrix::setData(data_); - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newHeight * newWidth; - stride_ = width_; - auto pd = mkldnn::memory::primitive_desc( - mkldnn::memory::desc({(int)newHeight, (int)newWidth}, - getDtype(), - mkldnn::memory::format::nc), - getEngine()); - resetMKLDNNMemory(pd, data_); - } - - /** - * override Matrix::getData - * check data before return - */ - real* getData() override { - CHECK_EQ((void*)data_, get_data_handle()); - return data_; - } - - const real* getData() const override { - CHECK_EQ((void*)data_, get_data_handle()); - return data_; - } - - /** - * Get primitive descriptor. - */ - mkldnn::memory::primitive_desc getPrimitiveDesc() { - return this->get_primitive_desc(); - } - - /** - * Get memory descriptor. - */ - mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); } - - /** - * Get dimensions. - */ - mkldnn::memory::dims getDims() { - mkldnn::memory::desc md = getMemoryDesc(); - const int* src = md.data.dims; - int ndims = md.data.ndims; - mkldnn::memory::dims dst; - dst.resize(ndims); - for (int i = 0; i < ndims; ++i) { - dst[i] = src[i]; - } - return dst; - } - - /** - * Get format. - */ - mkldnn::memory::format getFormat() { - return (mkldnn::memory::format)(getMemoryDesc().data.format); - } - - /** - * Get memory data type. - */ - mkldnn::memory::data_type getDtype() { - return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type); - } - - /** - * Get engine. - */ - mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); } - - protected: - /** - * Do reorder once. - * Can support inplace. 
- */ - void reorderOnce(void* srcData, - void* dstData, - memory::format srcFmt, - memory::format dstFmt, - memory::dims dm); - /** - * reset this MKLDNN Memory from primitve desc - */ - void resetMKLDNNMemory(memory::primitive_desc pd, real* data) { - mkldnn_primitive_t result; - mkldnn::error::wrap_c_api( - mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr), - "could not create a memory primitive"); - reset(result); - set_data_handle(data); - } - - private: - // save the CpuMatrixPtr in case the buffer released outside - CpuMatrixPtr m_; -}; - -} // namespace paddle diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp deleted file mode 100644 index f48119aa511578b21602a225277f01b4c6a9e9a8..0000000000000000000000000000000000000000 --- a/paddle/math/MathFunctions.cpp +++ /dev/null @@ -1,348 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/math/MathFunctions.h" -#include "hl_matrix_apply.cuh" -#include "hl_matrix_ops.cuh" -#include "paddle/utils/DynamicLoader.h" - -namespace dynload { - -std::once_flag lapack_dso_flag; -void* lapack_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load lapack routine - * via operator overloading. - * - * note: default dynamic linked libs - */ - -// The argument for stringizing operator is not macro-expanded first. -// We have to use two levels of macro to do the expansion. -// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html -#define STR(x) #x - -// clang-format off -#ifndef LAPACK_FOUND -#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using lapack_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \ - void* p_##__name = dlsym(lapack_dso_handle, STR(__name)); \ - CHECK(p_##__name) << "Cannot find symbol " << STR(__name) \ - << " in liblapack.so"; \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; // struct DynLoad__##__name -#else -#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
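[Editor's note] The DYNAMIC_LOAD_LAPACK_WRAP machinery above resolves each LAPACK routine from a shared library the first time it is called. Below is a de-macroed sketch of the same pattern for a single routine; dlopen/dlsym and std::call_once are standard, but the library name and the exact LAPACKE_sgetrf signature are assumptions made for illustration.

#include <dlfcn.h>
#include <mutex>
#include <stdexcept>

// Illustrative, de-macroed version of the lazy dynamic-loading pattern above.
// Assumed signature for LAPACKE_sgetrf: (layout, m, n, a, lda, ipiv) -> int.
using sgetrf_t = int (*)(int, int, int, float*, int, int*);

int callSgetrf(int layout, int m, int n, float* a, int lda, int* ipiv) {
  static std::once_flag flag;
  static void* handle = nullptr;
  static sgetrf_t fn = nullptr;
  std::call_once(flag, [] {
    handle = dlopen("liblapacke.so", RTLD_LAZY);  // assumed library name
    if (handle) {
      fn = reinterpret_cast<sgetrf_t>(dlsym(handle, "LAPACKE_sgetrf"));
    }
  });
  if (!fn) throw std::runtime_error("Cannot find symbol LAPACKE_sgetrf");
  return fn(layout, m, n, a, lda, ipiv);
}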
args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - } __name; // struct DynLoad__##__name -#endif - -#define PADDLE_SGETRF LAPACKE_sgetrf -#define PADDLE_DGETRF LAPACKE_dgetrf -#define PADDLE_SGETRI LAPACKE_sgetri -#define PADDLE_DGETRI LAPACKE_dgetri - -#define LAPACK_ROUTINE_EACH(__macro) \ - __macro(PADDLE_SGETRF) \ - __macro(PADDLE_DGETRF) \ - __macro(PADDLE_SGETRI) \ - __macro(PADDLE_DGETRI) -// clang-format on - -LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) - -} // namespace dynload - -namespace paddle { - -#ifndef PADDLE_USE_EIGEN_FOR_BLAS -template <> -void gemm(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, - const int M, - const int N, - const int K, - const float alpha, - const float* A, - const int lda, - const float* B, - const int ldb, - const float beta, - float* C, - const int ldc) { - cblas_sgemm(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -void gemm(const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, - const int M, - const int N, - const int K, - const double alpha, - const double* A, - const int lda, - const double* B, - const int ldb, - const double beta, - double* C, - const int ldc) { - cblas_dgemm(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} -#endif - -template <> -int getrf(const CBLAS_ORDER order, - const int M, - const int N, - float* A, - const int lda, - int* ipiv) { - return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv); -} - -template <> -int getrf(const CBLAS_ORDER order, - const int M, - const int N, - double* A, - const int lda, - int* ipiv) { - return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv); -} - -template <> -int getri(const CBLAS_ORDER order, - const int N, - float* A, - const int lda, - const int* ipiv) { - return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv); -} - -template <> -int getri(const CBLAS_ORDER order, - const int N, - double* A, - const int lda, - const int* ipiv) { - return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); -} - -#ifndef PADDLE_USE_EIGEN_FOR_BLAS -template <> -void axpy(const int n, const float alpha, const float* x, float* y) { - cblas_saxpy(n, alpha, x, 1, y, 1); -} - -template <> -void axpy(const int n, const double alpha, const double* x, double* y) { - cblas_daxpy(n, alpha, x, 1, y, 1); -} - -template <> -float dotProduct(const int n, const float* x, const float* y) { - return cblas_sdot(n, x, 1, y, 1); -} - -template <> -double dotProduct(const int n, const double* x, const double* y) { - return cblas_ddot(n, x, 1, y, 1); -} -#endif - -#if defined(PADDLE_WITH_MKLML) - -template <> -void vExp(const int n, const float* a, float* r) { - vsExp(n, a, r); -} - -template <> -void vExp(const int n, const double* a, double* r) { - vdExp(n, a, r); -} - -template <> -void vPow(const int n, const float* a, const float b, float* r) { - vsPowx(n, a, b, r); -} - -template <> -void vPow(const int n, const double* a, const double b, double* r) { - vdPowx(n, a, b, r); -} - -template <> -void vLog(const int n, const float* a, float* r) { - vsLn(n, a, r); -} - -template <> -void vLog(const int n, const double* a, double* r) { - vdLn(n, a, r); -} - -template <> -void vAdd(const int n, const float* a, const float* b, float* r) { - vsAdd(n, a, b, r); -} - -template <> -void vAdd(const int n, const double* a, const double* b, double* r) { - vdAdd(n, a, b, r); -} - -template <> -void vTanh(const int n, const float* a, float* r) { - vsTanh(n, a, 
r); -} - -template <> -void vTanh(const int n, const double* a, double* r) { - vdTanh(n, a, r); -} - -template <> -void vInvSqrt(const int n, const float* a, float* r) { - vsInvSqrt(n, a, r); -} - -template <> -void vInvSqrt(const int n, const double* a, double* r) { - vdInvSqrt(n, a, r); -} - -template <> -void vLog1p(const int n, const float* a, float* r) { - vsLog1p(n, a, r); -} - -template <> -void vLog1p(const int n, const double* a, double* r) { - vdLog1p(n, a, r); -} -#else - -DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); -template -void vExp(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vExp(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); -template -void vLog(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vLog(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); -template -void vPow(const int n, const T* a, const T b, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vPow(b), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); -template -void vAdd(const int n, const T* a, const T* b, T* r) { - hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), - const_cast(a), - const_cast(b), - r, - 1, - n, - n, - n, - n); -} - -DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); -template -void vInvSqrt(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a)); -template -void vLog1p(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vLog1p(), const_cast(a), r, 1, n, n, n); -} - -DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template -void vTanh(const int n, const T* a, T* r) { - hl_cpu_apply_binary_op, 0, 0>( - binary::vTanh(), const_cast(a), r, 1, n, n, n); -} - -template void vExp(const int n, const float* a, float* r); -template void vExp(const int n, const double* a, double* r); -template void vLog(const int n, const float* a, float* r); -template void vLog(const int n, const double* a, double* r); -template void vPow(const int n, const float* a, const float b, float* r); -template void vPow(const int n, const double* a, const double b, double* r); -template void vAdd(const int n, const float* a, const float* b, float* r); -template void vAdd(const int n, const double* a, const double* b, double* r); -template void vInvSqrt(const int n, const double* a, double* r); -template void vInvSqrt(const int n, const float* a, float* r); -template void vLog1p(const int n, const float* a, float* r); -template void vLog1p(const int n, const double* a, double* r); -template void vTanh(const int n, const float* a, float* r); -template void vTanh(const int n, const double* a, double* r); -#endif -} // namespace paddle diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp deleted file mode 100644 index b2afdbcd51a3cf5d3e6f3e2bb14902bf78fe68c8..0000000000000000000000000000000000000000 --- a/paddle/math/MathUtils.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
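When MKL's vector math (vsExp/vdExp and friends) is unavailable, the removed fallback expresses the same element-wise operations through DEFINE_MATRIX_BINARY_OP and hl_cpu_apply_binary_op. A sketch of the semantics, assuming nothing beyond the standard library (vExpFallback is an illustrative name, not Paddle code):

#include <cmath>
#include <cstdio>
#include <vector>

// r[i] = exp(a[i]) element by element; the macro machinery in the deleted
// code is just one way of spelling this loop.
template <class T>
void vExpFallback(int n, const T* a, T* r) {
  for (int i = 0; i < n; ++i) r[i] = std::exp(a[i]);
}

int main() {
  std::vector<float> a = {0.0f, 1.0f, 2.0f}, r(3);
  vExpFallback(3, a.data(), r.data());
  std::printf("%.4f %.4f %.4f\n", r[0], r[1], r[2]);  // 1.0000 2.7183 7.3891
  return 0;
}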
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "MathUtils.h" -#include -#include "Vector.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -/*if csc, major is cols and minor is rows, else - * major is rows and minor is cols, according to - * major value to initialize minor value" - */ -void sparseRand( - int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { - CHECK(size_t(nnz) >= size_t(1)); - int* cpuMajor; - int* cpuMinor; - CpuIVector cpuMinorVec(nnz); - CpuIVector cpuMajorVec(majorLen); - if (useGpu) { - cpuMajor = cpuMajorVec.getData(); - cpuMinor = cpuMinorVec.getData(); - } else { - cpuMajor = major; - cpuMinor = minor; - } - - /*major value init*/ - for (int i = 0; i < majorLen - 1; i++) { - cpuMajor[i] = 1.0 * i * nnz / (majorLen - 1); - } - cpuMajor[majorLen - 1] = nnz; - - /*minor value init according to major value*/ - std::vector used(minorMax, 0); - for (int i = 0; i < majorLen - 1; i++) { - CHECK_LE(cpuMajor[i + 1] - cpuMajor[i], minorMax); - used.assign(minorMax, 0); - for (int j = cpuMajor[i]; j < cpuMajor[i + 1]; j++) { - int idx = ::rand() % minorMax; - while (used[idx]) { - idx = ::rand() % minorMax; - } - cpuMinor[j] = idx; - used[idx] = 1; - } - std::sort(cpuMinor + cpuMajor[i], - cpuMinor + cpuMajor[i + 1], - [](int a, int b) { return a < b; }); - } - /*memcpy result to gpu*/ - if (useGpu) { - hl_memcpy_host2device(major, cpuMajor, sizeof(int) * majorLen); - hl_memcpy_host2device(minor, cpuMinor, sizeof(int) * nnz); - } -} - -int outputSize( - int imageSize, int filterSize, int padding, int stride, bool caffeMode) { - int outputSize; - if (!caffeMode) { - outputSize = - (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1; - } else { - outputSize = (imageSize - filterSize + 2 * padding) / stride + 1; - } - CHECK_GE(outputSize, 1); - return outputSize; -} - -int imageSize( - int outputSize, int filterSize, int padding, int stride, bool caffeMode) { - int imageSize; - if (!caffeMode) { - imageSize = - (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1; - } else { - imageSize = (outputSize - 1) * stride + filterSize - 2 * padding; - } - CHECK_GE(imageSize, 1); - return imageSize; -} - -} // namespace paddle diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp deleted file mode 100644 index bcd6dfe1fda6b1243007b0c26a6e0087eedcc10c..0000000000000000000000000000000000000000 --- a/paddle/math/Matrix.cpp +++ /dev/null @@ -1,4787 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
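The removed MathUtils.cpp carries the convolution/pooling size arithmetic used throughout the legacy layers: with caffeMode the output extent is floor((imageSize - filterSize + 2*padding)/stride) + 1, and without it the quotient is rounded up by adding stride - 1 first; imageSize() inverts the same relation. A small sketch under those formulas (outputSizeSketch is an illustrative name):

#include <cstdio>

// caffeMode uses floor division; the non-caffe branch rounds the quotient up.
int outputSizeSketch(int imageSize, int filterSize, int padding, int stride,
                     bool caffeMode) {
  return caffeMode
             ? (imageSize - filterSize + 2 * padding) / stride + 1
             : (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
}

int main() {
  // 224 input, 7x7 filter, padding 3, stride 2: 112 with caffeMode, 113 without.
  std::printf("%d %d\n", outputSizeSketch(224, 7, 3, 2, true),
              outputSizeSketch(224, 7, 3, 2, false));
  return 0;
}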
*/ - -#include "Matrix.h" -#include "MathFunctions.h" -#include "SparseMatrix.h" -#include "SparseRowMatrix.h" - -#include -#include -#include - -#include -#include "hl_cnn.h" -#include "hl_gpu.h" -#include "hl_table_apply.h" -#include "hl_top_k.h" -#include "paddle/utils/Logging.h" - -#include "NEONFunctions.h" -#include "paddle/function/GemmFunctor.h" -#include "paddle/utils/ThreadLocal.h" - -#include "SIMDFunctions.h" - -namespace paddle { - -inline real _pow(real a, real beta) { return std::pow(a, beta); } - -inline real _square(real a) { return a * a; } - -inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; } - -Matrix::Matrix(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans, - bool use_gpu) - : BaseMatrix( - height, - width, - memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, - trans, - use_gpu) { - elementCnt_ = width * height; - memoryHandle_ = memHandle; -} - -Matrix::Matrix( - real* data, size_t height, size_t width, bool trans, bool use_gpu) - : BaseMatrix(height, width, data, trans, use_gpu) { - elementCnt_ = width * height; -} - -Matrix::Matrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool use_gpu) - : BaseMatrix(height, width, stride, data, trans, use_gpu) { - elementCnt_ = width * height; -} - -MatrixPtr Matrix::createSparseMatrix(real* data, - int* row, - int* col, - size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared( - data, row, col, height, width, nnz, valueType, format, trans); - } else { - return std::make_shared( - data, row, col, height, width, nnz, valueType, format, trans); - } -} - -MatrixPtr Matrix::createSparseMatrix(size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared( - height, width, nnz, valueType, format, trans); - } else { - return std::make_shared( - height, width, nnz, valueType, format, trans); - } -} - -MatrixPtr Matrix::create(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans) { - if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { - return std::make_shared(gpuHandle, height, width, trans); - } else if (auto cpuHandle = - std::dynamic_pointer_cast(memHandle)) { - return std::make_shared(cpuHandle, height, width, trans); - } else { - LOG(FATAL) << "Wrong"; - return nullptr; - } -} - -MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) { - if (useGpu) { - return std::make_shared(height, width, trans); - } else { - return std::make_shared(height, width, trans); - } -} - -MatrixPtr Matrix::create( - real* data, size_t height, size_t width, bool trans, bool useGpu) { - if (useGpu) { - return std::make_shared(data, height, width, trans); - } else { - return std::make_shared(data, height, width, trans); - } -} - -MatrixPtr Matrix::create(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared(data, height, width, stride, trans); - } else { - return std::make_shared(data, height, width, stride, trans); - } -} - -MatrixPtr Matrix::createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - bool trans, - bool useGpu) { - if (useGpu) { - return std::make_shared( - height, width, nnz, 
valueType, SPARSE_CSR, trans); - } else { - return std::make_shared( - height, width, nnz, valueType, SPARSE_CSR, trans); - } -} - -void Matrix::resizeOrCreate( - MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) { - if (!matrix) { - matrix = Matrix::create(height, width, trans, useGpu); - } else { - CHECK_EQ(matrix->useGpu(), useGpu); - matrix->resize(height, width); - } -} - -void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans, - bool useGpu) { - if (!matrix) { - matrix = Matrix::createSparseMatrix( - height, width, nnz, valueType, format, trans, useGpu); - } else { - CHECK(dynamic_cast(matrix.get()) || - dynamic_cast(matrix.get())); - CHECK_EQ(matrix->useGpu(), useGpu); - matrix->resize(height, width, nnz, valueType, format); - } -} - -void Matrix::reshape(size_t height, size_t width) { - CHECK(isContiguous()); - CHECK(height_ * width_ == height * width); - height_ = height; - width_ = width; - stride_ = width_; -} - -MatrixPtr Matrix::subMatrix(size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol) { - CHECK_LE(startRow, endRow); - CHECK_LE(endRow, getHeight()); - CHECK_LE(startCol, endCol); - CHECK_LE(endCol, getWidth()); - - return Matrix::create(getData() + startRow * getStride() + startCol, - endRow - startRow, - endCol - startCol, - getStride(), - trans_, - useGpu_); -} - -void Matrix::setDiag(real value) { - CHECK(data_ != NULL); - CHECK_EQ(height_, width_); - - zeroMem(); - BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_); - diag.assign(value); -} - -GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans) - : Matrix(std::make_shared(height * width * sizeof(real)), - height, - width, - trans, - true) {} - -GpuMatrix::~GpuMatrix() {} - -void GpuMatrix::zeroMem() { - CHECK(data_ != NULL); - zero(); -} - -void GpuMatrix::resetOne() { - CHECK(data_ != NULL); - one(); -} - -void GpuMatrix::resize(size_t newHeight, size_t newWidth) { - size_t newSize = newHeight * newWidth; - if (NULL == memoryHandle_.get() || - newSize * sizeof(real) > memoryHandle_->getAllocSize()) { - memoryHandle_ = std::make_shared(newSize * sizeof(real)); - data_ = reinterpret_cast(memoryHandle_->getBuf()); - } - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newSize; - stride_ = width_; -} - -real GpuMatrix::getElement(size_t x, size_t y) const { - real elem = 0; - hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real)); - return elem; -} - -real GpuMatrix::getSum() { - CHECK(isContiguous()); - real sum = 0.0f; - hl_vector_sum(data_, &sum, height_ * width_); - return sum; -} - -real GpuMatrix::getMin() { - CHECK(isContiguous()); - auto vec = GpuVector(height_ * width_, data_); - return vec.getMin(); -} - -real GpuMatrix::getMax() { - CHECK(isContiguous()); - auto vec = GpuVector(height_ * width_, data_); - return vec.getMax(); -} - -void GpuMatrix::accumulateColSum(Matrix& src) { - CHECK_EQ(getWidth(), src.getWidth()); - CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, 1.0, 1.0); -} - -real GpuMatrix::getAbsSum() { - CHECK(isContiguous()); - real sum = 0.0f; - hl_vector_abs_sum(data_, &sum, height_ * width_); - return sum; -} - -void GpuMatrix::copyFrom(const Matrix& src) { - CHECK(isContiguous()); - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - - if (typeid(src) == typeid(CpuMatrix)) { - hl_memcpy_host2device( - data_, const_cast(src.getData()), sizeof(real) * elementCnt_); - } else if 
(typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_device2device( - data_, const_cast(src.getData()), sizeof(real) * elementCnt_); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - CHECK(isContiguous()); - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_async(this->getData(), - const_cast(src.getData()), - sizeof(real) * elementCnt_, - stream); -} - -void GpuMatrix::copyFrom(const real* hostSrc, size_t size) { - CHECK(isContiguous()); - CHECK(size <= elementCnt_); - hl_memcpy_host2device(data_, const_cast(hostSrc), sizeof(real) * size); -} - -void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) { - LOG(FATAL) << "not implemented"; -} - -void GpuMatrix::copyFrom(const IVector& src) { - CHECK(isContiguous()); - CpuMatrix matrix(src.getSize(), 1, false); - matrix.copyFrom(src); - copyFrom(matrix); -} - -void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(b.getWidth(), width); - real* dst = getData(); - real* src = b.getData(); - const int* index = rowIndex.getData(); - hl_sequence2batch_copy(dst, src, index, width, height, true); -} - -MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) { - CHECK(isContiguous()); - - if (height == 0 && width == 0) { - height = height_; - width = width_; - } - - CHECK(width && height); - - if (useGpu) { - return std::make_shared(height, width); - } else { - return std::make_shared(height, width); - } -} - -MatrixPtr GpuMatrix::getTranspose() { - if (memoryHandle_.get() != NULL) { - MatrixPtr copy_T( - new GpuMatrix(std::dynamic_pointer_cast(memoryHandle_), - height_, - width_, - true)); - return copy_T; - } else { - MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true)); - return copy_T; - } -} - -void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - if (memAlloc) { - matTrans = std::make_shared(width_, height_); - } else { - CHECK(matTrans != NULL); - CHECK_EQ(matTrans->getHeight(), width_); - CHECK_EQ(matTrans->getWidth(), height_); - } - real* dataTrans = matTrans->getData(); - real* data = getData(); - int lda = getStride(); - int ldc = matTrans->getStride(); - - hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc); -} - -void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { - if (memAlloc) { - matRot = std::make_shared(width_, height_); - } else { - CHECK(matRot != NULL); - CHECK_EQ(matRot->getHeight(), width_); - CHECK_EQ(matRot->getWidth(), height_); - } - - real* dataRot = matRot->getData(); - real* data = getData(); - hl_matrix_rotate(data, dataRot, height_, width_, clockWise); -} - -MatrixPtr GpuMatrix::getInverse() { - MatrixPtr matInv; - inverse(matInv, true); - return matInv; -} - -void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { - CHECK_EQ(height_, width_); - - if (memAlloc) { - matInv = std::make_shared(height_, width_); - } else { - CHECK(matInv != NULL); - } - - real* data = getData(); - real* dataInv = matInv->getData(); - int lda = getStride(); - int ldc = matInv->getStride(); - - hl_matrix_inverse(data, dataInv, height_, lda, ldc); -} - -void GpuMatrix::addBias(Matrix& b, real scale) { - CHECK(b.getHeight() == 1) << "the Bias should be a vector"; - BaseMatrix::addBias(b, scale); -} - -void GpuMatrix::addSharedBias(Matrix& b, real scale) { - CHECK(b.getHeight() == 1) << "the Bias should be a vector"; - CHECK_LE(b.getWidth(), getWidth()); - CHECK_EQ(getWidth() % 
b.getWidth(), 0UL); - hl_matrix_add_shared_bias( - getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale); -} - -void GpuMatrix::collectBias(Matrix& a, real scale) { -#ifdef PADDLE_WITH_CUDA - CHECK_EQ(getHeight(), (size_t)1); - CHECK_EQ(width_, a.getWidth()); - GpuSparseMatrix* sMatPtr = dynamic_cast(&a); - if (!sMatPtr) { - sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); - } else { - real* data = getData(); - hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); - hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); - } -#endif -} - -void GpuMatrix::collectSharedBias(Matrix& a, real scale) { - CHECK_EQ(getHeight(), (size_t)1); - CHECK_EQ(a.getWidth() % getWidth(), 0UL); - hl_matrix_collect_shared_bias( - getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale); -} - -void GpuMatrix::sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - - hl_sequence_avg_forward(dst, src, starts, height, width, mode); -} - -void GpuMatrix::sequenceAvgBackward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = a.getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - - hl_sequence_avg_backward(dst, src, starts, height, width, mode); -} - -/* this = scaleAB*(a*b) + scaleT*this */ -void GpuMatrix::mul(const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - - if (!a.isTransposed() && !b.isTransposed()) { - CHECK_EQ(width_, b.width_); - CHECK_EQ(height_, a.height_); - CHECK_EQ(a.width_, b.height_); - } else if (a.isTransposed() && !b.isTransposed()) { - CHECK_EQ(width_, b.width_); - CHECK_EQ(height_, a.width_); - CHECK_EQ(a.height_, b.height_); - } else if (!a.isTransposed() && b.isTransposed()) { - CHECK_EQ(width_, b.height_); - CHECK_EQ(height_, a.height_); - CHECK_EQ(a.width_, b.width_); - } else { - LOG(FATAL) << "Is not supported"; - } - - real* A_d = a.data_; - real* B_d = b.data_; - real* C_d = data_; - int dimM = getHeight(); - int dimN = getWidth(); - int dimK = !a.isTransposed() ? a.width_ : a.height_; - int lda = a.getStride(); - int ldb = b.getStride(); - int ldc = getStride(); - hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; - hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T; - - hl_matrix_mul(A_d, - transa, - B_d, - transb, - C_d, - dimM, - dimN, - dimK, - scaleAB, - scaleT, - lda, - ldb, - ldc); -} - -void GpuMatrix::mul(const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT) { -#ifdef PADDLE_WITH_CUDA - CHECK(isContiguous()); - CHECK(b.isContiguous()); - CHECK(b.useGpu_ == true) << "Matrix type are not equal"; - CHECK(!trans_ && !b.trans_) << "not supported"; - - if (!a.trans_) { - CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) - << "Matrix dimensions are not equal"; - } else { - CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_) - << "Matrix dimensions are not equal"; - } - hl_trans_op_t transA = a.trans_ ? 
HPPL_OP_T : HPPL_OP_N; - hl_sparse_matrix_s A_d = a.sMatrix_.get(); - real* B_d = b.data_; - real* C_d = data_; - hl_matrix_csr_mul_dense(A_d, - transA, - B_d, - HPPL_OP_N, - C_d, - height_, - width_, - b.height_, - scaleAB, - scaleT); -#endif -} - -void GpuMatrix::mul(const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT) { -#ifdef PADDLE_WITH_CUDA - CHECK(isContiguous()); - CHECK(a.isContiguous()); - CHECK(a.useGpu_ == true) << "Matrix type are not equal"; - - hl_sparse_matrix_s B_d = b.sMatrix_.get(); - real* A_d = a.data_; - real* C_d = data_; - hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N; - if (!b.trans_) { - CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_) - << "Matrix dimensions are not equal"; - } else { - CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_) - << "Matrix dimensions are not equal"; - } - if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(A_d, - HPPL_OP_N, - B_d, - transB, - C_d, - height_, - width_, - a.width_, - scaleAB, - scaleT); - } else { - hl_matrix_dense_mul_csr(A_d, - HPPL_OP_N, - B_d, - transB, - C_d, - height_, - width_, - a.width_, - scaleAB, - scaleT); - } -#endif -} - -/* this = a*b */ -void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); } - -void GpuMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - const auto a_ptr_s = dynamic_cast(&a); - const auto b_ptr_s = dynamic_cast(&b); - - if (a_ptr && b_ptr) { - mul(*a_ptr, *b_ptr, scaleAB, scaleT); - } else if (a_ptr_s && b_ptr) { - mul(*a_ptr_s, *b_ptr, scaleAB, scaleT); - } else if (a_ptr && b_ptr_s) { - mul(*a_ptr, *b_ptr_s, scaleAB, scaleT); - } else { - LOG(FATAL) << "Not supported"; - } -} - -/* this = this* b */ -void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); } - -/* this = scaleAB*(this*b) + scaleT*this */ -void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { - CHECK(dynamic_cast(&b)); - CHECK(!isTransposed()) << "Not supported"; - CHECK(!b.isTransposed()) << "Not supported"; - mul(*this, *dynamic_cast(&b), scaleAB, scaleT); -} - -/* this = a*this */ -void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); } - -/* this = scaleAB*(a*this) + scaleT*this */ -void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { - CHECK(dynamic_cast(&a)); - CHECK(!isTransposed()) << "Not supported"; - CHECK(!a.isTransposed()) << "Not supported"; - mul(*dynamic_cast(&a), *this, scaleAB, scaleT); -} - -void GpuMatrix::selectRows(Matrix& table, IVector& ids) { -#ifdef PADDLE_WITH_CUDA - CHECK(dynamic_cast(&table)); - CHECK(table.useGpu()); - CHECK(ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - hl_matrix_select_rows(a, - stride_, - table.getData(), - table.stride_, - index, - numSamples, - tableSize, - dim); -#endif -} - -void GpuMatrix::addToRows(Matrix& table, IVector& ids) { -#ifdef PADDLE_WITH_CUDA - CHECK(dynamic_cast(&table)); - CHECK(table.useGpu()); - CHECK(ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - hl_matrix_add_to_rows(table.getData(), - 
table.stride_, - a, - stride_, - index, - numSamples, - tableSize, - dim); -#endif -} - -void GpuMatrix::colMerge(Matrix& src) { - CHECK(src.height_ == height_); - if (!trans_ && !src.trans_) { - sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0); - } else { - LOG(FATAL) << "Is not supported"; - } -} - -void GpuMatrix::rowSum(Matrix& sum) { - CHECK_EQ(sum.getHeight(), getHeight()); - CHECK_EQ(sum.getWidth(), (size_t)1); - - sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); -} - -void GpuMatrix::rowMax(Matrix& max) { - CHECK_EQ(max.getHeight(), getHeight()); - CHECK_EQ(max.getWidth(), (size_t)1); - - max.maxRows(*this); -} - -void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { -#ifdef PADDLE_WITH_CUDA - CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - CHECK_EQ(maxVal.getWidth(), beam); - - hl_matrix_top_k(maxVal.getData(), - maxVal.getStride(), - maxIds.getData(), - this->getData(), - this->getStride(), - this->getWidth(), - beam, - numSamples); -#endif -} - -void GpuMatrix::colMax(Matrix& max) { - CHECK_EQ(max.getWidth(), getWidth()); - CHECK_EQ(max.getHeight(), (size_t)1); - - max.maxCols(*this); -} - -void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { - LOG(FATAL) << "Is not supported"; -} - -void GpuMatrix::maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast(&a)); - CHECK(dynamic_cast(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = getWidth(); - size_t batchSize = getHeight(); - const real* input = a.getData(); - real* output = getData(); - int* idForGpu = id.getData(); - - hl_maxout_forward( - input, output, idForGpu, batchSize, size, size / channels, groups); -} - -void GpuMatrix::maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast(&a)); - CHECK(dynamic_cast(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = a.getWidth(); - size_t batchSize = getHeight(); - real* input = getData(); - const real* output = a.getData(); - const int* idForGpu = id.getData(); - - hl_maxout_backward( - input, output, idForGpu, batchSize, size, size / channels, groups); -} - -/*calulate the error of classification */ -void GpuMatrix::classificationError(Matrix& output, - IVector& label, - size_t topkSize) { - auto gpuOutput = dynamic_cast(&output); - auto gpuLabel = dynamic_cast(&label); - size_t numSamples = this->getHeight(); - GpuMatrixPtr gpuTopVal = std::make_shared(numSamples, topkSize); - GpuIVectorPtr gpuTopIds = std::make_shared(numSamples * topkSize); - - CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer"; - CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed"; - CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal"; - CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1) - << "Matrix dimensions are not equal"; - - size_t dim = gpuOutput->getWidth(); - hl_matrix_classification_error(gpuTopVal->getData(), - gpuTopVal->getStride(), - gpuTopIds->getData(), - gpuOutput->getData(), - gpuOutput->getStride(), - dim, - topkSize, - numSamples, - gpuLabel->getData(), - this->getData()); -} - -/* copy -log(output[i * width + label]) to this->data[i] */ -void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { - GpuMatrix* output_ptr = dynamic_cast(&output); - GpuIVector* label_ptr = dynamic_cast(&label); - - 
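GpuMatrix::rowMax(maxIds, maxVal), deleted above, delegates to hl_matrix_top_k: for each row it records the `beam` largest values and their column indices, which classificationError then compares against the label. A CPU sketch of that per-row top-k, assuming only the standard library (rowTopK is an illustrative stand-in, not the GPU kernel):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// For each row, emit the `beam` largest entries and their column indices.
void rowTopK(const float* mat, int height, int width, int beam,
             std::vector<int>* ids, std::vector<float>* vals) {
  ids->resize(height * beam);
  vals->resize(height * beam);
  std::vector<int> cols(width);
  for (int i = 0; i < height; ++i) {
    const float* row = mat + i * width;
    std::iota(cols.begin(), cols.end(), 0);
    std::partial_sort(cols.begin(), cols.begin() + beam, cols.end(),
                      [row](int a, int b) { return row[a] > row[b]; });
    for (int k = 0; k < beam; ++k) {
      (*ids)[i * beam + k] = cols[k];
      (*vals)[i * beam + k] = row[cols[k]];
    }
  }
}

int main() {
  const float mat[] = {0.1f, 0.7f, 0.2f,    // row 0
                       0.9f, 0.0f, 0.05f};  // row 1
  std::vector<int> ids;
  std::vector<float> vals;
  rowTopK(mat, /*height=*/2, /*width=*/3, /*beam=*/2, &ids, &vals);
  std::printf("row 0: col %d = %.2f\n", ids[0], vals[0]);  // col 1 = 0.70
  return 0;
}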
CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; - - CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_) - << "Matrix dimensions are not equal"; - - real* A_d = output_ptr->data_; - real* C_d = data_; - int* label_d = label_ptr->getData(); - - hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_); -} - -/* calculate the error of outputV according to label */ -void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { - GpuMatrix* output_ptr = dynamic_cast(&outputV); - GpuIVector* label_ptr = dynamic_cast(&label); - - CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; - - CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_) - << "Matrix dimensions are not equal"; - - real* output_d = output_ptr->data_; - real* grad_d = data_; - int* label_d = label_ptr->getData(); - - hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_); -} - -void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; -} - -void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; -} - -void GpuMatrix::softmax(Matrix& output) { - CHECK(output.useGpu()) << "Matrix type are not equal"; - - size_t height = getHeight(); - size_t width = getWidth(); - CHECK(height == output.getHeight() && width == output.getWidth()) - << "Matrix dimensions are not equal"; - - real* inputData = getData(); - real* outputData = output.getData(); - hl_matrix_softmax(inputData, outputData, height, width); -} - -void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output.getWidth(), 1UL); - CHECK(isContiguous()); - - real* inputData = getData(); - real* outputData = output.getData(); - auto starts = index.getData(); - int numSequences = index.getSize() - 1; - hl_sequence_softmax_forward(inputData, outputData, starts, numSequences); -} - -void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true) - << "Matrix type are not equal"; - - CHECK(height_ == output.height_ && width_ == output.width_ && - height_ == sftmaxSum.height_) - << "Matrix dimensions are not equal"; - - real* output_d = output.data_; - real* sftmaxSum_d = sftmaxSum.data_; - real* grad_d = data_; - hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_); -} - -void GpuMatrix::softmaxBackward(Matrix& outputV) { - CHECK(outputV.useGpu()) << "Matrix type are not equal"; - - size_t height = getHeight(); - size_t width = getWidth(); - CHECK(height == outputV.getHeight() && width == outputV.getWidth()) - << "Matrix dimensions are not equal"; - - real* output_grad = getData(); - real* output_value = outputV.getData(); - hl_softmax_backward(output_value, output_grad, height, width); -} - -void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { - CHECK_EQ(label.getHeight(), height_); - CHECK_EQ(output.getHeight(), height_); - CHECK_EQ(label.getWidth(), output.getWidth()); - CHECK_EQ((size_t)1, width_); - - auto labelptr = dynamic_cast(&label); - if (labelptr) { - LOG(FATAL) << "not supported: GpuSparseMatrix as label"; - } - - BaseMatrix::sumOfSquaredDiffs(output, - label, - /* scaleSum= */ 1, - /* scaleDest= */ 1); -} - -void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { - add2(outputV, label, 1, 2, -2); -} - -void GpuMatrix::tanh(Matrix& output) { 
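The deleted oneHotCrossEntropy computes the usual negative log-likelihood of the labelled class: per its comment, loss[i] = -log(output[i * width + label[i]]) for each sample. A CPU sketch of that forward quantity (illustrative function name, not the hl_matrix_cross_entropy kernel):

#include <cmath>
#include <cstdio>

// loss[i] = -log(output[i * width + label[i]]) for each of `height` samples.
void oneHotCrossEntropySketch(const float* output, const int* label,
                              int height, int width, float* loss) {
  for (int i = 0; i < height; ++i) {
    loss[i] = -std::log(output[i * width + label[i]]);
  }
}

int main() {
  const float output[] = {0.2f, 0.8f,    // sample 0 (softmax over 2 classes)
                          0.6f, 0.4f};   // sample 1
  const int label[] = {1, 0};
  float loss[2];
  oneHotCrossEntropySketch(output, label, /*height=*/2, /*width=*/2, loss);
  std::printf("%.4f %.4f\n", loss[0], loss[1]);  // -log(0.8), -log(0.6)
  return 0;
}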
BaseMatrix::tanh(output); } - -void GpuMatrix::tanhDerivative(Matrix& output) { - BaseMatrix::tanhDerivative(output); -} - -void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); } - -void GpuMatrix::softreluDerivative(Matrix& output) { - BaseMatrix::softreluDerivative(output); -} - -void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { - BaseMatrix::scaledTanh(output, p1, p2); -} - -void GpuMatrix::randomizeUniform() { - CHECK(isContiguous()); - real* data = data_; - size_t size = height_ * width_; - - hl_rand(data, size); -} - -void GpuMatrix::print(std::ostream& os) const { - CHECK(isContiguous()); - CpuMatrix cpuMat(getHeight(), getWidth()); - cpuMat.copyFrom(*this); - cpuMat.print(os); -} - -void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const { - CHECK(isContiguous()); - CpuMatrix cpuMat(getHeight(), getWidth()); - cpuMat.copyFrom(*this); - cpuMat.print(os, height, width); -} - -void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { - CHECK(isContiguous()); - CHECK(height_ == refMat.getHeight()); - CHECK(width_ == refMat.getWidth()); - CpuMatrix cpuRef(height_, width_); - GpuMatrix gpuRef(height_, width_); - cpuRef.copyFrom(refMat); - gpuRef.copyFrom(*this); - size_t diffCnt = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - real a = gpuRef.getElement(i, j); - real b = cpuRef.getElement(i, j); - if (fabs(a - b) > 0.00001) { - ++diffCnt; - if (printDiff) { - os << "ref= " << a << " check= " << b << std::endl; - } - } - } - } - LOG(INFO) << "the diffCnt is " << diffCnt; -} - -void GpuMatrix::upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - CHECK(input.useGpu_ == true) << "Matrix type are not equal"; - CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; - - real* inputData = input.getData(); - real* maskData = mask.getData(); - real* outData = data_; - - size_t batch = input.getHeight(); - - CHECK(imgSizeH * imgSizeW * channels == input.getWidth()); - CHECK(imgSizeH * imgSizeW * channels == mask.getWidth()); - CHECK_EQ(batch, this->getHeight()); - CHECK(width_ == outputH * outputW * channels); - hl_upsample_forward(inputData, - maskData, - batch, - imgSizeH, - imgSizeW, - channels, - outputH, - outputW, - outData); -} - -void GpuMatrix::upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal"; - CHECK(mask.useGpu_ == true) << "Matrix type are not equal"; - - real* outputGradData = outputGrad.getData(); - real* maskData = mask.getData(); - real* inputGradData = data_; - size_t batch = outputGrad.getHeight(); - - CHECK(imgSizeH * imgSizeW == this->getWidth() / channels); - CHECK_EQ(batch, this->getHeight()); - CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth()); - hl_upsample_backward(outputGradData, - maskData, - batch, - imgSizeH, - imgSizeW, - channels, - outputH, - outputW, - inputGradData); -} - -void GpuMatrix::maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP) { - CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - real* maskData = NULL; - size_t frameNum = 
inputMat.getHeight(); - CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputH * outputW * channels); - - if (maskMatP != NULL) { - CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal"; - CHECK(outputH * outputW * channels == maskMatP->getWidth()); - maskData = maskMatP->getData(); - } - - hl_maxpool_forward(frameNum, - inputData, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - data_, - getStride(), - maskData); -} - -void GpuMatrix::maxPoolBackward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && - outV.useGpu_ == true) - << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - real* outData = outV.getData(); - real* outDiff = outGrad.getData(); - size_t frameNum = inputMat.getHeight(); - size_t channels = outV.getWidth() / outputH / outputW; - CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(outGrad.getHeight() == outV.getHeight() && - outGrad.getWidth() == outV.getWidth()); - - hl_maxpool_backward(frameNum, - inputData, - outData, - outDiff, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - data_, - outGrad.getStride()); -} - -void GpuMatrix::avgPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - size_t frameNum = inputMat.getHeight(); - CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputH * outputW * channels); - - hl_avgpool_forward(frameNum, - inputData, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - data_, - getStride(), - excludeMode); -} - -void GpuMatrix::avgPoolBackward(Matrix& outGrad, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; - - real* outDiff = outGrad.getData(); - size_t frameNum = outGrad.getHeight(); - size_t channels = outGrad.getWidth() / outputH / outputW; - CHECK(imgSizeH * imgSizeW * channels == width_); - CHECK(height_ == outGrad.getHeight()); - CHECK(outGrad.getWidth() == outputH * outputW * channels); - - hl_avgpool_backward(frameNum, - outDiff, - channels, - imgSizeH, - imgSizeW, - outputH, - outputW, - sizeX, - sizeY, - strideH, - strideW, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - data_, - outGrad.getStride(), - excludeMode); -} - -void GpuMatrix::maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t 
outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_) << "Matrix type are not correct"; - - real* inputData = inputMat.getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t num = inputMat.getHeight(); - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputD * outputH * outputW * channels); - - hl_maxpool3D_forward(num, - inputData, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData(), - maxPoolIdxData, - getStride()); -} - -void GpuMatrix::maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal"; - - real* outDiff = outGrad.getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t frameNum = getHeight(); - size_t channels = outGrad.getWidth() / outputD / outputH / outputW; - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); - CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() && - outGrad.getWidth() == maxPoolIdx.getWidth()); - - hl_maxpool3D_backward(frameNum, - outDiff, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - getData(), - maxPoolIdxData, - outGrad.getStride()); -} - -void GpuMatrix::avgPool3DForward(Matrix& inputMat, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_) << "Matrix type are not equal"; - - real* inputData = inputMat.getData(); - size_t frameNum = inputMat.getHeight(); - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == outputD * outputH * outputW * channels); - - hl_avgpool3D_forward(frameNum, - inputData, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData(), - getStride()); -} - -void GpuMatrix::avgPool3DBackward(Matrix& outGrad, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - CHECK(outGrad.useGpu_) << "Matrix type are not equal"; - - real* outDiff = outGrad.getData(); - size_t frameNum = outGrad.getHeight(); - size_t channels = outGrad.getWidth() / outputD / outputH / outputW; - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_); - CHECK(height_ == 
outGrad.getHeight()); - CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels); - - hl_avgpool3D_backward(frameNum, - outDiff, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outputD, - outputH, - outputW, - sizeZ, - sizeY, - sizeX, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - scaleTargets, - scaleOutput, - getData(), - outGrad.getStride()); -} - -void GpuMatrix::maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&input)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* outData = getData(); - real* inputData = input.getData(); - const int* starts = sequence.getData(); - int* maxIndex = index.getData(); - size_t numSequences = getHeight(); - size_t dim = getWidth(); - - CHECK_EQ(dim, input.getWidth()); - CHECK_EQ(numSequences, sequence.getSize() - 1); - CHECK_EQ(numSequences * dim, index.getSize()); - - hl_max_sequence_forward( - inputData, starts, outData, maxIndex, numSequences, dim); -} - -void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&outputGrad)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* inputGrad = getData(); - real* outGrad = outputGrad.getData(); - int* maxIndex = index.getData(); - size_t dim = getWidth(); - size_t numSequences = sequence.getSize() - 1; - - CHECK_EQ(dim, outputGrad.getWidth()); - CHECK_EQ(numSequences, outputGrad.getHeight()); - CHECK_EQ(numSequences * dim, index.getSize()); - - hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); -} - -void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { - CHECK(data.useGpu_ == true && W.useGpu_ == true) - << "Matrix type are not equal"; - real* input = data.getData(); - real* w = W.getData(); - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - real* output = getData(); - hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); -} - -void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { - CHECK(oGrad.useGpu_ == true && data.useGpu_ == true) - << "Matrix type are not equal"; - real* ograd = oGrad.getData(); - real* input = data.getData(); - real* wgrad = data_; - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = this->getHeight() * this->getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - hl_param_relu_backward_w( - wgrad, ograd, input, numElements, numSamples, partial_sum); -} - -void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - real* diff = data_; - real* input = data.getData(); - real* ograd = oGrad.getData(); - real* w = W.getData(); - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - hl_param_relu_backward_diff( - ograd, input, w, diff, numElements, numSamples, partial_sum); -} - -void GpuMatrix::addColumnVector(const Matrix& b) { - BaseMatrix::addColVector(const_cast(b)); -} - -void GpuMatrix::bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t 
inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast(&in)); - - const size_t outputW = getWidth(); - const size_t outputH = getHeight(); - const size_t inputW = in.getWidth(); - const size_t inputH = in.getHeight(); - - real* outData = getData(); - const real* inData = in.getData(); - - if (inImgH == outImgW && inImgW == outImgW) { - this->copyFrom(in); - } else { - hl_bilinear_forward(inData, - inImgH, - inImgW, - inputH, - inputW, - outData, - outImgH, - outImgW, - outputH, - outputW, - numChannels, - ratioH, - ratioW); - } -} - -void GpuMatrix::bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast(&out)); - - const size_t inputW = getWidth(); - const size_t inputH = getHeight(); - const size_t outputW = out.getWidth(); - const size_t outputH = out.getHeight(); - - real* inGrad = getData(); - const real* outGrad = out.getData(); - - if (outImgH == inImgH && outImgW == inImgW) { - this->add(const_cast(out)); - } else { - hl_bilinear_backward(inGrad, - inImgH, - inImgW, - inputH, - inputW, - outGrad, - outImgH, - outImgW, - outputH, - outputW, - numChannels, - ratioH, - ratioW); - } -} - -void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { -#ifdef PADDLE_WITH_CUDA - GpuMatrix* outputPtr = dynamic_cast(&output); - auto labelPtr = dynamic_cast(&label); - - CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; - CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; - CHECK(height_ == outputPtr->height_ && width_ == 1 && - outputPtr->width_ == labelPtr->getWidth() && - outputPtr->height_ == labelPtr->getHeight()) - << "Matrix dimensions are not equal"; - - real* output_d = outputPtr->data_; - real* entropy_d = data_; - hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); - hl_matrix_multi_binary_cross_entropy( - output_d, entropy_d, mat_d, height_, outputPtr->width_); -#endif -} - -void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { -#ifdef PADDLE_WITH_CUDA - GpuMatrix* outputPtr = dynamic_cast(&output); - auto labelPtr = dynamic_cast(&label); - - CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; - CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; - CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ && - outputPtr->width_ == labelPtr->getWidth() && - outputPtr->height_ == labelPtr->getHeight()) - << "Matrix dimensions are not equal"; - - real* output_d = outputPtr->data_; - real* grad_d = data_; - hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); - hl_matrix_multi_binary_cross_entropy_bp( - output_d, grad_d, mat_d, height_, width_); -#endif -} - -void GpuMatrix::vol2Col(real* dataSrc, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW) { - hl_matrix_vol2Col(dataSrc, - channels, - depth, - height, - width, - filterD, - filterH, - filterW, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData()); -} - -void GpuMatrix::col2Vol(real* dataDst, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real 
alpha, - real beta) { - hl_matrix_col2Vol(dataDst, - channels, - depth, - height, - width, - filterD, - filterH, - filterW, - strideD, - strideH, - strideW, - paddingD, - paddingH, - paddingW, - getData(), - alpha, - beta); -} - -/** - * CpuMatrix - */ - -CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans) - : Matrix(std::make_shared(height * width * sizeof(real)), - height, - width, - trans, - false) {} - -CpuMatrix::~CpuMatrix() {} - -void CpuMatrix::zeroMem() { - CHECK(data_ != NULL); - if (isContiguous()) { - memset(data_, 0, height_ * width_ * sizeof(real)); - } else { - BaseMatrix::zero(); - } -} -void CpuMatrix::resetOne() { - CHECK(data_ != NULL); - BaseMatrix::one(); -} - -void CpuMatrix::copyFrom(const Matrix& src) { - CHECK(isContiguous()); - if (typeid(src) == typeid(GpuMatrix)) { - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_device2host( - data_, const_cast(src.getData()), sizeof(real) * elementCnt_); - } else if (typeid(src) == typeid(CpuMatrix) || - typeid(src) == typeid(SharedCpuMatrix)) { - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - memcpy(data_, src.getData(), sizeof(real) * elementCnt_); - } else if (typeid(src) == typeid(CpuSparseMatrix)) { - CHECK_GE(elementCnt_, src.getElementCnt()); - copyFrom(dynamic_cast(const_cast(src))); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void CpuMatrix::copyFrom(CpuSparseMatrix& src) { - CHECK(isContiguous()); - CHECK(height_ == src.getHeight()); - CHECK(width_ == src.getWidth()); - memset(data_, 0, sizeof(real) * height_ * width_); - if (src.getValueType() == FLOAT_VALUE) { - if (src.getFormat() == SPARSE_CSC) { - int* rows = src.getRows(); - real* vals = src.getValue(); - for (size_t i = 0; i < width_; i++) { - for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); - j++) { - data_[rows[j] * width_ + i] = vals[j]; - } - } - } else { - int* cols = src.getCols(); - real* vals = src.getValue(); - for (size_t i = 0; i < height_; i++) { - for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); - j++) { - data_[i * width_ + cols[j]] = vals[j]; - } - } - } - } else { - if (src.getFormat() == SPARSE_CSC) { - int* rows = src.getRows(); - for (size_t i = 0; i < width_; i++) { - for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1); - j++) { - data_[rows[j] * width_ + i] = 1.0; - } - } - } else { - int* cols = src.getCols(); - for (size_t i = 0; i < height_; i++) { - for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1); - j++) { - data_[i * width_ + cols[j]] = 1.0; - } - } - } - } -} - -void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - CHECK(isContiguous()); - CHECK(src.isContiguous()); - CHECK(elementCnt_ == src.getElementCnt()); - if (typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_async(this->getData(), - const_cast(src.getData()), - sizeof(real) * elementCnt_, - stream); - // There is a need to add synchronization to ensure that the data is copied. 
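CpuMatrix::copyFrom(CpuSparseMatrix&), deleted above, densifies a sparse matrix by zeroing the buffer and scattering each stored value (or 1.0 when the sparse matrix stores no values) into row * width + col, with one branch each for CSC and CSR storage. A sketch of the CSR branch, assuming only the standard library (csrToDense and its parameters are illustrative):

#include <cstdio>
#include <cstring>
#include <vector>

// CSR branch only: zero the dense buffer, then scatter vals[j] into
// dense[row * width + cols[j]] for every stored entry of that row.
void csrToDense(const int* rowStart, const int* cols, const float* vals,
                int height, int width, float* dense) {
  std::memset(dense, 0, sizeof(float) * height * width);
  for (int i = 0; i < height; ++i) {
    for (int j = rowStart[i]; j < rowStart[i + 1]; ++j) {
      dense[i * width + cols[j]] = vals[j];
    }
  }
}

int main() {
  // 2x3 matrix with nonzeros (0,1)=5 and (1,2)=7, in CSR form.
  const int rowStart[] = {0, 1, 2};
  const int cols[] = {1, 2};
  const float vals[] = {5.0f, 7.0f};
  std::vector<float> dense(2 * 3);
  csrToDense(rowStart, cols, vals, 2, 3, dense.data());
  std::printf("%.0f %.0f\n", dense[0 * 3 + 1], dense[1 * 3 + 2]);  // 5 7
  return 0;
}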
- hl_stream_synchronize(stream); - } else if (typeid(src) == typeid(CpuMatrix)) { - memcpy(data_, src.getData(), sizeof(real) * elementCnt_); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) { - CHECK(isContiguous()); - CHECK(size <= elementCnt_); - memcpy(data_, cpuSrc, sizeof(real) * size); -} - -void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) { - CHECK(isContiguous()); - for (size_t i = 0; i < height_; i++) { - memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_); - } -} - -void CpuMatrix::copyFrom(const IVector& src) { - CHECK(isContiguous()); - CHECK(elementCnt_ == src.getSize()) - << "the src and dst should have same size."; - const int* cpuSrc = NULL; - IVectorPtr tmp; - if (src.useGpu()) { - CpuIVector tmp(src.getSize()); - tmp.copyFrom(src); - cpuSrc = tmp.getData(); - } else { - cpuSrc = src.getData(); - } - for (size_t i = 0; i < elementCnt_; ++i) { - data_[i] = cpuSrc[i]; - } -} - -void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(b.getWidth(), width); - const int* index = rowIndex.getData(); - for (size_t i = 0; i < height; i++) { - CHECK_LT(static_cast(index[i]), b.getHeight()); - real* src = b.getData() + index[i] * width; - real* dst = getData() + i * width; - memcpy(dst, src, sizeof(real) * width); - } -} - -MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) { - CHECK(isContiguous()); - - if (height == 0 && width == 0) { - height = height_; - width = width_; - } - - CHECK(width && height); - - if (useGpu) { - return std::make_shared(height, width); - } else { - return std::make_shared(height, width); - } -} - -void CpuMatrix::resize(size_t newHeight, size_t newWidth) { - size_t newSize = newHeight * newWidth; - if (NULL == memoryHandle_.get() || - newSize * sizeof(real) > memoryHandle_->getAllocSize()) { - memoryHandle_ = std::make_shared(newSize * sizeof(real)); - data_ = reinterpret_cast(memoryHandle_->getBuf()); - } - - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newSize; - stride_ = width_; -} - -real CpuMatrix::getElement(size_t x, size_t y) const { - return data_[x * stride_ + y]; -} - -real CpuMatrix::getSum() { - CHECK(isContiguous()); - double sum = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - sum += data_[i * width_ + j]; - } - } - return sum; -} - -void CpuMatrix::accumulateColSum(Matrix& src) { - CHECK_EQ(getWidth(), src.getWidth()); - CHECK_EQ(getHeight(), (size_t)1); - - sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1); -} - -real CpuMatrix::getAbsSum() { - CHECK(isContiguous()); - double sum = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - sum += fabs(data_[i * width_ + j]); - } - } - return sum; -} - -MatrixPtr CpuMatrix::getTranspose() { - if (memoryHandle_.get() != NULL) { - return std::make_shared( - std::dynamic_pointer_cast(memoryHandle_), - height_, - width_, - true); - } else { - MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true)); - return copy_T; - } -} - -void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - if (memAlloc) { - matTrans = std::make_shared(width_, height_); - } else { - CHECK(matTrans != NULL); - CHECK_EQ(matTrans->getHeight(), width_); - CHECK_EQ(matTrans->getWidth(), height_); - } - real* dataTrans = matTrans->getData(); - real* data = getData(); - int lda = getStride(); - int ldc = matTrans->getStride(); - - for 
(size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - dataTrans[j * ldc + i] = data[i * lda + j]; - } - } -} - -void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { - if (memAlloc) { - matRot = std::make_shared(width_, height_); - } else { - CHECK(matRot != NULL); - CHECK_EQ(matRot->getHeight(), width_); - CHECK_EQ(matRot->getWidth(), height_); - } - real* dataRot = matRot->getData(); - real* data = getData(); - - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - if (clockWise) { - dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j]; - } else { - dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)]; - } - } - } -} - -MatrixPtr CpuMatrix::getInverse() { - MatrixPtr matInv; - inverse(matInv, true); - return matInv; -} - -void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { - CHECK_EQ(height_, width_); - - if (memAlloc) { - matInv = std::make_shared(height_, width_); - } else { - CHECK(matInv != NULL); - } - - CHECK_EQ(height_, matInv->getHeight()); - CHECK_EQ(width_, matInv->getWidth()); - matInv->copyFrom(*this); - - real* data = getData(); - real* dataInv = matInv->getData(); - int ldc = matInv->getStride(); - - if (height_ == 1) { - CHECK_NE(*data, 0); - *dataInv = 1.0 / (*data); - return; - } - - /* Compute the LU decomposition of the matrix */ - std::vector ipiv(height_); - CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor); - int info = getrf(order, height_, height_, dataInv, ldc, ipiv.data()); - CHECK_EQ(info, 0); - - /* Compute the inverse of the matrix given its LU decompsotion */ - info = getri(order, height_, dataInv, ldc, ipiv.data()); - CHECK_EQ(info, 0); -} - -void CpuMatrix::upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - real* inputData = input.getData(); - real* maskData = mask.getData(); - real* outData = data_; - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - size_t batch = input.getHeight(); - CHECK(inLength == input.getWidth() / channels); - CHECK_EQ(batch, this->getHeight()); - CHECK_EQ(channels * outLength, this->getWidth()); - - for (size_t k = 0; k < batch; k++) { - for (size_t c = 0; c < channels; c++) { - for (size_t i = 0; i < inLength; i++) { - size_t out_index = static_cast(maskData[i]); - if (out_index >= outLength) { - LOG(FATAL) << "upsample index " << out_index << " out of range."; - } - outData[out_index] = inputData[i]; - } - inputData += inLength; - maskData += inLength; - outData += outLength; - } - } -} - -void CpuMatrix::upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - real* outputGradData = outputGrad.getData(); - real* maskData = mask.getData(); - real* inputGradData = data_; - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - size_t batch = outputGrad.getHeight(); - CHECK(inLength == this->getWidth() / channels); - CHECK_EQ(batch, this->getHeight()); - CHECK_EQ(channels * outLength, outputGrad.getWidth()); - - for (size_t k = 0; k < batch; k++) { - for (size_t c = 0; c < channels; c++) { - for (size_t i = 0; i < inLength; i++) { - size_t out_index = static_cast(maskData[i]); - if (out_index >= outLength) { - LOG(FATAL) << "upsample index " << out_index << " out of range."; - } - inputGradData[i] = outputGradData[out_index]; - } - inputGradData += inLength; - 
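The removed CpuMatrix::inverse builds a dense inverse in two LAPACK steps: getrf produces the LU factorization in place, then getri forms the inverse from those factors (with a scalar shortcut when height_ == 1). A sketch calling LAPACKE directly, the same routines the deleted dynload wrappers point at (the 2x2 values are illustrative):

#include <lapacke.h>
#include <cstdio>

// LU-factorize in place, then invert from the factors; a nonzero info signals
// a singular matrix or an argument error.
int main() {
  float a[4] = {4.0f, 7.0f,
                2.0f, 6.0f};  // det = 10
  lapack_int ipiv[2];
  lapack_int info = LAPACKE_sgetrf(LAPACK_ROW_MAJOR, 2, 2, a, 2, ipiv);
  if (info == 0) info = LAPACKE_sgetri(LAPACK_ROW_MAJOR, 2, a, 2, ipiv);
  if (info != 0) {
    std::printf("inverse failed, info = %d\n", static_cast<int>(info));
    return 1;
  }
  std::printf("%.1f %.1f\n%.1f %.1f\n", a[0], a[1], a[2], a[3]);
  // expected: 0.6 -0.7 / -0.2 0.4
  return 0;
}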
maskData += inLength; - outputGradData += outLength; - } - } -} - -void CpuMatrix::maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP) { - real* inputData = inputMat.getData(); - real* outData = data_; - real* maskData = NULL; - size_t num = inputMat.getHeight(); - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - CHECK(inLength == inputMat.getWidth() / channels); - CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels * outLength, this->getWidth()); - size_t outStride = getStride(); - - if (maskMatP != NULL) { - maskData = maskMatP->getData(); - CHECK_EQ(channels * outLength, maskMatP->getWidth()); - } - - /* pool max one by one */ - for (size_t n = 0; n < num; ++n) { // frame by frame - if (!isContiguous()) { - outData = data_ + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { // channel by channel - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = hstart + sizeY; - hstart = hstart < 0 ? 0 : hstart; - hend = hend < (int)imgSizeH ? hend : (int)imgSizeH; - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = wstart + sizeX; - wstart = wstart < 0 ? 0 : wstart; - wend = wend < (int)imgSizeW ? wend : (int)imgSizeW; - - real maxval = -(real)FLT_MAX; - int max_index = -1; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (maxval < inputData[h * imgSizeW + w]) { - maxval = inputData[h * imgSizeW + w]; - max_index = h * imgSizeW + w; - } - } - } - - outData[ph * outputW + pw] = maxval; - if (maskData != NULL) maskData[ph * outputW + pw] = max_index; - } - } - // compute offset - inputData += inLength; - outData += outLength; - - if (maskData != NULL) maskData += outLength; - } - } -} - -void CpuMatrix::maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - size_t num = image.getHeight(); - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - size_t channels = size_t(width_ / inLength); - CHECK(image.getWidth() == inLength * channels); - CHECK(image.getHeight() == height_ && image.getWidth() == width_); - CHECK(outV.getHeight() == outGrad.getHeight() && - outV.getWidth() == outGrad.getWidth()); - - real* tgtGrad = data_; - real* inData = image.getData(); - real* otData = outV.getData(); - real* otGrad = outGrad.getData(); - - size_t outStride = outV.getStride(); - real* origOutData = otData; - real* origOutGrad = otGrad; - - for (size_t n = 0; n < num; ++n) { - if (!outV.isContiguous()) { - otData = origOutData + n * outStride; - otGrad = origOutGrad + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtGrad[h * imgSizeW + w] = - scaleTargets * tgtGrad[h * 
imgSizeW + w] + - scaleOutput * otGrad[ph * outputW + pw] * - (inData[h * imgSizeW + w] == otData[ph * outputW + pw]); - } - } - } - } - // offset - inData += inLength; - tgtGrad += inLength; - otData += outLength; - otGrad += outLength; - } - } -} - -void CpuMatrix::avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - // The main loop - size_t num = input.getHeight(); - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - CHECK(inLength * channels == input.getWidth()); - CHECK(outLength * channels * num == height_ * width_); - real* tgtData = data_; - real* inData = input.getData(); - - for (size_t n = 0; n < num; ++n) { - if (!isContiguous()) { - tgtData = data_ + n * getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - tgtData[ph * outputW + pw] = 0; // clear - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; - } - } - int poolSize = - excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; - CHECK(poolSize); - tgtData[ph * outputW + pw] /= poolSize; - } - } - // compute offset - inData += inLength; - tgtData += outLength; - } - } -} - -void CpuMatrix::avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode) { - size_t num = input.getHeight(); - size_t channels = input.getWidth() / outputH / outputW; - size_t inLength = imgSizeH * imgSizeW; - size_t outLength = outputH * outputW; - CHECK(inLength * channels == getWidth()); - real* inData = input.getData(); - real* outData = getData(); - - for (size_t n = 0; n < num; ++n) { - if (!input.isContiguous()) { - inData = input.getData() + n * input.getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - int poolSize = - excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; - CHECK(poolSize); - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize; - } - } - } - } - // offset - outData += inLength; - inData += outLength; - } - } -} - -void CpuMatrix::maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - real* inputData = inputMat.getData(); - real* outData = getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t num = inputMat.getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - CHECK(inLength == inputMat.getWidth() / channels); - CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels * outLength, this->getWidth()); - size_t outStride = getStride(); - - /* initialize the data_ */ - for (size_t i = 0; i < height_; i++) { - for (size_t j = 0; j < width_; j++) { - outData[(i)*outStride + j] = -(real)FLT_MAX; - maxPoolIdxData[(i)*outStride + j] = -1; - } - } - - /* pool max one by one */ - for (size_t n = 0; n < num; ++n) { // frame by frame - if (!isContiguous()) { - outData = getData() + n * outStride; - maxPoolIdxData = maxPoolIdx.getData() + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { // channel by channel - for (size_t pd = 0; pd < outputD; ++pd) { - int dstart = pd * strideD - paddingD; - int dend = std::min(dstart + sizeZ, imgSizeD); - dstart = std::max(dstart, 0); - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - int maxIdx = -1; - real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (maxOutData < - inputData[(d * imgSizeH + h) * imgSizeW + w]) { - maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w]; - maxIdx = (d * imgSizeH + h) * imgSizeW + w; - } - } - } - } - outData[(pd * outputH + ph) * outputW + pw] = maxOutData; - maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx; - } - } - } - // compute offset - inputData += inLength; - outData += outLength; - maxPoolIdxData += outLength; - } - } -} - -void CpuMatrix::maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - size_t num = getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - size_t channels = size_t(width_ / inLength); - CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() && - maxPoolIdx.getWidth() == outGrad.getWidth()); - - real* tgtGrad = getData(); - real* otGrad = outGrad.getData(); - real* maxPoolIdxData = maxPoolIdx.getData(); - size_t outStride = outGrad.getStride(); - - 
for (size_t n = 0; n < num; ++n) { - if (!outGrad.isContiguous()) { - otGrad = outGrad.getData() + n * outStride; - maxPoolIdxData = maxPoolIdx.getData() + n * outStride; - } - for (size_t c = 0; c < channels; ++c) { - for (size_t pd = 0; pd < outputD; ++pd) { - for (size_t ph = 0; ph < outputH; ++ph) { - for (size_t pw = 0; pw < outputW; ++pw) { - const size_t index = (pd * outputH + ph) * outputW + pw; - const size_t tgtIdx = static_cast(maxPoolIdxData[index]); - tgtGrad[tgtIdx] = - scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index]; - } - } - } - // offset - tgtGrad += inLength; - otGrad += outLength; - maxPoolIdxData += outLength; - } - } -} - -void CpuMatrix::avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - // The main loop - size_t num = input.getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - CHECK(inLength * channels == input.getWidth()); - CHECK(outLength * channels * num == height_ * width_); - real* tgtData = getData(); - real* inData = input.getData(); - - for (size_t n = 0; n < num; ++n) { - if (!isContiguous()) { - tgtData = data_ + n * getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t pd = 0; pd < outputD; ++pd) { - int dstart = pd * strideD - paddingD; - int dend = std::min(dstart + sizeZ, imgSizeD); - dstart = std::max(dstart, 0); - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - - tgtData[(pd * outputH + ph) * outputW + pw] = 0; // clear - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtData[(pd * outputH + ph) * outputW + pw] += - inData[(d * imgSizeH + h) * imgSizeW + w]; - } - } - } - int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); - CHECK(poolSize); - tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize; - } - } - } - // compute offset - inData += inLength; - tgtData += outLength; - } - } -} - -void CpuMatrix::avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - size_t num = input.getHeight(); - size_t inLength = imgSizeH * imgSizeW * imgSizeD; - size_t outLength = outputH * outputW * outputD; - size_t channels = input.getWidth() / outLength; - CHECK(inLength * channels == getWidth()); - real* inData = input.getData(); - real* outData = getData(); - - for (size_t n = 0; n < num; ++n) { - if (!input.isContiguous()) { - inData = input.getData() + n * input.getStride(); - } - for (size_t c = 0; c < channels; ++c) { - for (size_t pd = 0; pd < outputD; ++pd) { - int dstart = pd * strideD - paddingD; - int dend = std::min(dstart + sizeZ, imgSizeD); - dstart = std::max(dstart, 0); - for (size_t ph = 0; ph < outputH; ++ph) { - int hstart = ph * 
strideH - paddingH; - int hend = std::min(hstart + sizeY, imgSizeH); - hstart = std::max(hstart, 0); - for (size_t pw = 0; pw < outputW; ++pw) { - int wstart = pw * strideW - paddingW; - int wend = std::min(wstart + sizeX, imgSizeW); - wstart = std::max(wstart, 0); - int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); - CHECK(poolSize); - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - outData[(d * imgSizeH + h) * imgSizeW + w] += - inData[(pd * outputH + ph) * outputW + pw] / poolSize; - } - } - } - } - } - } - // offset - outData += inLength; - inData += outLength; - } - } -} - -/** - * Input: one or more sequences. Each sequence contains some instances. - * Output: output size is the number of input sequences (NOT input instances). - * output[i] is set to max_{for each instance in this sequence}{input[i]} - */ -void CpuMatrix::maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&input)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* outData = getData(); - real* inputData = input.getData(); - const int* starts = sequence.getData(); - int* maxIndex = index.getData(); - size_t numSequences = getHeight(); - size_t dim = getWidth(); - - CHECK_EQ(dim, input.getWidth()); - CHECK_EQ(numSequences, sequence.getSize() - 1); - CHECK_EQ(starts[numSequences], (int)input.getHeight()); - CHECK_EQ(numSequences * dim, index.getSize()); - - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - // current sequence, loop for each input instance - // (1) first instance: do not need compare, copy value to outV directly - for (size_t k = 0; k < dim; ++k) { - outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k]; - maxIndex[sequenceId * dim + k] = starts[sequenceId]; - } - // (2) other instance in same sequence - for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1]; - ++insId) { - // insId is the index on all instances - for (size_t k = 0; k < dim; ++k) { - // for each dim - if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) { - // update max value and record index - outData[sequenceId * dim + k] = inputData[insId * dim + k]; - maxIndex[sequenceId * dim + k] = insId; - } - } - } - } -} - -void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - CHECK(dynamic_cast(&outputGrad)); - CHECK(dynamic_cast(&sequence)); - CHECK(dynamic_cast(&index)); - - real* inputGrad = getData(); - real* outGrad = outputGrad.getData(); - int* maxIndex = index.getData(); - size_t dim = getWidth(); - size_t numSequences = sequence.getSize() - 1; - - CHECK_EQ(dim, outputGrad.getWidth()); - CHECK_EQ(numSequences, outputGrad.getHeight()); - CHECK_EQ(numSequences * dim, index.getSize()); - - for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { - // current sequence - for (size_t j = 0; j < dim; ++j) { - // each dim - int insId = maxIndex[sequenceId * dim + j]; - inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j]; - } - } -} - -inline void vecAddTo(real* a, const real* b, size_t len) { - for (unsigned int i = 0; i < len; ++i) { - a[i] += b[i]; - } -} - -inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { - for (unsigned int i = 0; i < len; ++i) { - a[i] += scaleB * b[i]; - } -} - -inline void colVecAddTo( - real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) { - for (unsigned int i = 0; i < len; ++i) { 
- a[i * aWidth] += b[i * bWidth]; - } -} - -inline void colVecAddTo( - real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { - for (unsigned int i = 0; i < len; ++i) { - a[i * aWidth] += b[i * bWidth] * c; - } -} - -void CpuMatrix::addBias(Matrix& b, real scale) { - CHECK(b.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(b.getHeight(), (size_t)1); - CHECK_EQ(width_, b.getWidth()); - real* aData = getData(); - real* bData = b.getData(); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - - if (scale == 1 && getStride() % 32 == 0) { // use libaddto - // @TODO(yuyang18) Make input addr can be unaligned. - // So merge this if and else - CHECK_EQ((size_t)aData % 32, 0UL); - CHECK_EQ((size_t)bData % 32, 0UL); - for (size_t i = 0; i < numSamples; i++) { - simd::addTo(aData + i * getStride(), bData, dim); - } - } else { - for (size_t i = 0; i < numSamples; i++) { - for (size_t j = 0; j < dim; j++) { - aData[i * getStride() + j] += scale * bData[j]; - } - } - } -} - -void CpuMatrix::addSharedBias(Matrix& b, real scale) { - CHECK_EQ(b.getHeight(), (size_t)1); - real* aData = getData(); - real* bData = b.getData(); - size_t numSamples = getHeight(); - size_t channel = b.getWidth(); - CHECK_EQ(getWidth() % channel, 0UL); - size_t dim = getWidth() / channel; - - for (size_t i = 0; i < numSamples; i++) { - for (size_t c = 0; c < channel; c++) { - for (size_t j = 0; j < dim; j++) { - aData[i * getStride() + c * dim + j] += scale * bData[c]; - } - } - } -} - -void CpuMatrix::collectBias(Matrix& a, real scale) { - CHECK_EQ(getHeight(), (size_t)1); - CHECK_EQ(width_, a.getWidth()); - CpuSparseMatrix* aptr = dynamic_cast(&a); - if (!aptr) { - sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); - } else { - size_t nnz = aptr->getElementCnt(); - int* cols = aptr->getCols(); - real* A = aptr->getValue(); - real* B = getData(); - for (size_t i = 0; i < nnz; i++) { - B[cols[i]] += scale * A[i]; - } - } -} - -void CpuMatrix::collectSharedBias(Matrix& a, real scale) { - CHECK_EQ(getHeight(), (size_t)1); - real* B = getData(); - real* A = a.getData(); - size_t numSamples = a.getHeight(); - size_t channel = getWidth(); - CHECK_EQ(a.getWidth() % channel, 0UL); - size_t dim = a.getWidth() / channel; - for (size_t i = 0; i < numSamples; i++) { - for (size_t c = 0; c < channel; c++) { - for (size_t j = 0; j < dim; j++) { - B[c] += scale * A[i * channel * dim + c * dim + j]; - } - } - } -} - -void CpuMatrix::sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); - MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); - for (size_t i = 0; i < height; i++) { - int sequenceLength = starts[i + 1] - starts[i]; - if (0 == sequenceLength) { - // empty sequence - continue; - } - outMtx->setData(dst + i * width); - dataMtx->setData(src + starts[i] * width, sequenceLength, width); - if (mode == 0) { - // plain average - outMtx->sumCols(*dataMtx, - (real)1 / (real)sequenceLength, - /* scaleDest= */ 1); - } else if (mode == 1) { - // sum instead of average - outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1); - } else if (mode == 2) { - // divide by square root of sequenceLength - outMtx->sumCols(*dataMtx, - (real)1 / std::sqrt(sequenceLength), - 
/* scaleDest= */ 1); - } else { - LOG(FATAL) << "should not reach here"; - } - } -} - -void CpuMatrix::sequenceAvgBackward(Matrix& a, - const IVector& startsPos, - int mode) { - size_t height = a.getHeight(); - size_t width = getWidth(); - CHECK_EQ(height, startsPos.getSize() - 1); - CHECK_EQ(width, a.getWidth()); - real* dst = getData(); - real* src = a.getData(); - const int* starts = startsPos.getData(); - MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); - MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); - for (size_t i = 0; i < height; ++i) { - int sequenceLength = starts[i + 1] - starts[i]; - if (0 == sequenceLength) { - // empty sequence - continue; - } - outMtx->setData(dst + starts[i] * width, sequenceLength, width); - dataMtx->setData(src + i * width); - if (mode == 0) { - // plain average - outMtx->addBias(*dataMtx, 1.0f / sequenceLength); - } else if (mode == 1) { - // sum instead of average - outMtx->addBias(*dataMtx, 1.0f); - } else if (mode == 2) { - // divide by square root of sequenceLength - outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength)); - } else { - LOG(FATAL) << "should not reach here"; - } - } -} - -/* this = scaleAB*(a*b) + scaleT*this*/ -void CpuMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - const auto a_ptr_s = dynamic_cast(&a); - const auto b_ptr_s = dynamic_cast(&b); - - if (a_ptr && b_ptr) { - mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT); - } else if (a_ptr_s && b_ptr) { - mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT); - } else if (a_ptr && b_ptr_s) { - mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT); - } else { - LOG(FATAL) << "Not supported"; - } -} - -void CpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - if (dynamic_cast(b)) { - return mul(a, dynamic_cast(b), this, scaleAB, scaleT); - } else if (dynamic_cast(b)) { - return mul(a, dynamic_cast(b), this, scaleAB, scaleT); - } else { - return mul(a, b, this, scaleAB, scaleT); - } -} - -void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - - size_t a_col, b_col, a_row, b_row; - bool a_trans, b_trans; - if (!a->isTransposed()) { - a_col = a->getWidth(); - a_row = a->getHeight(); - a_trans = false; - } else { - a_col = a->getHeight(); - a_row = a->getWidth(); - a_trans = true; - } - if (!b->isTransposed()) { - b_col = b->getWidth(); - b_row = b->getHeight(); - b_trans = false; - } else { - b_col = b->getHeight(); - b_row = b->getWidth(); - b_trans = true; - } - - CHECK_EQ(a_col, b_row); - CHECK_EQ(a_row, getHeight()); - CHECK_EQ(b_col, getWidth()); - - real* A = a->getData(); - real* B = b->getData(); - real* C = getData(); - - int M = getHeight(); - int N = getWidth(); - int K = a_col; - int lda = a->getStride(); - int ldb = b->getStride(); - int ldc = getStride(); - BlasGemm::compute( - a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); -} - -void CpuMatrix::mul( - CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) { - CHECK(!c->isTransposed()) << "Not supported"; - CHECK_EQ(c->getValueType(), FLOAT_VALUE); - - real* A = a->getData(); - real* B = b->getData(); - real* C = c->getValue(); - int* rows = c->getRows(); - int* cols = c->getCols(); - size_t height = c->getHeight(); - size_t width = c->getWidth(); - 
if (scaleT == 0) { - c->zeroMem(); - } - - if (!a->isTransposed() && !b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getWidth(), width); - if (c->getFormat() == SPARSE_CSC) { - for (size_t i = 0; i < width; i++) { - size_t start = c->getColStartIdx(i); - size_t end = c->getColStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t rowIdx = rows[j]; - for (size_t k = 0; k < m; k++) { - sum += A[rowIdx * m + k] * B[k * width + i]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } else { - for (size_t i = 0; i < height; i++) { - size_t start = c->getRowStartIdx(i); - size_t end = c->getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += A[i * m + k] * B[k * width + colIdx]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } - } else if (a->isTransposed() && !b->isTransposed()) { - size_t m = a->getHeight(); - CHECK_EQ(m, b->getHeight()); - CHECK_EQ(b->getWidth(), width); - CHECK_EQ(a->getWidth(), height); - - if (c->getFormat() == SPARSE_CSC) { - for (size_t i = 0; i < width; i++) { - size_t start = c->getColStartIdx(i); - size_t end = c->getColStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t rowIdx = rows[j]; - for (size_t k = 0; k < m; k++) { - sum += A[k * height + rowIdx] * B[k * width + i]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } else { - for (size_t i = 0; i < height; i++) { - int start = c->getRowStartIdx(i); - int end = c->getRowStartIdx(i + 1); - for (int j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += A[k * height + i] * B[k * width + colIdx]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } - } else if (!a->isTransposed() && b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getWidth(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getHeight(), width); - if (c->getFormat() == SPARSE_CSR) { - for (size_t i = 0; i < height; i++) { - size_t start = c->getRowStartIdx(i); - size_t end = c->getRowStartIdx(i + 1); - for (size_t j = start; j < end; j++) { - real sum = 0; - size_t colIdx = cols[j]; - for (size_t k = 0; k < m; k++) { - sum += A[i * m + k] * B[colIdx * m + k]; - } - C[j] = scaleAB * sum + scaleT * C[j]; - } - } - } else { - LOG(FATAL) << "Not supported csc format " - "when a is not trans and b is trans"; - } - } else { - LOG(FATAL) << "Not supported"; - } -} - -void CpuMatrix::mul(CpuMatrix* a, - CpuSparseMatrix* b, - real scaleAB, - real scaleT) { - CHECK(!trans_) << "Not supported"; - CHECK(!a->isTransposed()) << "Not supported"; - CHECK(scaleT == 0 || scaleT == 1); - - // TODO(yuyang18): Maybe bug implementation here - CHECK_EQ(scaleAB, static_cast(1.0)); - - real* A = a->getData(); - real* B = b->getValue(); - real* C = getData(); - int* rows = b->getRows(); - int* cols = b->getCols(); - - if (scaleT == 0) { - zeroMem(); - } - if (b->getFormat() == SPARSE_CSC) { - if (!b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), width_); - - if (b->getValueType() == NO_VALUE) { - for (size_t j = 0; j < b->getWidth(); ++j) { - int start = b->getColStartIdx(j); - int end = b->getColStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() 
== FLOAT_VALUE) { - for (size_t j = 0; j < b->getWidth(); ++j) { - int start = b->getColStartIdx(j); - int end = b->getColStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo( - C + j, A + rows[i], B[i], height_, width_, a->getWidth()); - } - } - } - } else /*if (b->isTransposed())*/ { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), width_); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), m); - if (b->getValueType() == NO_VALUE) { - for (size_t i = 0; i < b->getWidth(); ++i) { - int start = b->getColStartIdx(i); - int end = b->getColStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < b->getWidth(); ++i) { - int start = b->getColStartIdx(i); - int end = b->getColStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo( - C + rows[j], A + i, B[j], height_, width_, a->getWidth()); - } - } - } - } - } else { - if (!b->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), width_); - - if (b->getValueType() == NO_VALUE) { - for (size_t j = 0; j < b->getHeight(); ++j) { - int start = b->getRowStartIdx(j); - int end = b->getRowStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() == FLOAT_VALUE) { - for (size_t j = 0; j < b->getHeight(); ++j) { - int start = b->getRowStartIdx(j); - int end = b->getRowStartIdx(j + 1); - for (int i = start; i < end; ++i) { - colVecAddTo( - C + cols[i], A + j, B[i], height_, width_, a->getWidth()); - } - } - } - } else /*if (b->isTransposed())*/ { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), width_); - CHECK_EQ(a->getHeight(), height_); - CHECK_EQ(b->getWidth(), m); - if (b->getValueType() == NO_VALUE) { - for (size_t i = 0; i < b->getHeight(); ++i) { - int start = b->getRowStartIdx(i); - int end = b->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth()); - } - } - } else if (b->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < b->getHeight(); ++i) { - int start = b->getRowStartIdx(i); - int end = b->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - colVecAddTo( - C + i, A + cols[j], B[j], height_, width_, a->getWidth()); - } - } - } - } - } -} - -void CpuMatrix::selectRows(Matrix& table, IVector& ids) { - if (dynamic_cast(&table)) { - selectRowsImp(*dynamic_cast(&table), ids); - } else if (dynamic_cast(&table)) { - selectRowsImp(*dynamic_cast(&table), ids); - } else { - CHECK(table.isContiguous()); - selectRowsImp(*dynamic_cast(&table), ids); - } -} - -void CpuMatrix::selectElements(Matrix& table, IVector& ids) { - CHECK_EQ(table.getHeight(), ids.getSize()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), 1U); - real* tableData = table.getData(); - int* idsData = ids.getData(); - for (size_t i = 0; i < table.getHeight(); i++) { - data_[i] += tableData[i * table.getWidth() + idsData[i]]; - } -} - -void CpuMatrix::addElements(Matrix& table, IVector& ids) { - CHECK_EQ(table.getHeight(), ids.getSize()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), 1U); - real* tableData = table.getData(); - int* idsData = ids.getData(); - for (size_t i = 0; i < table.getHeight(); i++) { - tableData[i * table.getWidth() + idsData[i]] += data_[i]; - } -} - -// 
this.row[i] += table.row[ids[i]] -template -void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) { - CHECK(!table.useGpu()); - CHECK(!ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - for (size_t i = 0; i < numSamples; ++i) { - if (index[i] == -1) continue; - CHECK_LT(index[i], (int)tableSize); - CHECK_GE(index[i], 0); - vecAddTo(a + i * stride_, table.getRow(index[i]), dim); - } -} - -void CpuMatrix::addToRows(Matrix& table, IVector& ids) { - if (dynamic_cast(&table)) { - addToRowsImp(*dynamic_cast(&table), ids); - } else if (dynamic_cast(&table)) { - addToRowsImp(*dynamic_cast(&table), ids); - } else if (dynamic_cast(&table)) { - addToRowsImp(*dynamic_cast(&table), ids); - } else { - CHECK(table.isContiguous()); - addToRowsImp(*dynamic_cast(&table), ids); - } -} - -// table.row[ids[i]] += this.row[i] -template -void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) { - CHECK(!table.useGpu()); - CHECK(!ids.useGpu()); - CHECK_EQ(getHeight(), ids.getSize()); - CHECK_EQ(getWidth(), table.getWidth()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - real* a = getData(); - size_t tableSize = table.getHeight(); - int* index = ids.getData(); - - for (size_t i = 0; i < numSamples; ++i) { - if (index[i] == -1) continue; - CHECK_LT(index[i], (int)tableSize); - CHECK_GE(index[i], 0); - vecAddTo(table.getRow(index[i]), a + i * stride_, dim); - } -} - -static ThreadLocal> threadLocalColArray; - -template -void CpuMatrix::mul( - CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) { - CHECK(!c->isTransposed()) << "Not supported"; - CHECK(!b->isTransposed()) << "Not supported"; - // TODO(yuyang18): Maybe bug implementation here. - CHECK(scaleAB == 1) << "Not supported"; - CHECK(scaleT == 0 || scaleT == 1) << "Not supported"; - CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported"; - - real* B = b->getData(); - real* C = c->getData(); - size_t height = c->getHeight(); - size_t width = c->getWidth(); - int* cols = a->getCols(); - real* values = a->getValue(); - - if (scaleT == 0) { - c->zeroMem(); - } - - if (!a->isTransposed()) { - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getWidth(), width); - - if (a->getValueType() == NO_VALUE) { - if (width % 32 == 0) { // use libaddto - // @TODO(yuyang18) Make input addr can be unaligned. 
- // So merge this if and else - CHECK_EQ((size_t)B % 32, 0UL); - CHECK_EQ((size_t)C % 32, 0UL); - auto& colArray = *threadLocalColArray; - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - size_t colNum = end - start; - colArray.resize(colNum); - for (int j = 0; j < end - start; ++j) { - colArray[j] = b->getRow(cols[j + start]); - } - simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width); - } - - } else { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(i), b->getRow(cols[j]), width); - } - } - } - } else if (a->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width); - } - } - } - } else /*if (a->isTransposed())*/ { - size_t m = a->getHeight(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getWidth(), height); - CHECK_EQ(b->getWidth(), width); - if (a->getValueType() == NO_VALUE) { - if (width % 32 == 0) { // use libaddto - // @TODO(yuyang18) Make input addr can be unaligned. - // So merge this if and else - CHECK_EQ((size_t)B % 32, 0UL); - CHECK_EQ((size_t)C % 32, 0UL); - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - simd::addTo(c->getRow(cols[j]), b->getRow(i), width); - } - } - - } else { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(cols[j]), b->getRow(i), width); - } - } - } - } else if (a->getValueType() == FLOAT_VALUE) { - for (size_t i = 0; i < a->getHeight(); ++i) { - const int start = a->getRowStartIdx(i); - const int end = a->getRowStartIdx(i + 1); - for (int j = start; j < end; ++j) { - vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width); - } - } - } - } -} - -// instantiation mul() called in SparseRowMatrix.cpp -template void CpuMatrix::mul( - CpuSparseMatrix* a, - CpuMatrix* b, - SparseRowCpuMatrix* c, - real scaleAB, - real scaleT); -template void CpuMatrix::mul( - CpuSparseMatrix* a, - CpuMatrix* b, - SparseAutoGrowRowCpuMatrix* c, - real scaleAB, - real scaleT); -template void CpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - CacheRowCpuMatrix* c, - real scaleAB, - real scaleT); - -#ifndef PADDLE_MOBILE_INFERENCE -void SharedCpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - CHECK(!isTransposed()) << "Not supported"; - CHECK(!b->isTransposed()) << "Not supported"; - CHECK_EQ(scaleAB, 1) << "Not supported"; - CHECK_EQ(scaleT, 1) << "Not supported"; - CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported"; - - real* B = b->getData(); - real* C = getData(); - size_t height = getHeight(); - size_t width = getWidth(); - - // get real trans - MatrixPtr aTrans; - if (a->isTransposed()) { - aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight()); - a->transpose(aTrans, false); - } - a = dynamic_cast(aTrans.get()); - - size_t m = a->getWidth(); - CHECK_EQ(b->getHeight(), m); - CHECK_EQ(a->getHeight(), height); - CHECK_EQ(b->getWidth(), width); - - size_t blockSize = (height / blockNum_) + 1; - CpuMatrixPtr localBuf = 
*localBuf_; - if (!localBuf) { - localBuf = std::make_shared(blockSize, width); - } else { - localBuf->resize(blockSize, width); - } - localBuf->zeroMem(); - real* localC = localBuf->getData(); - std::vector& blockSeq = *blockSeq_; - if (blockSeq.size() == 0) { - for (int k = 0; k < blockNum_; ++k) { - blockSeq.push_back(k); - } - std::shuffle( - blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get()); - } - std::vector& localBufRows = *localBufRows_; - int* cols = a->getCols(); - real* value = a->getValue(); - - for (int k = 0; k < blockNum_; ++k) { - int blockId = blockSeq[k]; - size_t blockBegin = blockId * blockSize; - size_t blockEnd = (blockId + 1) * blockSize; - if (blockId == blockNum_ - 1) { - blockEnd = height; - } - if (a->getValueType() == NO_VALUE) { - for (size_t i = blockBegin; i < blockEnd; ++i) { - int start = a->getRowStartIdx(i); - int end = a->getRowStartIdx(i); - size_t colNum = a->getColNum(i); - if (colNum == 0) { - continue; - } // skip empty row - localBufRows.push_back(i); - size_t bufPos = localBufRows.size() - 1; - for (int j = start; j < end; ++j) { - vecAddTo(localC + bufPos * width, B + cols[j] * width, width); - } - } - } else if (a->getValueType() == FLOAT_VALUE) { - for (size_t i = blockBegin; i < blockEnd; ++i) { - int start = a->getRowStartIdx(i); - int end = a->getRowStartIdx(i); - size_t colNum = a->getColNum(i); - if (colNum == 0) { - continue; - } // skip empty row - localBufRows.push_back(i); - size_t bufPos = localBufRows.size() - 1; - for (int j = start; j < end; ++j) { - vecAddTo( - localC + bufPos * width, B + cols[j] * width, value[j], width); - } - } - } - - { - std::lock_guard guard(*blockLocks_[blockId]); - for (size_t i = 0; i < localBufRows.size(); ++i) { - vecAddTo(C + localBufRows[i] * width, localC + i * width, width); - } - } - memset(localC, 0, localBufRows.size() * width * sizeof(real)); - localBufRows.clear(); - } - - VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0] - << " C[1]=" << C[1]; -} - -void SharedCpuMatrix::add(Matrix& b, real p1, real p2) { - CHECK_EQ(blockNum_, 1); - std::lock_guard guard(*blockLocks_[0]); - CpuMatrix::add(b, p1, p2); -} - -void SharedCpuMatrix::add(real p1, real p2) { - CHECK_EQ(blockNum_, 1); - std::lock_guard guard(*blockLocks_[0]); - CpuMatrix::add(p1, p2); -} - -void SharedCpuMatrix::initShared(int blockNum) { - CHECK_GT(height_ * width_, 1UL * 1024 * 1024) - << "should not share small matrix"; - initBlock(blockNum); -} - -void SharedCpuMatrix::initBlock(int blockNum) { - CHECK_LE(blockNum, 200) << "should not use large block number"; - blockNum_ = blockNum; - blockLocks_.resize(blockNum); - for (auto& locker : blockLocks_) { - locker.reset(new std::mutex); - } -} - -#endif -/* Add a (column) vector b to matrix a, column by column */ -void CpuMatrix::addColumnVector(const Matrix& b) { - BaseMatrix::addColVector(const_cast(b)); -} - -/* this = a*b */ -void CpuMatrix::mul(const Matrix& a, const Matrix& b) { - return mul(a, b, 1.0, 0.0); -} - -/* this = scaleAB*(this*b) + scaleT*this */ -void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) { - (void)b; - (void)scaleAB; - (void)scaleT; - LOG(FATAL) << "Not implemented"; -} - -/* this = this* b */ -void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); } - -/* this = scaleAB*(a*this) + scaleT*this */ -void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { - (void)a; - (void)scaleAB; - (void)scaleT; - LOG(FATAL) << "Not implemented"; -} - -/* this = a*this) */ -void CpuMatrix::leftMul(Matrix& a) 
{ return leftMul(a, 1.0, 0.0); } - -void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); } - -void CpuMatrix::rowSum(Matrix& sum) { - CHECK_EQ(sum.getHeight(), getHeight()); - CHECK_EQ(sum.getWidth(), (size_t)1); - - sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); -} - -void CpuMatrix::rowMaxId(IVector& maxIds) { - CHECK(!maxIds.useGpu()) << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - CHECK_EQ(maxIds.getSize(), numSamples); - - real* a = getData(); - int* s = maxIds.getData(); - size_t dim = getWidth(); - - for (size_t i = 0; i < numSamples; i++) { - real sm = a[i * dim]; - int maxId = 0; - for (size_t j = 1; j < dim; j++) { - if (a[i * dim + j] > sm) { - maxId = j; - sm = a[i * dim + j]; - } - } - s[i] = maxId; - } -} - -void CpuMatrix::rowMax(Matrix& max) { - CHECK_EQ(max.getHeight(), getHeight()); - CHECK_EQ(max.getWidth(), (size_t)1); - max.maxRows(*this); -} - -/* Get the top k elements of each row of this matrix */ -void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { - CHECK(isContiguous()); - CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - CHECK_EQ(maxVal.getWidth(), beam); - - real* a = getData(); - int* s = maxIds.getData(); - real* t = maxVal.getData(); - size_t dim = getWidth(); - for (size_t i = 0; i < numSamples; i++) { - std::vector> vec; - for (size_t j = 0; j < dim; j++) { - vec.push_back(std::pair(a[i * dim + j], j)); - } - - std::partial_sort( - vec.begin(), - vec.begin() + beam, - vec.end(), - [](const std::pair& l, const std::pair& r) { - return l.first > r.first; - }); - for (size_t j = 0; j < beam; j++) { - t[i * beam + j] = vec[j].first; - s[i * beam + j] = vec[j].second; - } - } -} - -void CpuMatrix::colMax(Matrix& max) { - CHECK_EQ(max.getWidth(), getWidth()); - CHECK_EQ(max.getHeight(), (size_t)1); - max.maxCols(*this); -} - -void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { - CHECK(isContiguous()); - CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getWidth(); - size_t beam = maxVal.getHeight(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getWidth(), numSamples); - - real* a = getData(); - int* s = maxIds.getData(); - real* t = maxVal.getData(); - size_t dim = getHeight(); - for (size_t i = 0; i < numSamples; i++) { - std::vector> vec; - for (size_t j = 0; j < dim; j++) { - vec.push_back(std::pair(a[i + j * numSamples], j)); - } - - std::partial_sort( - vec.begin(), - vec.begin() + beam, - vec.end(), - [](const std::pair& l, const std::pair& r) { - return l.first > r.first; - }); - for (size_t j = 0; j < beam; j++) { - t[i + j * numSamples] = vec[j].first; - s[i + j * numSamples] = vec[j].second; - } - } -} - -void CpuMatrix::maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast(&a)); - CHECK(dynamic_cast(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = getWidth(); - size_t batchSize = getHeight(); - size_t featLen = size / channels; - const real* input = a.getData(); - int* idForCpu = id.getData(); - - MatrixPtr maxInMat, maxOutMat; - Matrix::resizeOrCreate(maxInMat, groups, size, false, false); - Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); - - for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { - size_t newIndex = batch_idx * size; - IVectorPtr tmpId = 
IVector::create(idForCpu + newIndex, size, false); - - for (size_t i = 0; i < channels; ++i) { - size_t newFeatLen = i * featLen; - for (size_t j = 0; j < groups; ++j) { - maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) - ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, - featLen); - } - } - maxInMat->colMax(*tmpId, *maxOutMat); - this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); - } -} - -void CpuMatrix::maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - CHECK(dynamic_cast(&a)); - CHECK(dynamic_cast(&id)); - CHECK_EQ(a.getHeight(), getHeight()); - - size_t size = a.getWidth(); - size_t batchSize = getHeight(); - size_t featLen = size / channels; - size_t newFeatLen = groups * featLen; - real* inputG = getData(); - const real* outG = a.getData(); - int* idForCpu = id.getData(); - - for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { - size_t newIndex = batch_idx * size; - int* idData = idForCpu + newIndex; - - for (size_t i = 0; i < size; ++i) { - int gradIdx = - idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; - (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; - } - } -} - -void CpuMatrix::rowNormalizeL1(Matrix& out) { - CHECK(!out.useGpu()); - - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(out.getHeight(), numSamples); - CHECK_EQ(out.getWidth(), dim); - real* a = getData(); - real* b = out.getData(); - for (size_t i = 0; i < numSamples; ++i) { - real s = 0; - for (size_t j = 0; j < dim; ++j) { - s += a[i * dim + j]; - } - // Right now, we just bet that sum won't be zero. If this really happens, - // we will figure out what should be done then. - CHECK_GT(s, 0); - s = 1 / s; - for (size_t j = 0; j < dim; ++j) { - b[i * dim + j] = s * a[i * dim + j]; - } - } -} - -/* calulate classification error */ -void CpuMatrix::classificationError(Matrix& output, - IVector& label, - size_t topkSize) { - size_t numSamples = this->getHeight(); - auto cpuOutput = dynamic_cast(&output); - auto cpuLabel = dynamic_cast(&label); - IVectorPtr cpuTopIds = std::make_shared(numSamples * topkSize); - MatrixPtr cpuTopVal = std::make_shared(numSamples, topkSize); - - CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer"; - CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed"; - CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal"; - CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1) - << "Matrix dimensions are not equal"; - - // top k matrix classification - cpuOutput->rowMax(*cpuTopIds, *cpuTopVal); - - size_t dim = cpuOutput->getWidth(); - real* result = this->getData(); - int* ids = cpuTopIds->getData(); - int* lbl = cpuLabel->getData(); - for (size_t i = 0; i < numSamples; ++i) { - CHECK_GE(lbl[i], 0); - CHECK_LT((size_t)lbl[i], dim); - - for (size_t j = 0; j < topkSize; ++j) { - if (ids[j + i * topkSize] == lbl[i]) { - result[i] = 0; - break; - } - result[i] = 1.0f; - } - } -} - -/* copy -log(output[label]) to this->data[i] */ -void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(getWidth(), (size_t)1); - - real* out = output.getData(); - real* cost = getData(); - int* lbl = label.getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim) { - CHECK_GE(lbl[i], 0); - 
CHECK_LT((size_t)lbl[i], dim); - cost[i] = -std::log(out[lbl[i]]); - } -} - -/* calculate the error of outputV according to label */ -void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getWidth(), dim); - real* out = output.getData(); - real* grad = getData(); - int* lbl = label.getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { - grad[lbl[i]] -= 1 / out[lbl[i]]; - } -} - -/* - We implement the matrix functionality in CostLayer.cpp, - but we define the scalar function here for sanity check - deletion of the function does not affect anything neverthelss -*/ -void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(getWidth(), (size_t)1); - - real* out = output.getData(); - real* cost = getData(); - int* lbl = label.getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim) { - CHECK_GE(lbl[i], 0); - CHECK_LT((size_t)lbl[i], dim); - real sum = 0; - for (size_t j = 0; j < dim; ++j) { - sum += out[j]; - } - sum = _safelog(sum); - cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum); - } -} - -/* - We implement the matrix functionality in CostLayer.cpp, - but we define the scalar function here for sanity check - deletion of the function does not affect anything neverthelss -*/ -void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, - IVector& label, - real alpha) { - CHECK(dynamic_cast(&output)); - CHECK(dynamic_cast(&label)); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getWidth(), dim); - real* out = output.getData(); - real* grad = getData(); - int* lbl = label.getData(); - - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { - grad[lbl[i]] -= 1 / out[lbl[i]]; - real sum = 0; - for (size_t j = 0; j < dim; ++j) { - sum += out[j]; - } - for (size_t j = 0; j < dim; ++j) { - if (j == (size_t)lbl[i]) { - grad[j] += -1 / out[j]; - } - grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum; - } - } -} - -#define FORWARD_LOOP() \ - size_t numSamples = getHeight(); \ - size_t dim = getWidth(); \ - CHECK_EQ(output.getHeight(), numSamples); \ - CHECK_EQ(output.getWidth(), dim); \ - const real* in = getData(); \ - real* out = output.getData(); \ - for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim) - -#define BACKWARD_LOOP() \ - size_t numSamples = getHeight(); \ - size_t dim = getWidth(); \ - CHECK_EQ(output.getHeight(), numSamples); \ - CHECK_EQ(output.getWidth(), dim); \ - real* grad = getData(); \ - real* out = output.getData(); \ - for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim) - -void CpuMatrix::softmax(Matrix& output) { - CHECK(!output.useGpu()); - - const float THRESHOLD = -64.0; - - FORWARD_LOOP() { - real max = -1.0e20; - for (size_t j = 0; j < dim; ++j) { - if (in[j] > max) { - max = in[j]; - } - } - for (size_t j = 0; j < dim; ++j) { - real a = in[j] - max; - if (a < THRESHOLD) { - a = THRESHOLD; - } - out[j] = a; - } - vExp(dim, out, out); - - real sum = 0; - for (size_t j = 0; j < dim; ++j) { - sum += out[j]; - } - sum = 1 / sum; - for (size_t j = 0; j < dim; ++j) { - out[j] *= sum; - } - } -} - -void CpuMatrix::sequenceSoftmax(Matrix& 
output, const IVector& index) { - CHECK_EQ(getWidth(), 1UL); - CHECK_EQ(output.getWidth(), 1UL); - CHECK(isContiguous()); - - MatrixPtr inTmp = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - false); - MatrixPtr outTmp = Matrix::create(nullptr, - /* height= */ 1, - 1, - /* trans= */ false, - false); - size_t numSequences = index.getSize() - 1; - auto starts = index.getData(); - for (size_t i = 0; i < numSequences; ++i) { - size_t offset = starts[i]; - size_t size = starts[i + 1] - starts[i]; - inTmp->setData(getData() + offset, 1UL, size); - outTmp->setData(output.getData() + offset, 1UL, size); - inTmp->softmax(*outTmp); - } -} - -void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - CHECK(output.useGpu_ == false) << "Matrix type are not equal"; - CHECK_EQ(getHeight(), sftmaxSum.getHeight()); - - real* sums = sftmaxSum.getData(); - - BACKWARD_LOOP() { - real sum = sums[i]; - for (size_t j = 0; j < dim; ++j) { - grad[j] = out[j] * (grad[j] - sum); - } - } -} - -void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(label.getWidth(), dim); - CHECK_EQ(getWidth(), (size_t)1); - real* out = output.getData(); - real* cost = getData(); - - auto labelptr = dynamic_cast(&label); - if (labelptr) { - // it is a CpuSparseMatrix - if (labelptr->getFormat() == SPARSE_CSR) { - // treat label as a SparseMatrix - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - cost[i] += _square(out[i * dim + j]); - } - } - if (labelptr->getValueType() == NO_VALUE) { - int* cols = labelptr->getCols(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]]; - /* - * explanation of above line: original codes are follows: - * cost[i] -= _square(out[i * dim + feature.col]); - * cost[i] += _square(1.0 - out[i * dim + feature.col]); - */ - } - } - } else if (labelptr->getValueType() == FLOAT_VALUE) { - int* cols = labelptr->getCols(); - real* values = labelptr->getValue(); - for (size_t i = 0; i < numSamples; ++i) { - real sum1 = 0; - real sum2 = 0; - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - sum1 += values[j] * values[j]; - sum2 += values[j] * out[i * dim + cols[j]]; - /* - * explanation of above line: original codes are follows: - * cost[i] -= _square(out[i * dim + feature.col]); - * cost[i] += _square(value.col - out[i * dim + feature.col]); - */ - } - cost[i] += sum1 - 2.0 * sum2; - } - } else { - LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; - return; - } - return; - } else { - LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; - return; - } - } - - BaseMatrix::sumOfSquaredDiffs(output, - label, - /* scaleSum= */ 1, - /* scaleDest= */ 1); -} - -/* calculate the error of outputV according to label */ -void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getWidth(), dim); - CHECK_EQ(label.getWidth(), dim); - - real* out = output.getData(); - real* grad = getData(); - - auto labelptr = 
dynamic_cast(&label); - if (labelptr) { - // it is a CpuSparseMatrix - if (labelptr->getFormat() == SPARSE_CSR) { - // treat label as a SparseMatrix - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = 0; j < dim; ++j) { - grad[i * dim + j] += 2.0 * out[i * dim + j]; - } - } - if (labelptr->getValueType() == NO_VALUE) { - int* cols = labelptr->getCols(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - grad[i * dim + cols[j]] -= 2.0; - /* - * explanation of above line: original codes are follows: - * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; - * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] - * - 1); - */ - } - } - } else if (labelptr->getValueType() == FLOAT_VALUE) { - int* cols = labelptr->getCols(); - real* values = labelptr->getValue(); - for (size_t i = 0; i < numSamples; ++i) { - for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); - ++j) { - grad[i * dim + cols[j]] -= 2.0 * values[j]; - /* - * explanation of above line: original codes are follows: - * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col]; - * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col] - * - value.col); - */ - } - } - } else { - LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares"; - return; - } - return; - } else { - LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares"; - return; - } - } - - real* lbl = label.getData(); - size_t ld = getStride(); - size_t outLd = output.getStride(); - size_t lblLd = label.getStride(); - CHECK(lbl); - for (size_t i = 0; i < numSamples; - ++i, out += outLd, lbl += lblLd, grad += ld) { - for (size_t j = 0; j < dim; ++j) { - grad[j] += 2.0 * (out[j] - lbl[j]); // positive gradient; - } - } -} - -void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(label.getWidth(), dim); - CHECK_EQ(getWidth(), (size_t)1); - - real* cost = getData(); - real* out = output.getData(); - real* lbl = label.getData(); - - for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) { - for (size_t j = 0; j < dim; ++j) { - real absVal = std::fabs(out[j] - lbl[j]); - cost[i] *= destScale; - if (absVal < 1.0) - cost[i] += 0.5 * absVal * absVal; - else - cost[i] += absVal - 0.5; - } - } -} - -void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) { - CHECK(output.useGpu_ == false && label.useGpu_ == false) - << "Matrix type are not equal"; - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(label.getHeight(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(label.getWidth(), dim); - CHECK_EQ(getWidth(), dim); - - real* out = output.getData(); - real* lbl = label.getData(); - real* grad = getData(); - - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) { - for (size_t j = 0; j < dim; ++j) { - real val = out[j] - lbl[j]; - grad[j] *= destScale; - if (std::fabs(val) < 1) { - grad[j] += val; - } else { - grad[j] += (real(0) < val) - (val < real(0)); - } - } - } -} - -void CpuMatrix::tanh(Matrix& output) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - size_t numSamples = getHeight(); - size_t dim = 
getWidth(); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), dim); - vTanh(numSamples * dim, getData(), output.getData()); -} - -void CpuMatrix::tanhDerivative(Matrix& output) { - BaseMatrix::tanhDerivative(output); -} - -void CpuMatrix::softrelu(Matrix& output) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - const real THRESHOLD = 40.0; - FORWARD_LOOP() { // TODO(yuyang18): SIMD it? - for (size_t j = 0; j < dim; ++j) { - real x = in[j]; - if (x > THRESHOLD) { - x = THRESHOLD; - } else if (x < -THRESHOLD) { - x = -THRESHOLD; - } - out[j] = x; - } - } - vExp(numSamples * dim, output.getData(), output.getData()); - vLog1p(numSamples * dim, output.getData(), output.getData()); -} - -void CpuMatrix::softreluDerivative(Matrix& output) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - size_t size = numSamples * dim; - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), dim); - real* grad = getData(); - MatrixPtr tmpMat = Matrix::create(numSamples, dim); - real* tmp = tmpMat->getData(); - - vExp(size, output.getData(), tmpMat->getData()); - - for (size_t i = 0; i < size; ++i) { - grad[i] *= (1.0 - 1.0 / tmp[i]); - } -} - -void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { - CHECK(isContiguous()); - CHECK(output.isContiguous()); - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(output.getHeight(), numSamples); - CHECK_EQ(output.getWidth(), dim); - - const real* in = getData(); - real* out = output.getData(); - - // out = p2*in - for (size_t i = 0; i < numSamples * dim; ++i) { - out[i] = p2 * in[i]; - } - - vTanh(numSamples * dim, out, out); - - // out = p1 * out - for (size_t i = 0; i < numSamples * dim; ++i) { - out[i] = p1 * out[i]; - } -} - -/* uniform randomization, minimize precision = 1e-5 */ -void CpuMatrix::randomizeUniform() { - CHECK(isContiguous()); - real* data = getData(); - unsigned int* randSeed = ThreadLocalRand::getSeed(); - real recipRandMax = 1.0f / (real)RAND_MAX; - for (size_t i = 0; i < elementCnt_; ++i) { - *data++ = rand_r(randSeed) * recipRandMax; - } -} - -void CpuMatrix::print(std::ostream& os) const { - CHECK(isContiguous()); - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - os << data_[i * width_ + j] << " "; - } - os << std::endl; - } -} - -void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { - real* input = data.getData(); - real* w = W.getData(); - real* output = data_; - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - - size_t partial_sum = numElements / paraSize; - if (paraSize == numElements) { - for (size_t n = 0; n < numSamples * numElements; ++n) { - output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; - } - return; - } - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - for (size_t n = 0; n < numSamples; ++n) { - for (size_t i = 0; i < paraSize; i++) { - neon::prelu( - input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); - } - input = input + numElements; - output = output + numElements; - } -#else - for (size_t n = 0, k = 0; n < numSamples; ++n) { - for (size_t i = 0; i < numElements; ++i, ++k) { - output[k] = input[k] > 0 ? 
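A minimal standalone sketch of the softrelu activation implemented above (clip the input, exponentiate, then log1p), assuming plain float arrays; the name softreluSketch is the editor's, not the original API.

#include <algorithm>
#include <cmath>
#include <cstddef>

// softrelu(x) = log(1 + exp(clip(x, -40, 40))), matching the THRESHOLD
// clipping used in the deleted CpuMatrix::softrelu above.
void softreluSketch(const float* in, float* out, std::size_t n) {
  const float kThreshold = 40.0f;
  for (std::size_t i = 0; i < n; ++i) {
    float x = std::min(std::max(in[i], -kThreshold), kThreshold);
    out[i] = std::log1p(std::exp(x));
  }
}

The corresponding softreluDerivative above scales the incoming gradient by 1 - exp(-softrelu(x)), which is what the 1.0 - 1.0 / tmp[i] factor computes.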
input[k] : input[k] * w[i / partial_sum]; - } - } -#endif -} - -void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { - real* ograd = oGrad.getData(); - real* input = data.getData(); - real* wgrad = data_; - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = this->getHeight() * this->getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - for (size_t n = 0, k = 0; n < numSamples; ++n) { - for (size_t i = 0; i < numElements; ++i, ++k) { - wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]); - } - } -} - -void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - real* diff = data_; - real* input = data.getData(); - real* ograd = oGrad.getData(); - real* w = W.getData(); - size_t numElements = data.getWidth(); - size_t numSamples = data.getHeight(); - size_t paraSize = W.getHeight() * W.getWidth(); - CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init - size_t partial_sum = numElements / paraSize; - for (size_t n = 0, k = 0; n < numSamples; ++n) { - for (size_t i = 0; i < numElements; ++i, ++k) { - diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]); - } - } -} - -void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const { - CHECK(isContiguous()); - size_t h = height_ < height ? height_ : height; - size_t w = width_ < width ? width_ : width; - os.setf(std::ostream::scientific); - os << "["; - for (size_t i = 0; i < h; ++i) { - for (size_t j = 0; j < w; ++j) { - os << data_[i * width_ + j] << " "; - } - if (i == h - 1) { - os << "]"; - } - os << std::endl; - } -} - -void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const { - CHECK_LT(idx, height_); - size_t offset = idx * stride_; - os << data_[offset]; - for (size_t i = 1; i < width_; ++i) { - os << " " << data_[offset + i]; - } - os << ";"; -} - -void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { - CHECK(isContiguous()); - CHECK(height_ == refMat.getHeight()); - CHECK(width_ == refMat.getWidth()); - CpuMatrix cpuRef(height_, width_); - cpuRef.copyFrom(refMat); - size_t diffCnt = 0; - for (size_t i = 0; i < height_; ++i) { - for (size_t j = 0; j < width_; ++j) { - real a = getElement(i, j); - real b = cpuRef.getElement(i, j); - if (fabs(a - b) > 0.00001) { - ++diffCnt; - if (printDiff) { - os << "ref= " << a << " check= " << b << std::endl; - } - } - } - } - LOG(INFO) << "the diffCnt is " << diffCnt; -} - -real CpuMatrix::getMin() { - size_t size = getHeight() * getWidth(); - real* data = getData(); - real res = data[0]; - for (size_t i = 1; i < size; ++i) { - if (res > data[i]) { - res = data[i]; - } - } - return res; -} - -real CpuMatrix::getMax() { - size_t size = getHeight() * getWidth(); - real* data = getData(); - real res = data[0]; - for (size_t i = 1; i < size; ++i) { - if (res < data[i]) { - res = data[i]; - } - } - return res; -} - -void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) { - size_t height = this->getHeight(); - size_t width0 = this->getWidth(); - size_t width1 = in1.getWidth(); - - CHECK_EQ(height, in0.getHeight()); - CHECK_EQ(width0, in0.getWidth()); - CHECK_EQ(height, in1.getHeight()); - - CHECK_EQ(width1 % 2, 1U); - - real* outV = this->getData(); - real* inV0 = in0.getData(); - real* inV1 = in1.getData(); - - int leftCtxLen = (width1 - 1) / 2; - for (size_t x = 0; x < height; - ++x, outV += width0, inV0 += width0, inV1 += width1) { - for (size_t i = 
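A compact standalone sketch of the channel-shared PReLU forward pass above: every group of partialSum = numElements / paraSize consecutive features shares one learned slope w. Names and signature are the editor's, not the original API.

#include <cstddef>

// out = in for positive inputs, otherwise in * w[group], where the group is
// determined by the feature index i and the shared-slope group size.
void preluForwardSketch(const float* in, const float* w, float* out,
                        std::size_t numSamples, std::size_t numElements,
                        std::size_t paraSize) {
  std::size_t partialSum = numElements / paraSize;
  for (std::size_t n = 0, k = 0; n < numSamples; ++n) {
    for (std::size_t i = 0; i < numElements; ++i, ++k) {
      out[k] = in[k] > 0 ? in[k] : in[k] * w[i / partialSum];
    }
  }
}

paramReluBackwardW and paramReluBackwardDiff above use the same indexing, accumulating into w[i / partialSum] for the slope gradient and scaling negative-input positions by w[i / partialSum] for the input gradient.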
0; i < width0; ++i) { // each dimension of output - for (size_t j = 0; j < width1; ++j) { - // iterate over all dimentions of inV1 - int index = i + j - leftCtxLen; - index = (index + width0) % width0; - outV[i] += inV0[index] * inV1[j]; - } - } - } -} - -void CpuMatrix::circularConvDerivative( - Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) { - size_t height = in0.getHeight(); - size_t width0 = in0.getWidth(); - size_t width1 = in1.getWidth(); - - CHECK_EQ(height, in1.getHeight()); - CHECK_EQ(height, inG0.getHeight()); - CHECK_EQ(width0, inG0.getWidth()); - CHECK_EQ(height, inG1.getHeight()); - CHECK_EQ(width1, inG1.getWidth()); - CHECK_EQ(height, outG.getHeight()); - CHECK_EQ(width0, outG.getWidth()); - - real* outGV = outG.getData(); - real* inV0 = in0.getData(); - real* inV1 = in1.getData(); - real* inGV0 = inG0.getData(); - real* inGV1 = inG1.getData(); - - int leftCtxLen = (width1 - 1) / 2; - for (size_t x = 0; x < height; ++x, - outGV += width0, - inV0 += width0, - inV1 += width1, - inGV0 += width0, - inGV1 += width1) { - for (size_t j = 0; j < width1; ++j) { // iterate over width1 - for (size_t i = 0; i < width0; ++i) { - // such over all dimensions of outG - int index = i + j - leftCtxLen; - index = (index + width0) % width0; - inGV0[index] += outGV[i] * inV1[j]; - inGV1[j] += outGV[i] * inV0[index]; - } - } - } -} - -void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - CHECK(dynamic_cast(&output)); - auto labelPtr = dynamic_cast(&label); - CHECK(labelPtr); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(numSamples, output.getHeight()); - CHECK_EQ(numSamples, labelPtr->getHeight()); - CHECK_EQ(dim, labelPtr->getWidth()); - - real* out = output.getData(); - real* cost = getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim) { - for (size_t j = 0; j < dim; ++j) { - CHECK(out[j] > 0 && out[j] < 1.0); - cost[i] -= std::log(1 - out[j]); - } - - const int* cols = labelPtr->getRowCols(i); - for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]])); - } - } -} - -void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { - CHECK(dynamic_cast(&output)); - auto labelPtr = dynamic_cast(&label); - CHECK(labelPtr); - - size_t numSamples = getHeight(); - size_t dim = getWidth(); - CHECK_EQ(numSamples, output.getHeight()); - CHECK_EQ(numSamples, labelPtr->getHeight()); - CHECK_EQ(dim, output.getWidth()); - CHECK_EQ(dim, labelPtr->getWidth()); - - real* out = output.getData(); - real* grad = getData(); - for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) { - for (size_t j = 0; j < dim; ++j) { - CHECK(out[j] > 0 && out[j] < 1.0); - grad[j] += 1.0 / (1 - out[j]); - } - - const int* cols = labelPtr->getRowCols(i); - for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]])); - } - } -} - -/* calculate the classification error for multi binary label */ -void CpuMatrix::classificationErrorMulti(Matrix& output, - Matrix& label, - real threshold) { - CHECK(dynamic_cast(&output)); - auto labelPtr = dynamic_cast(&label); - CHECK(labelPtr); - - size_t numSamples = getHeight(); - size_t dim = output.getWidth(); - CHECK_EQ(numSamples, output.getHeight()); - CHECK_EQ(numSamples, labelPtr->getHeight()); - CHECK_EQ(dim, labelPtr->getWidth()); - - real* out = output.getData(); - real* result = getData(); - for 
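A standalone sketch of the circular (cyclic) convolution computed above, out[i] = sum_j in0[(i + j - (width1 - 1) / 2) mod width0] * in1[j] with width1 odd; this is the editor's illustration, not the original Matrix API.

#include <cstddef>
#include <vector>

// Convolve in0 (length width0) with the odd-length kernel in1 (length width1),
// wrapping indices around the ends of in0.
std::vector<float> circularConvSketch(const std::vector<float>& in0,
                                      const std::vector<float>& in1) {
  std::size_t width0 = in0.size();
  std::size_t width1 = in1.size();  // assumed odd, as checked in the code above
  int leftCtxLen = static_cast<int>(width1 - 1) / 2;
  std::vector<float> out(width0, 0.0f);
  for (std::size_t i = 0; i < width0; ++i) {
    for (std::size_t j = 0; j < width1; ++j) {
      int index = static_cast<int>(i + j) - leftCtxLen;
      index = (index + static_cast<int>(width0)) % static_cast<int>(width0);
      out[i] += in0[index] * in1[j];
    }
  }
  return out;
}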
(size_t i = 0; i < numSamples; ++i, out += dim) { - real sum = 0.0; - for (size_t j = 0; j < dim; ++j) { - if (out[j] >= threshold) { - sum += 1.0; - } - } - - const int* cols = labelPtr->getRowCols(i); - for (size_t j = 0; j < labelPtr->getColNum(i); ++j) { - CHECK_LT(size_t(cols[j]), dim); - if (out[cols[j]] < threshold) { - sum += 1.0; - } else { - sum -= 1.0; - } - } - result[i] = sum / dim; - } -} - -void CpuMatrix::bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast(&in)); - - size_t outputW = getWidth(); - size_t batchSize = getHeight(); - size_t inputW = in.getWidth(); - size_t inputH = in.getHeight(); - size_t inPosOffset = inImgH * inImgW; - size_t outPosOffset = outImgH * outImgW; - (void)(inputH); - - real* outData = getData(); - const real* inData = in.getData(); - - if (inImgH == outImgH && inImgW == outImgW) { - this->copyFrom(in); - } else { - for (size_t k = 0; k < batchSize; ++k) { // loop for batches - for (size_t i = 0; i < outImgH; ++i) { // loop for images - size_t h = ratioH * i; - size_t hid = (h < inImgH - 1) ? 1 : 0; - real h1lambda = ratioH * i - h; - real h2lambda = 1 - h1lambda; - - for (size_t j = 0; j < outImgW; ++j) { - size_t w = ratioW * j; - size_t wid = (w < inImgW - 1) ? 1 : 0; - real w1lambda = ratioW * j - w; - real w2lambda = 1 - w1lambda; - // calculate four position for bilinear interpolation - const real* inPos = &inData[k * inputW + h * inImgW + w]; - real* outPos = &outData[k * outputW + i * outImgW + j]; - for (size_t c = 0; c < numChannels; ++c) { // loop for channels - // bilinear interpolation - outPos[0] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + - h1lambda * (w2lambda * inPos[hid * inImgW] + - w1lambda * inPos[hid * inImgW + wid]); - inPos += inPosOffset; - outPos += outPosOffset; - } - } - } - } - } -} - -void CpuMatrix::bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - CHECK(dynamic_cast(&out)); - - size_t inputW = getWidth(); - size_t inputH = getHeight(); - size_t outputW = out.getWidth(); - size_t batchSize = out.getHeight(); - size_t inPosOffset = inImgH * inImgW; - size_t outPosOffset = outImgH * outImgW; - (void)(inputH); - - real* inGrad = getData(); - const real* outGrad = out.getData(); - - if (inImgH == outImgH && inImgW == outImgW) { - this->add(const_cast(out)); - } else { - for (size_t k = 0; k < batchSize; ++k) { // loop for batches - for (size_t i = 0; i < outImgH; ++i) { // loop for images - size_t h = ratioH * i; - size_t hid = (h < inImgH - 1) ? 1 : 0; - real h1lambda = ratioH * i - h; - real h2lambda = 1 - h1lambda; - for (size_t j = 0; j < outImgW; ++j) { - size_t w = ratioW * j; - size_t wid = (w < inImgW - 1) ? 
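A single-image, single-channel sketch of the bilinear upsampling loop above; the helper name and raw-pointer signature are the editor's assumptions, and the derivation of ratioH/ratioW from the image sizes is left to the caller, as in the original.

#include <cstddef>

// For each output pixel (i, j), blend the four surrounding input pixels with
// weights (h1, h2) vertically and (w1, w2) horizontally.
void bilinearForwardSketch(const float* in, std::size_t inH, std::size_t inW,
                           float* out, std::size_t outH, std::size_t outW,
                           float ratioH, float ratioW) {
  for (std::size_t i = 0; i < outH; ++i) {
    std::size_t h = static_cast<std::size_t>(ratioH * i);
    std::size_t hid = (h < inH - 1) ? 1 : 0;    // step to next row, if any
    float h1 = ratioH * i - h, h2 = 1 - h1;     // vertical weights
    for (std::size_t j = 0; j < outW; ++j) {
      std::size_t w = static_cast<std::size_t>(ratioW * j);
      std::size_t wid = (w < inW - 1) ? 1 : 0;  // step to next column, if any
      float w1 = ratioW * j - w, w2 = 1 - w1;   // horizontal weights
      const float* p = &in[h * inW + w];
      out[i * outW + j] = h2 * (w2 * p[0] + w1 * p[wid]) +
                          h1 * (w2 * p[hid * inW] + w1 * p[hid * inW + wid]);
    }
  }
}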
1 : 0; - real w1lambda = ratioW * j - w; - real w2lambda = 1 - w1lambda; - - real* inPos = &inGrad[k * inputW + h * inImgW + w]; - const real* outPos = &outGrad[k * outputW + i * outImgW + j]; - for (size_t c = 0; c < numChannels; ++c) { // loop for channels - inPos[0] += h2lambda * w2lambda * outPos[0]; - inPos[wid] += h2lambda * w1lambda * outPos[0]; - inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0]; - inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0]; - inPos += inPosOffset; - outPos += outPosOffset; - } - } - } - } - } -} - -void CpuMatrix::vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW) { - real* outData = getData(); - int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; - int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; - int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; - - int channelsCol = channels * filterD * filterH * filterW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterW; - int hOffset = (c / filterW) % filterH; - int dOffset = (c / filterW / filterH) % filterD; - int cIn = c / filterW / filterH / filterD; - for (int d = 0; d < outDepth; ++d) { - for (int h = 0; h < outHeight; ++h) { - for (int w = 0; w < outWidth; ++w) { - int dPad = d * strideD - paddingD + dOffset; - int hPad = h * strideH - paddingH + hOffset; - int wPad = w * strideW - paddingW + wOffset; - - if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && - dPad >= 0 && dPad < depth) - outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = - data[((cIn * depth + dPad) * height + hPad) * width + wPad]; - else - outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0; - } - } - } - } -} - -void CpuMatrix::col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta) { - real* src = getData(); - int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1; - int outHeight = (height + 2 * paddingH - filterH) / strideH + 1; - int outWidth = (width + 2 * paddingW - filterW) / strideW + 1; - int channelsCol = channels * filterD * filterH * filterW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterW; - int hOffset = (c / filterW) % filterH; - int dOffset = (c / filterW / filterH) % filterD; - int cIm = c / filterW / filterH / filterD; - for (int d = 0; d < outDepth; ++d) { - for (int h = 0; h < outHeight; ++h) { - for (int w = 0; w < outWidth; ++w) { - int dPad = d * strideD - paddingD + dOffset; - int hPad = h * strideH - paddingH + hOffset; - int wPad = w * strideW - paddingW + wOffset; - if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width && - dPad >= 0 && dPad < depth) - trg[((cIm * depth + dPad) * height + hPad) * width + wPad] = - alpha * - src[((c * outDepth + d) * outHeight + h) * outWidth + w] + - beta * - trg[((cIm * depth + dPad) * height + hPad) * width + wPad]; - } - } - } - } -} - -//////////////////////////////////////////////////////////////// -// functions executed via cpu // -//////////////////////////////////////////////////////////////// - -void GpuMatrix::selectElements(Matrix& table, IVector& ids) { - execViaCpu2(&CpuMatrix::selectElements, *this, table, ids); -} -} // namespace paddle diff --git a/paddle/math/Matrix.h 
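A standalone sketch of the 3-D im2col ("vol2col") transform implemented above: the column buffer has channels * filterD * filterH * filterW rows, one per (input channel, filter offset) pair, and outDepth * outHeight * outWidth columns, one per output position, with out-of-bounds (padded) samples written as zero. Parameter names are the editor's abbreviations of the originals.

#include <cstddef>

// Gather filter-sized patches of the zero-padded input volume into `col`.
void vol2ColSketch(const float* data, float* col, int channels, int depth,
                   int height, int width, int fD, int fH, int fW,
                   int sD, int sH, int sW, int pD, int pH, int pW) {
  int outD = (depth + 2 * pD - fD) / sD + 1;
  int outH = (height + 2 * pH - fH) / sH + 1;
  int outW = (width + 2 * pW - fW) / sW + 1;
  int channelsCol = channels * fD * fH * fW;
  for (int c = 0; c < channelsCol; ++c) {
    int wOff = c % fW, hOff = (c / fW) % fH, dOff = (c / fW / fH) % fD;
    int cIn = c / fW / fH / fD;
    for (int d = 0; d < outD; ++d)
      for (int h = 0; h < outH; ++h)
        for (int w = 0; w < outW; ++w) {
          int dp = d * sD - pD + dOff;
          int hp = h * sH - pH + hOff;
          int wp = w * sW - pW + wOff;
          bool inside = dp >= 0 && dp < depth && hp >= 0 && hp < height &&
                        wp >= 0 && wp < width;
          col[((c * outD + d) * outH + h) * outW + w] =
              inside ? data[((cIn * depth + dp) * height + hp) * width + wp]
                     : 0.0f;
        }
  }
}

col2Vol above is the inverse scatter of the same indexing, blending the gathered values back into the volume with the alpha/beta weights.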
b/paddle/math/Matrix.h deleted file mode 100644 index 4c3b2c95361065372f5969a2da73bce0eb9d123f..0000000000000000000000000000000000000000 --- a/paddle/math/Matrix.h +++ /dev/null @@ -1,2189 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/utils/Logging.h" -#include "paddle/utils/ThreadLocal.h" - -#include - -#include "BaseMatrix.h" -#include "MemoryHandle.h" -#include "Vector.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -/// TODO(tianbing), move to paddle/function/TensorType.h -enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 }; - -/** - * @brief matrix sparse_format . - * - * nnz represents nonzero number in sparse matrix. - * - * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element - * represents row start index in Matrix. length of col and value are nnz. - * - * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element - * represents col start index in Matrix. length of col and value are nnz. - * - * @code - * for example: [0, 1, 0, 2, 0; - * 1, 0, 0, 0, 0; - * 0, 0, 0, 2, 5]; - * SPARSE_CSR row [0, 2, 3, 5]; - * col [1, 3, 0, 3, 4]; - * value [1, 2, 1, 2, 5] - * SPARSE_CSC col [0, 1, 2, 2, 4, 5]; - * row [1, 0, 0, 2, 2]; - * value [1, 1, 2, 2, 5] - * @endcode - */ -/// TODO(tianbing), move to paddle/function/TensorType.h -enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 }; - -class Matrix; -class GpuMatrix; -class CpuMatrix; -class CpuSparseMatrix; -class GpuSparseMatrix; -typedef std::shared_ptr MatrixPtr; -typedef std::shared_ptr GpuMatrixPtr; -typedef std::shared_ptr CpuMatrixPtr; -typedef std::shared_ptr GpuSparseMatrixPtr; -typedef std::shared_ptr CpuSparseMatrixPtr; - -/** - * Copy or assignemnt constructor will share the data as opposed to making a - * copy of the original data. To make a copy of the orinal data, use copyFrom() - * instead. 
- */ -class Matrix : public BaseMatrix { - protected: - Matrix(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans, - bool use_gpu); - - Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); - - Matrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool use_gpu); - - static ThreadLocal tmpMat_; - - public: - size_t elementCnt_; // maximal number of elements which can be held in data_ - MemoryHandlePtr memoryHandle_; - - public: - virtual ~Matrix() {} - - static MatrixPtr create(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans = false); - static MatrixPtr create(size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - bool trans = false, - bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(real* data, - int* row, - int* col, - size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu); - - static void resizeOrCreateSparseMatrix( - MatrixPtr& matrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static void resizeOrCreate(MatrixPtr& a, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - - /** - * @brief set the data buffer used to hold the matrix data. - * - * caller should make sure that the size of data is at least - * sizeof(real)*height*width. - */ - void setData(real* data) { - BaseMatrix::setData(data); - memoryHandle_.reset(); - } - - /// the data should be contiguous - void setData(real* data, size_t newHeight, size_t newWidth) { - setData(data); - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newHeight * newWidth; - stride_ = width_; - } - - size_t getWidth() const { return width_; } - size_t getHeight() const { return height_; } - size_t getStride() const { return stride_; } - size_t getElementCnt() const { return elementCnt_; } - virtual real* getData() { return data_; } - virtual const real* getData() const { return data_; } - bool isTransposed() const { return trans_; } - bool isContiguous() const { return stride_ == width_ || height_ == 1; } - - // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix - // befor call the following functions. - // Declare these functions in the base class just easy to call them. - // And these declarations should be moved to base class of sparse matrix - // if refactor sparse matrix - virtual int* getRows() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual int* getCols() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual SparseFormat getFormat() const { - LOG(FATAL) << "Not implemented"; - return SPARSE_CSR; //! 
suppress warning for no return value. - } - - virtual SparseValueType getValueType() const { - LOG(FATAL) << "Not implemented"; - return NO_VALUE; //! suppress warning for no return value. - } - - /** - * @brief matrix elment-wise add - * - * Named add3 just because add/add2 has been used in BaseMatrix.cu - * and they are not virtual function. - */ - virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } - - MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } - - virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } - - virtual void resetOne() { LOG(FATAL) << "Not implemented"; } - - void setDiag(real value); - - virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } - - virtual void trimFrom(const CpuSparseMatrix& src) { - LOG(FATAL) << "Not implemented"; - } - - // For GpuMatrix this is an asynchronous copy interface - // For CpuMatrix this is an synchronous copy interface - virtual void copyFrom(const Matrix& src, hl_stream_t stream) { - LOG(FATAL) << "Not implemented"; - } - - MatrixPtr subMatrix(size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol); - - MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { - return subMatrix(startRow, endRow, 0, getWidth()); - } - - MatrixPtr subColMatrix(size_t startCol, size_t endCol) { - return subMatrix(0, getHeight(), startCol, endCol); - } - - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { - CHECK_LE(startRow + numRows, getHeight()); - return Matrix::create(getData() + startRow * getWidth(), - numRows, - getWidth(), - trans_, - useGpu_); - } - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { - CHECK_LE(startRow + numRows, getHeight()); - CHECK_EQ(useGpu_, dest->useGpu_); - dest->setData(this->rowBuf(startRow), numRows, getWidth()); - return dest; - } - - /** - * If this is GpuMatrix, src is assumed to be CPU memory - * - * If this is CpuMatrix, src is assumed to be CPU memory - */ - virtual void copyFrom(const real* src, size_t size) { - LOG(FATAL) << "Not implemented"; - } - - virtual void copyFrom(const real* src, const int64_t* seq) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief convert a int vector to a real matrix. - * - * (1) source and dest are both in CPU. - * - * (2) sizes are exactly match. - */ - virtual void copyFrom(const IVector& src) { - LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; - } - - virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, - * NonValueSparseMatrix, etc.) as this. - * - * If height and width is zero, the new matrix will have the same size - * as this, otherwise the new matrix will have the specified size. - * - */ - virtual MatrixPtr clone(size_t height = 0, - size_t width = 0, - bool useGpu = false) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real* getRowBuf(size_t row) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real getElement(size_t x, size_t y) const { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual real getSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void accumulateColSum(Matrix& src) { - LOG(FATAL) << "Not implemented"; - } - - virtual real getAbsSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - /** - * @note Original data may not be preserved after resize(). 
- */ - virtual void resize(size_t newHeight, size_t newWidth) = 0; - - /** - * @note This should only be used for sparse matrix. - */ - virtual void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* total item used to allocate space */ - SparseValueType valueType, - SparseFormat format) = 0; - - /** - * @brief This should only be used for sparse matrix. - * - * Currently must be called for each row in order. - * The matrix is not valid until setRow is called for the last row. - */ - virtual void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) = 0; - - virtual MatrixPtr getTranspose() = 0; - - /** - * @brief hard transpose. - * - * allocate matTrans' memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void transpose(MatrixPtr& matTrans, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief rotate 90 degrees in clock-wise if clockWise=true; - * otherwise rotate in anti clock-wise - * clock-wise: - * \f[ - * y(j,i) = x(M-i-1,j) - * \f] - * anti clock-wise: - * \f[ - * y(j,i) = x(i, N-1-j) - * \f] - * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output. - * - * allocate matRot' memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) { - LOG(FATAL) << "Not implemented"; - } - - virtual MatrixPtr getInverse() { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - /** - * @brief inverse. - * - * if allocate matInv's memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void inverse(MatrixPtr& matInv, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - - public: - /// Only set all variables to 0 or NULL but not free them. - virtual void clear() { - height_ = 0; - width_ = 0; - data_ = NULL; - } - - void reshape(size_t height, size_t width); - - /// add b to each sample of this. - virtual void addBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void addSharedBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void addBias(Matrix& b, real scale, bool sharedBias) { - if (!sharedBias) { - addBias(b, scale); - } else { - addSharedBias(b, scale); - } - } - - /// add each sample from a to this. - virtual void collectBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void collectSharedBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void collectBias(Matrix& a, real scale, bool sharedBias) { - if (!sharedBias) { - collectBias(a, scale); - } else { - collectSharedBias(a, scale); - } - } - - virtual void sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - LOG(FATAL) << "Not implemented"; - } - - virtual void sequenceAvgBackward(Matrix& a, - const IVector& startsPos, - int mode) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - virtual void mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /// Add a vector (column) b to matrix a, column by column. 
- virtual void addColumnVector(const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += vec(index(i, j), 0) - * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 - * @endcode - */ - virtual void addByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * vec(index(i, j), 0) += this(i, j) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * mat.row(index(i, j)) += this(i, j) * input.row(i) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * input.row(i) += this(i, j) * mat.row(index(i, j)) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength - * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, j) - * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 - * @endcode - */ - virtual void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum) { - (void)numClasses; - (void)codes; - (void)sum; - (void)scaleSum; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength - * this(i, j) -= bit(i, j) - * where bit(i, j) is same as that for sumByBitCode - * @endcode - */ - virtual void subByBitCode(size_t numClasses_, IVector& codes) { - (void)numClasses_; - (void)codes; - LOG(FATAL) << "Not implemeted"; - } - - /** - * add the sum of each row of this to mat - */ - virtual void rowSum(Matrix& sum) { - (void)sum; - LOG(FATAL) << "Not implemeted"; - } - - /** - * set the max of each row of this to mat - */ - virtual void rowMax(Matrix& max) { - (void)max; - LOG(FATAL) << "Not implemeted"; - } - - /** - * set the max of each column of this to mat - */ - virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } - - /** - * @brief Get the top k elements of each column of this matrix. - * - * The row ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. 
- */ - virtual void colMax(IVector& maxIds, Matrix& maxVal) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } - - /** - * @brief Get the top k elements of each row of this matrix. - * - * The column ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. - */ - virtual void rowMax(IVector& maxIds, Matrix& max) { - LOG(FATAL) << "Not implemented"; - } - - /// normalize each row so that the sum of each row is 1. - virtual void rowNormalizeL1(Matrix& out) { - (void)out; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this = a*b - * @endcode - */ - virtual void mul(const Matrix& a, const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = this* b - * @endcode - */ - virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = a*this) - * @endcode - */ - virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } - - /// merge the element for each col. - virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /** - * \f[ - * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} - * \f] - * - * b contains M elements, - * c contains N elements (N is odd), - * b's index arithmetic is computed modulo M, - * c's index arithmetic is computed modulo N. 
- */ - virtual void circularConv(Matrix& b, Matrix& c) { - LOG(FATAL) << "Not implemented"; - } - - virtual void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2) { - LOG(FATAL) << "Not implemented"; - } - - /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ - virtual void softmax(Matrix& output) { - (void)output; - LOG(FATAL) << "Not implemeted"; - } - virtual void sequenceSoftmax(Matrix& output, const IVector& index) { - (void)output; - LOG(FATAL) << "Not implemeted"; - } - - virtual void softmaxBackward(Matrix& outputV) { - (void)outputV; - LOG(FATAL) << "Not implemeted"; - } - - /* - sum_i = sum_j this_ij * output_ij - this_ij = output_ij* (this_ij - sum_i) - */ - virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the sum of squares diff cost. - virtual void sumOfSquares(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /// gradient of sumOfSquares. - virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - virtual void smoothL1(Matrix& output, Matrix& label, real destScale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void tanhDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void softreluDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void scaledTanh(Matrix& output, real p1, real p2) { - LOG(FATAL) << "Not implemented"; - } - - /// print out the values of elements to os - virtual void print(std::ostream& os) const { - LOG(FATAL) << "Not implemented"; - } - - /** - * print a part of the matrix - * from the (top,left) value to the (height, width) value (not included) - */ - virtual void print(std::ostream& os, size_t height, size_t width) const { - LOG(FATAL) << "Not implemented"; - } - - /// print one row to os - virtual void printOneRow(std::ostream& os, size_t idx) const { - LOG(FATAL) << "Not implemented"; - } - - virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} - - virtual real getMin() { - LOG(FATAL) << "Not implemented"; - return 0; - } - virtual real getMax() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } - - /** - * @brief calulate the error of classification - * - * output[i] = 1 if row i is an error. - * - * output[i] = 0 if row i is correct. - * - */ - virtual void classificationError(Matrix& output, - IVector& label, - size_t topkSize = 1) { - LOG(FATAL) << "Not implemented"; - } - - virtual void upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Pooling forward operation, pick out the largest element - * in the sizeX of value, if the maskMatP is not NULL, it will - * also caculate the location indices. 
- */ - virtual void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP = NULL) { - LOG(FATAL) << "Not implemeted"; - } - - /// Pooling backward operation. - virtual void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /// Pooling forward operation, caculate the average of sizeX elements. - virtual void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode = true) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode = true) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Pooling 3D forward operation, pick out the largest element - * in the sizeX of value - */ - virtual void maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Input: one or more sequences. Each sequence contains some instances. - * - * Output: output size is the number of input sequences (NOT input - * instances). - * - * output[i] is set to max_input[i]. 
- */ - virtual void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this.row[i] += table.row[ids[i]] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table.row[ids[i]] += this.row[i] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - /** - * @brief cross entropy for multi binary labels - * - * @code - * this[i] = -sum(label[i][j]*log(output[i][j]) - * + (1-label[i][j])*log(1-output[i][j])) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief The gradient of cross entropy for multi binary labels on output - * - * @code - * this[i][j] = -label[i][j]/output[i][j] - * + (1-label[i][j])/(1-output[i][j]) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Calculate the classification error for multi binary labels - * - * @code - * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) - * || (output[i][j] < threshold && label[i][j] == 1)) - * / output->getWidth() - * @endcode - */ - virtual void classificationErrorMulti(Matrix& output, - Matrix& label, - real threshold) { - LOG(FATAL) << "Not implemented"; - } - - virtual void paramReluForward(Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - - virtual void vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - virtual void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - - template - void operator=(const ExpressionType& expr) { - if 
(useGpu_) { - TensorGpuApply(*this, expr); - } else { - TensorCpuApply(*this, expr); - } - } - - bool isEmpty() const { return data_ == nullptr; } - - explicit operator bool() const { return !isEmpty(); } -}; - -inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { - mat.print(os); - return os; -} - -class GpuMatrix : public Matrix { - public: - GpuMatrix(); - - GpuMatrix(size_t height, size_t width, bool trans = false); - GpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, true) {} - GpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, true) {} - GpuMatrix(GpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, true) {} - ~GpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - /** - * Copy the data from cpu_memory buffer - */ - void copyFrom(const real* hostSrc, size_t size); - - void copyFrom(const real* hostSrc, const int64_t* seq); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const Matrix& src); - - void copyFrom(const IVector& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - real getElement(size_t x, size_t y) const; - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - real getMin(); - real getMax(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr& matTrans, bool memAlloc); - void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); - - MatrixPtr getInverse(); - void inverse(MatrixPtr& matInv, bool memAlloc); - - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /** - * @code - * add each sample from a to this. 
- * @endcode - */ - void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = a*b - * @endcode - */ - void mul(const Matrix& a, const Matrix& b); - - void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); - - void mul(const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT); - - void mul(const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT); - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - void rightMul(Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = this* b - * @endcode - */ - void rightMul(Matrix& b); - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - void leftMul(Matrix& a, real scaleAB, real scaleT); - - /** - * @code - * this = a*this - * @endcode - */ - void leftMul(Matrix& a); - - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& max); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& max); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxBackward(Matrix& outputV); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - virtual void print(std::ostream& os) const; - virtual void print(std::ostream& os, size_t height, size_t width) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - - void upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput); - - void avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real 
scaleTargets, - real scaleOutput); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW); - - void col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - - template - void operator=(const ExpressionType& expr) { - TensorGpuApply(*this, expr); - } -}; - -class CpuMatrix : public Matrix { - private: - MatrixPtr sftmaxSum_; - MatrixPtr sftmaxDot_; - - public: - CpuMatrix(size_t height, size_t width, bool trans = false); - CpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, false) {} - CpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, false) {} - - CpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, false) {} - - ~CpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - real getElement(size_t x, size_t y) const; - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr& matTrans, bool memAlloc); - void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise); - - MatrixPtr getInverse(); - void inverse(MatrixPtr& matInv, bool memAlloc); - - void copyFrom(const Matrix& src); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const real* cpuSrc, size_t size); - - void copyFrom(const real* cpuSrc, const int64_t* seq); - - void copyFrom(const IVector& src); - - void copyFrom(CpuSparseMatrix& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - void upsampleForward(Matrix& input, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void upsampleBackward(Matrix& outputGrad, - Matrix& mask, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t outputH, - size_t outputW); - - void maxPoolForward(Matrix& inputMat, - 
size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - MatrixPtr maskMatP); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW, - bool excludeMode = true); - - void maxPool3DForward(Matrix& inputMat, - Matrix& maxPoolIdx, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void maxPool3DBackward(Matrix& outGrad, - Matrix& maxPoolIdx, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput); - - void avgPool3DForward(Matrix& input, - size_t channels, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW); - - void avgPool3DBackward(Matrix& input, - size_t imgSizeD, - size_t imgSizeH, - size_t imgSizeW, - size_t outputD, - size_t outputH, - size_t outputW, - size_t sizeZ, - size_t sizeY, - size_t sizeX, - size_t strideD, - size_t strideH, - size_t strideW, - size_t paddingD, - size_t paddingH, - size_t paddingW, - real scaleTargets, - real scaleOutput); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - - public: - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /// add each sample of a to this. 
- void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids); - - /** - * use abstract getRow() to get row from table. - * - * Define table as template instead of virtual class for performance sake. - * internal used by above two virtual funcs. - */ - template - void selectRowsImp(TableMatType& table, IVector& ids); - template - void addToRowsImp(TableMatType& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); - - static void mul(CpuMatrix* a, - CpuMatrix* b, - CpuSparseMatrix* c, - real scaleAB, - real scaleT); - - /** - * c = a * b - * - * use abstract getRow() to get row from B,C. - * Define B,C as template instead of virtual class for performance sake. - */ - template - static void mul( - CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(const Matrix& a, const Matrix& b); - - void rightMul(Matrix& b, real scaleAB, real scaleT); - void rightMul(Matrix& b); - - void leftMul(Matrix& a, real scaleAB, real scaleT); - void leftMul(Matrix& a); - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMaxId(IVector& maxIds); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& maxVal); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& maxVal); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - void rowNormalizeL1(Matrix& out); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void circularConv(Matrix& b, Matrix& c); - void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
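The selectRows/addToRows pair declared above is a plain gather / scatter-add over row indices; the @code annotations spell out the exact semantics. Below is a minimal standalone sketch of those two operations on flat float buffers (matrix sizes, values and ids are made up for illustration; the real methods operate on Matrix/IVector and dispatch through getRow()):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const size_t width = 3;
  // A 4-row "table" (the parameter), a 2-row destination, and the pairing ids.
  std::vector<float> table = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3};
  std::vector<float> dst(2 * width, 0.f);
  std::vector<int> ids = {2, 0};

  // selectRows: dst.row[i] += table.row[ids[i]]   (gather)
  for (size_t i = 0; i < ids.size(); ++i)
    for (size_t j = 0; j < width; ++j)
      dst[i * width + j] += table[ids[i] * width + j];

  // addToRows: table.row[ids[i]] += dst.row[i]    (scatter-add back)
  for (size_t i = 0; i < ids.size(); ++i)
    for (size_t j = 0; j < width; ++j)
      table[ids[i] * width + j] += dst[i * width + j];

  std::printf("dst row 0 starts with %.0f, table row 2 now starts with %.0f\n",
              dst[0], table[2 * width]);  // prints 2 and 4
  return 0;
}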
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - - void smoothL1(Matrix& output, Matrix& label, real destScale); - void smoothL1Bp(Matrix& output, Matrix& label, real destScale); - - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - void print(std::ostream& os) const; - void print(std::ostream& os, size_t height, size_t width) const; - void printOneRow(std::ostream& os, size_t idx) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - - real getMin(); - real getMax(); - - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - - void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); - - void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec); - - void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input); - - void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum); - - void subByBitCode(size_t numClasses_, IVector& codes); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void vol2Col(real* data, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW); - - void col2Vol(real* trg, - int channels, - int depth, - int height, - int width, - int filterD, - int filterH, - int filterW, - int strideD, - int strideH, - int strideW, - int paddingD, - int paddingH, - int paddingW, - real alpha, - real beta); - - template - void operator=(const ExpressionType& expr) { - TensorCpuApply(*this, expr); - } -}; - -class SharedCpuMatrix : public CpuMatrix { - public: -#ifndef PADDLE_MOBILE_INFERENCE - /* blockNum is number of partitions of the matrix */ - SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) - : CpuMatrix(height, width, trans) { - initShared(blockNum); - } - SharedCpuMatrix( - int blockNum, real* data, size_t height, size_t width, bool trans = false) - : CpuMatrix(data, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(int blockNum, - CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - 
size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initBlock(1); - } - - ~SharedCpuMatrix() {} - - public: - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - virtual void add(Matrix& b, real p1, real p2); - virtual void add(real p1, real p2); - - private: - using Matrix::mul; - void initShared(int blockNum); - void initBlock(int blockNum); - - int blockNum_; - std::vector> blockLocks_; - ThreadLocal localBuf_; - ThreadLocal> localBufRows_; - ThreadLocal> blockSeq_; -#endif -}; - -typedef struct { unsigned int col; } sparse_non_value_t; - -typedef struct { - unsigned int col; - float value; -} sparse_float_value_t; - -} // namespace paddle -#include "ExecViaCpu.h" diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp deleted file mode 100644 index f7a949294b54a5a874e1239a13ca9dce3ba18e94..0000000000000000000000000000000000000000 --- a/paddle/math/MatrixBitCode.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Matrix.h" -#include "hl_gpu.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -namespace { - -struct SimpleCode { - SimpleCode(size_t code, size_t numClasses) : c_(code + numClasses) {} - inline size_t calcIndex(int bit) const { return (c_ >> (bit + 1)) - 1; } - inline bool calcBit(int bit) const { return c_ & (1 << bit); } - inline int getLength() const { return findLastSet(c_) - 1; } - - private: - size_t c_; -}; - -struct SimpleCodeTable { - explicit SimpleCodeTable(size_t numClasses) : numClasses_(numClasses) {} - SimpleCode operator()(size_t code) const { - return SimpleCode(code, numClasses_); - } - size_t size() const { return numClasses_; } - int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); } - - private: - size_t numClasses_; - int maxCodeLength_; -}; - -} // namespace - -/** - * CodeTable class should support 3 functions: - * - * size_t size() - * return the number of codes - * - * int getMaxCodeLength() - * return the maximal code length - * - * Code operator()(size_t i) - * return the i-th code. Code class is descriebed below. 
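The SimpleCode arithmetic above encodes a class id as a path through a complete binary tree over the classes, which is what the bit-code routines in this file (used by tree-structured output layers such as hierarchical sigmoid) iterate over. A minimal standalone sketch of that arithmetic, with an assumed numClasses of 8 and class id 5 purely for illustration:

#include <cstddef>
#include <cstdio>

// Same arithmetic as SimpleCode: c = code + numClasses, then walk the bits of c.
static int findLastSet(size_t x) {  // position of the highest set bit, 1-based
  int n = 0;
  while (x) { ++n; x >>= 1; }
  return n;
}

int main() {
  const size_t numClasses = 8;            // assumed value
  const size_t code = 5;                  // assumed class id
  const size_t c = code + numClasses;     // 13 == 0b1101
  const int length = findLastSet(c) - 1;  // 3 tree levels for this code
  for (int bit = 0; bit < length; ++bit) {
    size_t index = (c >> (bit + 1)) - 1;  // row of the internal-node parameter
    bool right = (c & (1 << bit)) != 0;   // is this level's child the right one
    std::printf("level %d -> node %zu, right child: %d\n", bit, index, right);
  }
  // Visits internal nodes 5, 2, 0; addByBitCode/mulByBitCode accumulate over
  // exactly these rows of the (numClasses - 1)-row parameter matrix.
  return 0;
}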
- * - * Code class should support 3 functions: - * - * int getLength() - * return the length of the code - * - * bool calcIndex(int bit) - * bit ranges from 0 to getLength() - 1 - * return the index for the (1+bit) level parent - * - * bool calcBit(int bit) - * return true if the bit level parent is the right child of (1+bit) level - * parent - * - */ - -/* - for i: - for j < codeLength: - op(tmat(i, j), vec(0, index(i, j))) -*/ -template -static void addByBitCodeT( - Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) { - CHECK(!vec.useGpu()); - - size_t numClasses = codeTable.size(); - size_t maxCodeLength = codeTable.getMaxCodeLength(); - size_t numSamples = tmat.getHeight(); - size_t oWidth = tmat.getWidth(); - CHECK_EQ(tmat.getWidth(), maxCodeLength); - CHECK_EQ(codes.getSize(), numSamples); - CHECK_EQ(vec.getHeight(), (size_t)1); - CHECK_EQ(vec.getWidth(), numClasses - 1); - - auto data = tmat.getData(); - auto v = vec.getData(); - const int* c = codes.getData(); - for (size_t i = 0; i < numSamples; ++i) { - auto code = codeTable(c[i]); - int codeLength = code.getLength(); - for (int j = 0; j < codeLength; ++j) { - size_t index = code.calcIndex(j); - op(data[i * oWidth + j], v[index]); - } - } -} - -/* For j < codeLength: - this(i, j) += vec(0, index(i, j)) -*/ -void CpuMatrix::addByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& vec) { - auto op = [](real& t, real v) { t += v; }; - addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); -} - -/* For j < codeLength: - vec(0, index(i, j)) += this(i, j) -*/ -void CpuMatrix::addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec) { - auto op = [](real t, real& v) { v += t; }; - addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); -} - -/* - for i: - for j < codeLength: - op(tmat(i, j), mat.row(index(i, j)), input.row(i)) -*/ -template -void mulByBitCodeT(Op op, - CodeTable codeTable, - IVec& codes, - TMat& tmat, - WMat& weight, - InMat& input) { - CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu()); - - size_t numClasses = codeTable.size(); - size_t maxCodeLength = codeTable.getMaxCodeLength(); - size_t numSamples = tmat.getHeight(); - size_t inputDim = input.getWidth(); - size_t oWidth = tmat.getWidth(); - CHECK_EQ(tmat.getWidth(), maxCodeLength); - CHECK_EQ(codes.getSize(), numSamples); - CHECK_EQ(input.getHeight(), numSamples); - CHECK_EQ(weight.getHeight(), numClasses - 1); - CHECK_EQ(weight.getWidth(), inputDim); - - real* data = tmat.getData(); - const int* c = codes.getData(); - for (size_t i = 0; i < numSamples; ++i) { - auto code = codeTable(c[i]); - int codeLength = code.getLength(); - for (int j = 0; j < codeLength; ++j) { - size_t index = code.calcIndex(j); - op(data[i * oWidth + j], weight.rowBuf(index), input.rowBuf(i), inputDim); - } - } -} - -/* For j < codeLength: - this(i, j) += -*/ -void CpuMatrix::mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& weight, - const Matrix& input) { - auto op = []( - real& t, const real* weightRow, const real* inputRow, size_t inputDim) { - real sum = 0; - for (size_t k = 0; k < inputDim; ++k) { - sum += weightRow[k] * inputRow[k]; - } - t += sum; - }; - - mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); -} - -/* For index(i, j) >= 0: - weight.row(index(i, j)) += this(i, j) * input.row(i) -*/ -void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& weight, - const Matrix& input) { - auto op = []( - const 
real t, real* weightRow, const real* inputRow, size_t inputDim) { - for (size_t k = 0; k < inputDim; ++k) { - weightRow[k] += t * inputRow[k]; - } - }; - - mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); -} - -/* For j < codeLength: - input.row(i) += this(i, j) * weight.row(index(i, j)) -*/ -void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& weight, - Matrix& input) { - auto op = []( - const real t, const real* weightRow, real* inputRow, size_t inputDim) { - for (size_t k = 0; k < inputDim; ++k) { - inputRow[k] += t * weightRow[k]; - } - }; - - mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); -} - -template -void sumByBitCodeT(CodeTable codeTable, - IVector& codes, - const CpuMatrix& tmat, - Matrix& sum, - real scaleSum) { - size_t maxCodeLength = codeTable.getMaxCodeLength(); - size_t numSamples = tmat.getHeight(); - size_t oWidth = tmat.getWidth(); - CHECK_EQ(tmat.getWidth(), maxCodeLength); - CHECK_EQ(codes.getSize(), numSamples); - CHECK_EQ(sum.getHeight(), numSamples); - CHECK_EQ(sum.getWidth(), (size_t)1); - - const real* data = tmat.getData(); - real* s = sum.getData(); - int* c = codes.getData(); - for (size_t i = 0; i < numSamples; ++i) { - real sm = 0; - auto code = codeTable(c[i]); - int codeLength = code.getLength(); - for (int j = 0; j < codeLength; ++j) { - if (code.calcBit(j)) { - sm += data[i * oWidth + j]; - } - } - s[i] = scaleSum * sm; - } -} - -/* For j < codeLength: - sum(i, 0) = \sum_j bit(i, j) * this(i, j) -*/ -void CpuMatrix::sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum) { - sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum); -} - -template -void subByBitCodeT(CodeTable codeTable, IVector& codes, CpuMatrix& tmat) { - size_t maxCodeLength = codeTable.getMaxCodeLength(); - size_t numSamples = tmat.getHeight(); - size_t oWidth = tmat.getWidth(); - CHECK_EQ(tmat.getWidth(), maxCodeLength); - CHECK_EQ(codes.getSize(), numSamples); - - real* data = tmat.getData(); - int* c = codes.getData(); - for (size_t i = 0; i < numSamples; ++i) { - auto code = codeTable(c[i]); - int codeLength = code.getLength(); - for (int j = 0; j < codeLength; ++j) { - if (code.calcBit(j)) { - data[i * oWidth + j] -= 1; - } - } - } -} - -/* For j < codeLength - this(i, j) -= bit(i, j) -*/ -void CpuMatrix::subByBitCode(size_t numClasses, IVector& codes) { - subByBitCodeT(SimpleCodeTable(numClasses), codes, *this); -} - -} // namespace paddle diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h deleted file mode 100644 index 6950afaa21d60615b27c06a151b0afbb296653bf..0000000000000000000000000000000000000000 --- a/paddle/math/RowBuffer.h +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "MemoryHandle.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -/** - * @brief The RowBuffer class - * Represent the SparseRow Matrix Data. - * - * If not set memory handler, then the data could be auto growth. - */ -class RowBuffer { - public: - /** - * @brief RowBuffer create a auto-growth row buffer. The row length is width. - * @param width the length of each row, a.k.a matrix width. - */ - explicit RowBuffer(size_t width) : width_(width) {} - - /** - * @brief RowBuffer create a row buffer, which cannot be auto-growth. - * @param mem the pre-allocated memory. - * @param width the length of each row, a.k.a matrix width. - */ - RowBuffer(const CpuMemHandlePtr& mem, size_t width) - : preallocatedBuf_(mem), width_(width) {} - - /** - * @brief resize resize the buffer with rowCount - * @param rowCnt number of row. matrix height. - */ - inline void resize(int rowCnt) { - if (preallocatedBuf_) { - CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real)); - } else { - rowStore_.resize(rowCnt * width_); - } - } - - /** - * @brief get a row buffer with row index. - * @param row the index of row. - * @return row buffer. - */ - inline real* get(int row) const { - if (preallocatedBuf_) { - CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize()); - return reinterpret_cast(preallocatedBuf_->getBuf()) + row * width_; - } else { - CHECK_LE((row + 1) * width_, rowStore_.size()); - return const_cast(rowStore_.data() + row * width_); - } - } - - /** - * @brief get a row buffer with row index. If row index is larger than local - * buffer, the size of local buffer will grow. - * @param row the index of row. - * @return row buffer. - */ - inline real* getWithAutoGrowth(int row) { - if (preallocatedBuf_) { - return get(row); - } else { - if ((rowStore_.size() <= row * width_)) { - rowStore_.resize((row + 1) * width_); - } - return rowStore_.data() + row * width_; - } - } - - /** - * @return raw data buffer. - */ - inline real* data() { - if (preallocatedBuf_) { - return reinterpret_cast(preallocatedBuf_->getBuf()); - } else { - return rowStore_.data(); - } - } - - /** - * @brief clear local buffer. It only affect auto-growth buffer. - */ - inline void clear() { - // swap an empty vector to it to free the memory. - std::vector> empty; - rowStore_.swap(empty); - } - - /** - * @brief get current number of rows. - * @return number of rows. - */ - inline size_t getRowCount() const { - if (preallocatedBuf_) { - return preallocatedBuf_->getSize() / sizeof(real) / width_; - } else { - return rowStore_.size() / width_; - } - } - - /** - * @brief get is this buffer can automatically grow or not. - * @return ture if can automacitally grow. - */ - inline bool isAutoGrowth() const { return !preallocatedBuf_; } - - /** - * @brief return the width of matrix. a.k.a length of row. - * @return width of matrix - */ - inline size_t getWidth() const { return width_; } - - private: - //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid - //! of std::vector here. - CpuMemHandlePtr preallocatedBuf_; - std::vector> rowStore_; - size_t width_; -}; -} // namespace paddle diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp deleted file mode 100644 index 1faa343dbcef3d20b29b272a8da37f8e2bba654b..0000000000000000000000000000000000000000 --- a/paddle/math/SparseMatrix.cpp +++ /dev/null @@ -1,864 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "SparseMatrix.h" -#include -#include -#include -#include "hl_gpu.h" -#include "hl_top_k.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -GpuSparseMatrix::GpuSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(NULL, height, width, trans, true) { - resize(height, width, nnz, valueType, format); -} - -GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle, - hl_sparse_matrix_s_ptr sMatrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans, - MemoryHandlePtr sMemoryHandle) - : Matrix(dataHandle, height, width, trans, true) { - CHECK(dataHandle && sMatrix) << "Invalid argument pointer"; - - size_t size = 0; - if (format == SPARSE_CSR) { - size = (height + 1) * sizeof(int) + nnz * sizeof(int); - } else { - size = (width + 1) * sizeof(int) + nnz * sizeof(int); - } - - if (NO_VALUE != valueType) { - size += nnz * sizeof(real); - } - CHECK_LE(size, dataHandle->getSize()); - - sMatrix_ = sMatrix; - - if (sMemoryHandle == NULL) { - sMemoryHandle_ = std::make_shared(dataHandle->getSize()); - } else { - CHECK_EQ(sMemoryHandle->getSize(), dataHandle->getSize()); - sMemoryHandle_ = sMemoryHandle; - } - - elementCnt_ = nnz; - valueType_ = valueType; - format_ = format; - if (format_ == SPARSE_CSR) - sparseResizeCSR(); - else - sparseResizeCSC(); -} - -GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans, - MemoryHandlePtr sMemoryHandle) - : Matrix(NULL, height, width, trans, true) { - CHECK(sMatrix) << "Invalid argument pointer"; - sMatrix_ = sMatrix; - sMemoryHandle_ = sMemoryHandle; - elementCnt_ = nnz; - format_ = format; - valueType_ = valueType; -} - -GpuSparseMatrix::GpuSparseMatrix(real* value, - int* rows, - int* cols, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType, - SparseFormat format, - bool trans) - : Matrix(NULL, height, width, trans, true) { - size_t size = 0; - if (format == SPARSE_CSR) { - size = (height + 1) * sizeof(int) + nnz * sizeof(int); - } else { - size = (width + 1) * sizeof(int) + nnz * sizeof(int); - } - - if (NO_VALUE != valueType) { - size += nnz * sizeof(real); - } - elementCnt_ = nnz; - valueType_ = valueType; - format_ = format; - - sMemoryHandle_ = std::make_shared(size); - if (format_ == SPARSE_CSR) { - rows_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf())); - cols_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - - if (sMatrix_ == NULL) { - /* construct hl_sparse_matrix_s */ - hl_sparse_matrix_s tmp; - hl_construct_sparse_matrix( - &tmp, - value, - rows, - 
cols, - HL_SPARSE_CSR, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, - height_, - width_, - elementCnt_); - hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); - sMatrix_ = tmp2; - } - - } else { - cols_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf())); - rows_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - - if (sMatrix_ == NULL) { - /* construct hl_sparse_matrix_s */ - hl_sparse_matrix_s tmp; - hl_construct_sparse_matrix( - &tmp, - value, - rows, - cols, - HL_SPARSE_CSC, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, - height_, - width_, - elementCnt_); - hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); - sMatrix_ = tmp2; - } - } -} - -void GpuSparseMatrix::sparseResizeCSR() { - rows_ = - reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf())); - cols_ = - reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf()) + - (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - - if (sMatrix_ == NULL) { - /* construct hl_sparse_matrix_s */ - hl_sparse_matrix_s tmp; - hl_construct_sparse_matrix( - &tmp, - data_, - memoryHandle_->getSize(), - HL_SPARSE_CSR, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, - height_, - width_, - elementCnt_); - hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); - sMatrix_ = tmp2; - } -} - -void GpuSparseMatrix::sparseResizeCSC() { - cols_ = - reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf())); - rows_ = - reinterpret_cast(reinterpret_cast(sMemoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int)); - if (NO_VALUE != valueType_) { - value_ = reinterpret_cast( - reinterpret_cast(sMemoryHandle_->getBuf()) + - (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int)); - } else { - value_ = NULL; - } - - if (sMatrix_ == NULL) { - /* construct hl_sparse_matrix_s */ - hl_sparse_matrix_s tmp; - hl_construct_sparse_matrix( - &tmp, - memoryHandle_->getBuf(), - memoryHandle_->getSize(), - HL_SPARSE_CSC, - valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE, - height_, - width_, - elementCnt_); - hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); - sMatrix_ = tmp2; - } -} - -void GpuSparseMatrix::resize(size_t newHeight, - size_t newWidth, - size_t newNnz, - SparseValueType valueType, - SparseFormat format) { - if (format == SPARSE_CSR) { - resizeCSR(newHeight, newWidth, newNnz, valueType); - } else { - resizeCSC(newHeight, newWidth, newNnz, valueType); - } -} - -void GpuSparseMatrix::resizeCSR(size_t newHeight, - size_t newWidth, - size_t newNnz, - SparseValueType valueType) { - size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); - if (NO_VALUE != valueType) { - newSize += newNnz * sizeof(real); - } - - if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { - memoryHandle_ = std::make_shared(newSize); - data_ = reinterpret_cast(memoryHandle_->getBuf()); - sMemoryHandle_ = std::make_shared(newSize); - end_ = reinterpret_cast(sMemoryHandle_->getBuf()) + - sMemoryHandle_->getSize(); - sMatrix_ = NULL; - } else if (valueType != valueType_) { - sMatrix_ = NULL; - } else { - /* - * newNnz > elementCnt_ is necessary for the following condition: - * Firstly, height_ is 9 elementCnt_ is 56 - * Secondly, height_ is 11 elementCnt_ is 44 - * ==> height_ is bigger, sMatrix_ will resize, and total item is 44 now - * Then, height_ is 10 elementCnt_ is 52 - * ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail - */ - if ((ssize_t)((newHeight + 1) * sizeof(int)) > - ((char*)cols_ - (char*)rows_) || - newNnz > static_cast(sMatrix_->nnz)) { - sMatrix_ = NULL; - } else if (NO_VALUE == valueType) { - if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)cols_)) { - sMatrix_ = NULL; - } - } else { - if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)cols_) || - (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) { - sMatrix_ = NULL; - } - } - } - - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newNnz; - valueType_ = valueType; - format_ = SPARSE_CSR; - - if (sMatrix_ == NULL) { - sparseResizeCSR(); - } -} - -void GpuSparseMatrix::resizeCSC(size_t newHeight, - size_t newWidth, - size_t newNnz, - SparseValueType valueType) { - size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); - if (NO_VALUE != valueType) { - newSize += newNnz * sizeof(real); - } - - if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) { - memoryHandle_ = std::make_shared(newSize); - data_ = reinterpret_cast(memoryHandle_->getBuf()); - sMemoryHandle_ = std::make_shared(newSize); - end_ = reinterpret_cast(sMemoryHandle_->getBuf()) + - sMemoryHandle_->getSize(); - sMatrix_ = NULL; - } else if (valueType != valueType_) { - sMatrix_ = NULL; - } else { - /* - * newNnz > elementCnt_ is necessary for the following condition: - * Firstly, height_ is 9 elementCnt_ is 56 - * Secondly, height_ is 11 elementCnt_ is 44 - * ==> height_ is bigger, sMatrix_ will resize, - * and total item is 44 now - * Then, height_ is 10 elementCnt_ is 52 - * ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail - */ - if ((ssize_t)((newWidth + 1) * sizeof(int)) > - ((char*)rows_ - (char*)cols_) || - newNnz > static_cast(sMatrix_->nnz)) { - sMatrix_ = NULL; - } else if (NO_VALUE == valueType) { - if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)rows_)) { - sMatrix_ = NULL; - } - } else { - if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)rows_) || - (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) { - sMatrix_ = NULL; - } - } - } - - height_ = 
newHeight; - width_ = newWidth; - elementCnt_ = newNnz; - valueType_ = valueType; - format_ = SPARSE_CSC; - - if (sMatrix_ == NULL) { - sparseResizeCSC(); - } -} - -void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { - resize(newHeight, newWidth, elementCnt_, valueType_, format_); -} - -MatrixPtr GpuSparseMatrix::getTranspose() { - CHECK(memoryHandle_.get() || sMatrix_) << "not supported"; - if (memoryHandle_.get()) { - MatrixPtr copy_T(new GpuSparseMatrix( - std::dynamic_pointer_cast(memoryHandle_), - sMatrix_, - height_, - width_, - elementCnt_, - valueType_, - format_, - true, - sMemoryHandle_)); - return copy_T; - } else { - MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_, - height_, - width_, - elementCnt_, - valueType_, - format_, - true, - sMemoryHandle_)); - return copy_T; - } -} - -void GpuSparseMatrix::copyRow(int offsets, - size_t colNum, - const sparse_non_value_t* row) { - memcpy(cols_ + offsets, row, sizeof(int) * colNum); -} - -void GpuSparseMatrix::copyRow(int offsets, - size_t colNum, - const sparse_float_value_t* row) { - for (size_t j = 0; j < colNum; j++) { - cols_[offsets + j] = row[j].col; - value_[offsets + j] = row[j].value; - } -} - -void GpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { - if (auto mat = dynamic_cast(&src)) { - copyFrom(*(const_cast(mat)), stream); - } else if (auto mat = dynamic_cast(&src)) { - copyFrom(*(const_cast(mat)), stream); - } else { - LOG(FATAL) << "Not implemented"; - } -} - -void GpuSparseMatrix::copyFrom(const Matrix& src) { - copyFrom(src, HPPL_STREAM_1); - hl_stream_synchronize(HPPL_STREAM_1); -} - -template -void GpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - T* data, - hl_stream_t stream) { - CHECK_EQ(format_, SPARSE_CSR); - size_t nnz = 0; - for (size_t i = 0; i < height_; i++) { - int64_t id = ids[i]; - nnz += indices[id + 1] - indices[id]; - } - - resize(height_, - width_, - nnz, - sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE, - format_); - - rows_[0] = 0; - for (size_t i = 0; i < height_; i++) { - int64_t id = ids[i]; - size_t colNum = indices[id + 1] - indices[id]; - rows_[i + 1] = rows_[i] + colNum; - - T* row = data + indices[id]; - copyRow(rows_[i], colNum, row); - } - - sMatrix_->format = HL_SPARSE_CSR; - sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; - sMatrix_->rows = height_; - sMatrix_->cols = width_; - sMatrix_->nnz = nnz; - hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream); -} - -void GpuSparseMatrix::setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - CHECK_EQ(format_, SPARSE_CSR); - if (NO_VALUE == valueType_) { - CHECK_LT(row, height_); - CHECK(NULL != cols); - CHECK(NULL == values); - } else { - CHECK_LT(row, height_); - CHECK(NULL != cols); - CHECK(NULL != values); - } - if (0 == row) { - rows_[row] = 0; - } - rows_[row + 1] = rows_[row] + colNum; - - memcpy(cols_ + rows_[row], cols, sizeof(*cols) * colNum); - if (FLOAT_VALUE == valueType_) { - memcpy(value_ + rows_[row], values, sizeof(*values) * colNum); - } - - if (height_ - 1 == row) { - sMatrix_->format = HL_SPARSE_CSR; - sMatrix_->type = valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE; - sMatrix_->rows = height_; - sMatrix_->cols = width_; - sMatrix_->nnz = elementCnt_; - hl_memcpy_csr_matrix( - sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT); - } -} - -SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; } - -void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) { - CHECK_EQ(format_, SPARSE_CSC); - int nnz = sMatrix_->nnz; - if (memAlloc) { - matTrans = std::make_shared( - width_, height_, nnz, valueType_, format_, false); - } else { - CHECK(matTrans != nullptr); - } - - CpuIVector rows(nnz); - CpuIVector cols(width_ + 1); - CpuIVector cols_full(nnz); - CpuVector value(nnz); - hl_stream_t stream = HPPL_STREAM_1; - hl_memcpy_from_csc_matrix(value.getData(), - nnz, - rows.getData(), - nnz, - cols.getData(), - width_ + 1, - sMatrix_.get(), - stream); - - hl_stream_synchronize(stream); - - /*for every non zero number, get its column index*/ - std::vector dataVec; - for (size_t i = 0; i < width_; i++) { - for (int j = cols.getData()[i]; j < cols.getData()[i + 1]; j++) { - cols_full.getData()[j] = i; - } - } - - /*sort row index and column index by the ascending order*/ - for (int i = 0; i < nnz; i++) { - dataVec.emplace_back( - rows.getData()[i], cols_full.getData()[i], value.getData()[i]); - } - std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) { - return a.row < b.row || (a.row == b.row && a.col < b.col); - }); - - /*get sorted data, row index, and col index, put them in the right place*/ - cols.resize(height_ + 1); - rows.resize(nnz); - value.resize(nnz); - - cols.getData()[0] = 0; - rows.getData()[0] = dataVec[0].col; - value.getData()[0] = dataVec[0].val; - for (int i = 1; i < nnz; i++) { - if (dataVec[i].row != dataVec[i - 1].row) { - for (int j = dataVec[i - 1].row + 1; j <= dataVec[i].row; j++) { - cols.getData()[j] = i; - } - } - rows.getData()[i] = dataVec[i].col; - value.getData()[i] = dataVec[i].val; - } - cols.getData()[height_] = nnz; - - /*copy back from cpu*/ - GpuSparseMatrixPtr dest = - std::dynamic_pointer_cast(matTrans); - hl_memcpy_csc_matrix((dest->sMatrix_).get(), - value.getData(), - rows.getData(), - cols.getData(), - stream); - hl_stream_synchronize(stream); -} - -void GpuSparseMatrix::mul(const GpuMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT) { - CHECK(a.useGpu_ && b.useGpu_) << "type not match"; - CHECK(!trans_) << "trans not supported"; - real* A_d = (real*)a.getData(); - real* B_d = (real*)b.getData(); - hl_sparse_matrix_s C_d = sMatrix_.get(); - hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N; - hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N; - - if (!a.trans_ && !b.trans_) { - CHECK(height_ == a.getHeight()); - CHECK(width_ == b.getWidth()); - CHECK(a.getWidth() == b.getHeight()); - } else if (a.trans_ && !b.trans_) { - CHECK(height_ == a.getWidth()); - CHECK(width_ == b.getWidth()); - CHECK(a.getHeight() == b.getHeight()); - } else if (!a.trans_ && b.trans_) { - CHECK(height_ == a.getHeight()); - CHECK(width_ == b.getHeight()); - CHECK(a.getWidth() == b.getWidth()); - } else { - LOG(INFO) << "Not support"; - } - int dimM = height_; - int dimN = width_; - int dimK = !b.trans_ ? 
b.getHeight() : b.getWidth(); - hl_sparse_matrix_mul( - A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT); -} - -void GpuSparseMatrix::mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - const auto a_ptr = dynamic_cast(&a); - const auto b_ptr = dynamic_cast(&b); - if (a_ptr && b_ptr) { - mul(*a_ptr, *b_ptr, scaleAB, scaleT); - } else { - LOG(FATAL) << "not supported"; - } -} - -template -void printBuf(std::ostream& os, T* a, size_t len, const char* name) { - os << "\n: " << name << " ["; - for (size_t i = 0; i < len; i++) { - os << a[i] << " "; - } - os << "]\n"; -} - -void GpuSparseMatrix::print(std::ostream& os) const { - if (format_ == SPARSE_CSC) { - int nnz = sMatrix_->nnz; - IVectorPtr rows = IVector::create(nnz, false); - IVectorPtr cols = IVector::create(width_ + 1, false); - VectorPtr value = Vector::create(nnz, false); - hl_stream_t stream = HPPL_STREAM_DEFAULT; - hl_memcpy_from_csc_matrix(value->getData(), - value->getSize(), - rows->getData(), - rows->getSize(), - cols->getData(), - cols->getSize(), - sMatrix_.get(), - stream); - hl_stream_synchronize(stream); - - printBuf(os, cols->getData(), width_ + 1, "col idx"); - printBuf(os, rows->getData(), elementCnt_, "row idx"); - printBuf(os, value->getData(), elementCnt_, "value"); - } -} - -void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) { - trans_ = src.trans_; - size_t nnz = src.getElementCnt(); - - resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); - // if have different value type, only copy rows and cols - SparseValueType vType = - valueType_ != src.getValueType() ? NO_VALUE : valueType_; - - sMatrix_->format = HL_SPARSE_CSR; - sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; - sMatrix_->rows = height_; - sMatrix_->cols = width_; - sMatrix_->nnz = nnz; - - hl_memcpy_csr_matrix(sMatrix_.get(), - vType == NO_VALUE ? NULL : src.getValue(), - src.getRows(), - src.getCols(), - stream); - - // restore type of sMatrix_ - sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; -} - -void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { - trans_ = src.trans_; - size_t nnz = src.getElementCnt(); - - resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); - - // if have different value type, only copy rows and cols - SparseValueType vType = - valueType_ != src.getValueType() ? NO_VALUE : valueType_; - - sMatrix_->format = HL_SPARSE_CSC; - sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; - sMatrix_->rows = height_; - sMatrix_->cols = width_; - sMatrix_->nnz = nnz; - - hl_memcpy_csc_matrix(sMatrix_.get(), - vType == NO_VALUE ? NULL : src.getValue(), - src.getRows(), - src.getCols(), - stream); - - // restore type of sMatrix_ - sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; -} - -void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) { - CHECK(trans_ == src.trans_); - CHECK(format_ == src.getFormat()); - resize(src.getHeight(), - src.getWidth(), - elementCnt_, - valueType_, - src.getFormat()); - - size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1; - size_t colSize = format_ == SPARSE_CSC ? 
width_ + 1 : elementCnt_; - - if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { - hl_memcpy_async( - getValue(), src.getValue(), sizeof(real) * elementCnt_, stream); - } - CHECK(getRows()); - CHECK(src.getRows()); - - hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream); - hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream); -} - -void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) { - if (format_ == SPARSE_CSR) { - copyFromCSR(src, stream); - } else { - copyFromCSC(src, stream); - } -} - -void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) { - trans_ = src.trans_; - int* srcCols = src.getCols(); - size_t nnz = std::count_if(srcCols, - srcCols + src.getElementCnt(), - [this](size_t n) { return n < this->width_; }); - resize(height_, width_, nnz, valueType_, format_); - - rows_[0] = 0; - size_t index = 0; - for (size_t r = 0; r < height_; ++r) { - for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) { - if (srcCols[i] < (int)width_) { - cols_[index] = srcCols[i]; - if (valueType_ == FLOAT_VALUE) { - value_[index] = src.getValue()[i]; - } - ++index; - } - } - rows_[r + 1] = index; - } - CHECK_EQ(index, nnz); - - sMatrix_->format = HL_SPARSE_CSR; - sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; - sMatrix_->rows = height_; - sMatrix_->cols = width_; - sMatrix_->nnz = nnz; - - hl_memcpy_csr_matrix(sMatrix_.get(), - valueType_ == NO_VALUE ? NULL : value_, - rows_, - cols_, - /*default stream = */ HPPL_STREAM_DEFAULT); -} - -void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) { - trans_ = src.trans_; - size_t nnz = src.getCols()[width_] - src.getCols()[0]; - resize(height_, width_, nnz, valueType_, format_); - - cols_[0] = 0; - for (size_t i = 0; i < width_; i++) { - cols_[i + 1] = cols_[i] + (int)(src.getRowNum(i)); - } - memcpy(rows_, src.getRows() + src.getCols()[0], sizeof(int) * nnz); - if (valueType_ == FLOAT_VALUE) { - memcpy(value_, src.getValue() + src.getCols()[0], sizeof(real) * nnz); - } - - sMatrix_->format = HL_SPARSE_CSC; - sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; - sMatrix_->rows = height_; - sMatrix_->cols = width_; - sMatrix_->nnz = nnz; - - hl_memcpy_csc_matrix(sMatrix_.get(), - valueType_ == NO_VALUE ? 
NULL : value_, - rows_, - cols_, - /*default stream = */ HPPL_STREAM_DEFAULT); -} - -void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { - if (format_ == SPARSE_CSR) { - trimFromCSR(src); - } else { - trimFromCSC(src); - } -} - -void GpuSparseMatrix::addBias(Matrix& b, real scale) { - CHECK(b.getHeight() == 1) << "the Bias should be a vector"; - hl_sparse_matrix_s A_d = sMatrix_.get(); - hl_sparse_matrix_add_bias(A_d, b.getData(), scale); -} - -void GpuSparseMatrix::add3(GpuMatrix* b) { - CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK(height_ == b->getHeight()); - CHECK(width_ == b->getWidth()); - real* B_d = b->getData(); - hl_sparse_matrix_s A_d = sMatrix_.get(); - hl_sparse_matrix_add_dense(A_d, B_d, height_, width_, 1, 0); -} - -void GpuSparseMatrix::add3(MatrixPtr b) { - if (dynamic_cast(b.get())) { - add3(dynamic_cast(b.get())); - } else { - LOG(FATAL) << "not supported"; - } -} - -void GpuSparseMatrix::zeroMem() { - CHECK(valueType_ == FLOAT_VALUE); - real* value = getValue(); - if (value == NULL) { - LOG(FATAL) << "value is nullptr"; - } - hl_matrix_zero_mem(value, elementCnt_); -} - -void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { -#ifdef PADDLE_WITH_CUDA - CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal"; - size_t numSamples = getHeight(); - size_t beam = maxVal.getWidth(); - CHECK_EQ(maxIds.getSize(), numSamples * beam); - CHECK_EQ(maxVal.getHeight(), numSamples); - CHECK_EQ(format_, SPARSE_CSR) << "Only support SPARSE_CSR"; - - hl_sparse_matrix_top_k(maxVal.getData(), - maxVal.getStride(), - maxIds.getData(), - sMatrix_.get(), - beam, - numSamples); -#endif -} - -template void GpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - sparse_non_value_t* data, - hl_stream_t stream); -template void GpuSparseMatrix::copyFrom(int64_t* ids, - int64_t* indices, - sparse_float_value_t* data, - hl_stream_t stream); -} // namespace paddle diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp deleted file mode 100644 index 4254175aabc8c32edb243d4a82c2e34c81393f74..0000000000000000000000000000000000000000 --- a/paddle/math/SparseRowMatrix.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
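The CSR/CSC resize paths above pack everything for one sparse matrix into a single buffer: (height + 1) row offsets plus nnz column indices for CSR, or (width + 1) column offsets plus nnz row indices for CSC, followed by nnz values only when the matrix is FLOAT_VALUE. A small arithmetic sketch of that sizing rule (the shape and nnz are assumed values, and real is taken to be float here although it can be double in other builds):

#include <cstddef>
#include <cstdio>

int main() {
  // Assumed example: a 1000 x 500 sparse matrix with 20000 non-zeros.
  const size_t height = 1000, width = 500, nnz = 20000;
  const size_t valueBytes = nnz * sizeof(float);  // dropped for NO_VALUE matrices
  const size_t csrBytes = (height + 1) * sizeof(int) + nnz * sizeof(int) + valueBytes;
  const size_t cscBytes = (width + 1) * sizeof(int) + nnz * sizeof(int) + valueBytes;
  std::printf("CSR buffer: %zu bytes, CSC buffer: %zu bytes\n", csrBytes, cscBytes);
  return 0;
}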
*/ - -#include "SparseRowMatrix.h" -#include "CpuSparseMatrix.h" - -#include - -#include "paddle/utils/Logging.h" - -#include "SIMDFunctions.h" - -#include "paddle/utils/Thread.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U; - -void SparseRowCpuMatrix::init(size_t height, size_t width) { - height_ = height; - if (!indexDictHandle_) { - indexDictHandle_.reset(new IndexDict); - indexDictHandle_->globalIndices.assign(height, kUnusedId_); - } - localIndices_ = &indexDictHandle_->localIndices; - globalIndices_ = indexDictHandle_->globalIndices.data(); -} - -void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - CpuMatrix::mul(a, b, this, scaleAB, scaleT); -} - -void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) { - LOG(FATAL) << "This should not be called"; -} - -void SparseRowCpuMatrix::zeroMem() { - apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); }); - clearRows(); -} - -void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) { - apply([=](real* buf, size_t len) { - CpuVector value(0, nullptr); - value.subVecFrom(buf, 0, len); - value.applyL1(learningRate, decayRate); - }); -} - -void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, - IVector& t0, - real learningRate, - int currentTime, - real decayRate, - bool useL1, - bool fini) { - std::vector& localIndices = indexDictHandle_->localIndices; - - // t0 and value are vectors - CHECK_EQ(t0.getSize(), this->height_); - CHECK_EQ(value.width_, this->height_ * this->width_); - - if (decayRate == 0.0f) { - if (fini) { - return; - } - - for (size_t i = 0; i < localIndices.size(); ++i) { - real* g = getLocalRow(i); - real* v = value.rowBuf(localIndices[i]); - for (size_t j = 0; j < this->width_; ++j) { - v[j] -= learningRate * g[j]; - } - } - return; - } // else - - if (useL1) { // L1 decay - if (fini) { - for (size_t i = 0; i < this->height_; ++i) { - real* v = value.rowBuf(i); - int* t = t0.getData() + i; - if (t[0] < currentTime) { - // W(t0) -> W(t+1) - int tDiff = currentTime - t[0]; - real delta = tDiff * learningRate * decayRate; - simd::decayL1(v, v, delta, this->width_); - } - } - return; - } // else - - for (size_t i = 0; i < localIndices.size(); ++i) { - real* g = getLocalRow(i); - real* v = value.rowBuf(localIndices[i]); - int* t = t0.getData() + localIndices[i]; - if (t[0] < currentTime) { - // W(t0) -> W(t) - int tDiff = currentTime - t[0]; - real delta = tDiff * learningRate * decayRate; - simd::decayL1(v, v, delta, this->width_); - } - - // W(t) -> W(t+1) - for (size_t j = 0; j < this->width_; ++j) { - v[j] -= learningRate * g[j]; - } - simd::decayL1(v, v, learningRate * decayRate, this->width_); - - // state update to t+1 - t[0] = currentTime + 1; - } - - } else { // L2 decay - if (fini) { - for (size_t i = 0; i < this->height_; ++i) { - real* v = value.rowBuf(i); - int* t = t0.getData() + i; - if (t[0] < currentTime) { - // W(t0) -> W(t+1) - int tDiff = currentTime - t[0]; - real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate); - for (size_t j = 0; j < this->width_; ++j) { - v[j] *= recip; - } - } - } - return; - } // else - - real recipDecay = 1.0f / (1.0f + learningRate * decayRate); - - for (size_t i = 0; i < localIndices.size(); ++i) { - real* g = getLocalRow(i); - real* v = value.rowBuf(localIndices[i]); - int* t = t0.getData() + localIndices[i]; - if (t[0] < currentTime) { - // W(t0) -> W(t) - int tDiff = currentTime - t[0]; - real recip = 1.0f / (1.0f + tDiff 
* learningRate * decayRate); - for (size_t j = 0; j < this->width_; ++j) { - v[j] *= recip; - } - } - - // W(t) -> W(t+1) - for (size_t j = 0; j < this->width_; ++j) { - v[j] = recipDecay * (v[j] - learningRate * g[j]); - } - - // state update to t+1 - t[0] = currentTime + 1; - } - } -} - -void SparseRowCpuMatrix::addTo(BaseMatrix& dest, - std::vector& ids, - size_t tid, - size_t numThreads) { - CHECK(!dest.useGpu_); - CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); - - std::vector& localIndices = indexDictHandle_->localIndices; - for (size_t i = 0; i < localIndices.size(); ++i) { - uint32_t id = localIndices[i]; - if (id % numThreads == tid) { - simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_); - ids.push_back(id); - } - } -} - -void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, - size_t tid, - size_t numThreads) { - CHECK(!dest.useGpu_); - CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); - - std::vector& localIndices = indexDictHandle_->localIndices; - for (size_t i = 0; i < localIndices.size(); ++i) { - uint32_t id = localIndices[i]; - if (id % numThreads == tid) { - dest.checkIndex(id); - simd::addTo(dest.getRow(id), getLocalRow(i), this->width_); - } - } -} - -void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) { - std::vector& localIndices = indexDictHandle_->localIndices; - for (size_t i = 0; i < localIndices.size(); ++i) { - uint32_t id = localIndices[i]; - if (id % numThreads == tid) { - memset(this->getLocalRow(i), 0, this->width_ * sizeof(real)); - } - } -} - -void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - CpuMatrix::mul( - a, b, this, scaleAB, scaleT); -} - -void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, - CpuMatrix* b, - real scaleAB, - real scaleT) { - CpuMatrix::mul(a, b, this, scaleAB, scaleT); -} - -void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { - std::vector& localIndices = indexDictHandle_->localIndices; - for (size_t i = 0; i < len; i++) { - CHECK_LT(*(ids + i), this->getHeight()) - << "id:" << *(ids + i) << "Height:" << this->getHeight() - << "sparse id value exceeds the max input dimension, " - << "it could be caused invalid input data samples"; - } - localIndices.insert(localIndices.end(), ids, ids + len); -} - -void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) { - CpuSparseMatrix* mat = dynamic_cast(input.get()); - CHECK(mat) << "only support sparse matrix"; - addRows(reinterpret_cast(mat->getCols()), - mat->getElementCnt()); -} - -void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { - std::vector& localIndices = indexDictHandle_->localIndices; - size_t numSamples = ids->getSize(); - int* index = ids->getData(); - for (size_t i = 0; i < numSamples; ++i) { - if (index[i] == -1) continue; - - unsigned int id = (unsigned int)index[i]; - CHECK_LT(id, this->getHeight()) - << "id:" << id << "Height:" << this->getHeight() - << "sparse id value exceeds the max input dimension, " - << "it could be caused invalid input data samples"; - localIndices.push_back(id); - } -} - -void SparsePrefetchRowCpuMatrix::setupIndices() { - auto& localIndices = indexDictHandle_->localIndices; - uniqueIds(localIndices); - // for each sparse row - for (size_t id = 0; id < localIndices.size(); ++id) { - globalIndices_[localIndices[id]] = id; // sparse row -> local id - } - checkStoreSize(); -} - -void SparseRowCpuMatrix::checkIndices() { - std::vector& localIndices = indexDictHandle_->localIndices; - for (size_t i = 
0; i < localIndices.size(); ++i) { - CHECK_EQ(globalIndices_[localIndices[i]], i); - } - checkStoreSize(); -} - -} // namespace paddle diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h deleted file mode 100644 index cf6779e8b0b1d6b0c13b21a08ffff5af76e57ba6..0000000000000000000000000000000000000000 --- a/paddle/math/SparseRowMatrix.h +++ /dev/null @@ -1,341 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_MOBILE_INFERENCE - -#include -#include -#include -#include "Matrix.h" -#include "RowBuffer.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -/** - * Sparse Row - */ -class SparseRowCpuMatrix : public CpuMatrix { - public: - struct IndexDict { - // In the following, global id means the row id in the original matrix. - // Local id means the row id in the local storage which only contains - // the sparse rows. - std::vector localIndices; // local id -> global id - std::vector globalIndices; // global id -> local id - }; - typedef std::shared_ptr IndexDictPtr; - - /// heightStore is max number of rows of the sparse matrix. - SparseRowCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - IndexDictPtr indexDictHandle = nullptr, - bool trans = false) - : CpuMatrix(nullptr, height, width, trans), - indexDictHandle_(indexDictHandle) { - init(height, width); - buf_.reset(new RowBuffer(dataHandle, width)); - } - - virtual ~SparseRowCpuMatrix() {} - - public: - /** - * Get the row buf - * - * @param row row id in the original matrix - */ - real* getRow(size_t row) { - CHECK_NE(globalIndices_[row], kUnusedId_); - return getLocalRow(globalIndices_[row]); - } - - /** - * Get the row buf - * - * @param row row id in local storage - */ - real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); } - - /** - * reserve the storage for rows according to current size of - * indexDictHandle. - * - * This is only used when SparseRowCpuMatrix is constructed with - * indexDictHandle. - */ - void reserveStore() { buf_->resize(localIndices_->size()); } - - // row is the row id in the original matrix - virtual real* getRowBuf(size_t row) { return getRow(row); } - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - /** - * Fill data according to row indexs added, setup indices inside. - * - * *src* and *size* are data and size of normal dense CpuMatrix. - */ - virtual void copyFrom(const real* src, size_t size); - virtual void zeroMem(); - - /** - * apply L1 to all sparse rows, should be apply after indices ready. - */ - virtual void applyL1(real learningRate, real decayRate); - - void clearIndices() { clearRows(); } - void zeroMemThread(size_t tid, size_t numThreads); - - /** - * value -= grad * learningRate, this is gradient. - * - * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall. 
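A small numeric sketch of the lazy decay bookkeeping this comment describes, mirroring the formulas in the deleted SparseRowMatrix.cpp (the learning rate, decay rate, gradient and timestamps below are assumed values): a row that was last touched at batch t0 first gets one catch-up decay step scaled by the elapsed time, and only then the regular gradient step.

#include <cstdio>

int main() {
  const float learningRate = 0.1f, decayRate = 0.01f;  // assumed hyper-parameters
  const int currentTime = 40, t0 = 25;                 // row last updated at batch 25
  const int tDiff = currentTime - t0;

  float w = 2.0f, grad = 0.5f;                         // one weight of the sparse row
  // L2 path: W(t0) -> W(t) catch-up, then W(t) -> W(t+1) with the gradient.
  w *= 1.0f / (1.0f + tDiff * learningRate * decayRate);
  w = (w - learningRate * grad) / (1.0f + learningRate * decayRate);
  // The L1 path instead calls decayL1 with delta = tDiff * learningRate * decayRate
  // for the catch-up, followed by the plain gradient step and a per-step decayL1.
  std::printf("updated weight: %f, next t0: %d\n", w, currentTime + 1);
  return 0;
}

Calling sgdUpdate once more with fini = true at the end of a pass runs only the catch-up half over every row, so all rows finish the pass decayed up to the same timestamp.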
- * - * t0 is a int vector used by L1/L2 decay, size = height of parameter - * matrix, - * store the time that each weight row last updated. - * - * Time is batchId, currentTime is current batchId. - * - * While pass finished, caller should call this func one more time - * with (fini=true) to let weight decay catch up current time. - */ - void sgdUpdate(BaseMatrix& value, - IVector& t0, - real learningRate, - int currentTime, - real decayRate, - bool useL1, - bool fini = false); - - /** - * merge rows in *this* to *dest* for designated thread - * - * values add to *dest* matrix - * - * ids occured in *this* append to *ids* - * filtered by (id % numThreads == tid) - */ - void addTo(BaseMatrix& dest, - std::vector& ids, - size_t tid, - size_t numThreads); - - /** - * the second version addTo(), *dest* is a SparseRowCpuMatrix. - * - * The dest's indices should be setup already, addTo() will - * check src ids is exist in dest's indices. - */ - void addTo(SparseRowCpuMatrix& dest, size_t tid, size_t numThreads); - - const IndexDictPtr& getIndexDictHandle() const { return indexDictHandle_; } - - /** - * check all local and global indices consistency - */ - void checkIndices(); - /** - * check whether row *i* exist in indices - */ - void checkIndex(size_t i) { - size_t localId = globalIndices_[i]; - CHECK_LT(localId, localIndices_->size()); - CHECK_EQ((*localIndices_)[localId], i); - } - - std::vector& getLocalIndices() const { - return indexDictHandle_->localIndices; - } - - protected: - template - void apply(Func f) { - f(buf_->data(), localIndices_->size() * width_); - } - - void init(size_t height, size_t width); - - /// clear row indices. - void clearRows() { - for (auto id : *localIndices_) { - globalIndices_[id] = kUnusedId_; - } - localIndices_->clear(); - buf_->clear(); - } - - inline void checkStoreSize() { - if (buf_->isAutoGrowth()) { - if (buf_->getRowCount() > 0.5 * height_) { - LOG(WARNING) << "There are more than 0.5*height (" - << localIndices_->size() << ") rows are used for sparse " - << "update, which is not efficient. Considering not use " - << "sparse_update."; - } - } else { - CHECK_LE(localIndices_->size(), buf_->getRowCount()); - } - } - - std::unique_ptr buf_; - IndexDictPtr indexDictHandle_; - std::vector* localIndices_; // =&indexDictHandle_->localIndices - unsigned int* globalIndices_; // =indexDictHandle_->globalIndices.data(); - static const unsigned int kUnusedId_; -}; - -class SyncThreadPool; - -/// For prefetching parameters from remote Parameter server -class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix { - public: - SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - IndexDictPtr indexDictHandle = nullptr, - SyncThreadPool* pool = nullptr, - bool trans = false) - : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans), - pool_(pool) {} - - /** - * Extract feature ids from *input*, to fill row indexs. - * - * *input* must be sparse matrix. - * - * Can call many times before setup. - */ - void addRows(MatrixPtr input); - void addRows(IVectorPtr ids); - - /** - * setup global indices of SparseRowMatrix after finish add rows. 
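The addTo() and zeroMemThread() members documented above split work by row ownership: a worker thread only touches rows whose global id maps to it. A small self-contained sketch of that ownership rule (illustrative, not the original implementation):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

// Visit every locally stored sparse row owned by worker 'tid'. Ownership is
// the same rule used above (global id modulo the number of threads), so two
// threads never write the same destination row concurrently.
void forEachOwnedRow(const std::vector<uint32_t>& localIndices,
                     size_t tid, size_t numThreads,
                     const std::function<void(size_t, uint32_t)>& visit) {
  for (size_t i = 0; i < localIndices.size(); ++i) {
    const uint32_t globalId = localIndices[i];
    if (globalId % numThreads == tid) {
      visit(i, globalId);  // local row index, global row id
    }
  }
}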
- */ - void setupIndices(); - - protected: - void addRows(const unsigned int* ids, size_t len); - SyncThreadPool* pool_; -}; - -class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix { - public: - SparseAutoGrowRowCpuMatrix(size_t height, - size_t width, - IndexDictPtr indexDictHandle = nullptr, - bool trans = false) - : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {} - - real* getRow(size_t row) { - auto id = globalIndices_[row]; - if (id == kUnusedId_) { - id = globalIndices_[row] = localIndices_->size(); - localIndices_->push_back(row); - checkStoreSize(); - } - return getLocalRow(id); - } - - virtual real* getRowBuf(size_t row) { return getRow(row); } - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); -}; - -class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix { - public: - CacheRowCpuMatrix(size_t height, - size_t width, - IndexDictPtr indexDictHandle = nullptr, - bool trans = false) - : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans), - sourceData_(nullptr) {} - - void setSourceData(CpuVectorPtr sourceVec) { - sourceDataVec_ = sourceVec; - sourceData_ = sourceVec->getData(); - } - - real* getRow(size_t row) { - auto id = globalIndices_[row]; - if (id == kUnusedId_) { - id = globalIndices_[row] = localIndices_->size(); - localIndices_->push_back(row); - checkStoreSize(); - memcpy( - getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_); - } - return getLocalRow(id); - } - - virtual real* getRowBuf(size_t row) { return getRow(row); } - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - public: - CpuVectorPtr sourceDataVec_; - real* sourceData_; -}; - -/** - * Sparse Row Ids Matrix. - * - * mostly same as CpuMatrix, but maintain sparse row ids occured, - * ids are hashed by worker thread id. - */ -class SparseRowIdsCpuMatrix : public CpuMatrix { - public: - SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) {} - - void setNumOfThreads(size_t numOfThreads) { idsArray_.resize(numOfThreads); } - - std::vector& getIds(size_t threadId) { return idsArray_[threadId]; } - - private: - std::vector> idsArray_; -}; - -} // namespace paddle - -#else -namespace paddle { - -class SparseRowCpuMatrix : public CpuMatrix { - public: - void reserveStore() {} - void clearIndices() {} -}; - -class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix { - public: - void setupIndices() {} - void addRows(MatrixPtr input) {} - void addRows(IVectorPtr ids) {} -}; - -class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {}; -class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {}; -class SparseRowIdsCpuMatrix : public CpuMatrix {}; - -} // namespace paddle - -#endif diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp deleted file mode 100644 index 5982bf2e5637ff4b4af6baae47e40b68e0c07c86..0000000000000000000000000000000000000000 --- a/paddle/math/Storage.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Storage.h" -#include "Allocator.h" -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -#ifndef PADDLE_MOBILE_INFERENCE -DEFINE_int32(pool_limit_size, - 536870912, - "maximum memory size managed by a memory pool, default is 512M"); -#else -DEFINE_int32(pool_limit_size, 0, "default is 0"); -#endif - -namespace paddle { - -// Initialization StorageEngine singleton. -// Other modules may rely on storage management, -// so StorageEngine need to be initialized before other modules. -static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, - std::numeric_limits::max()); - -StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} - -StorageEngine::~StorageEngine() { - delete cpuAllocator_; - for (auto it : gpuAllocator_) { - delete it; - } -} - -StorageEngine* StorageEngine::singleton() { - static StorageEngine storage; - return &storage; -} - -PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { - { - // if gpuAllocator_ has been constructed - ReadLockGuard guard(lock_); - if (deviceId < static_cast(gpuAllocator_.size()) && - (gpuAllocator_[deviceId] != nullptr)) { - return gpuAllocator_[deviceId]; - } - } - - { - // Construct gpuAllocator_ - std::lock_guard guard(lock_); - if (deviceId >= static_cast(gpuAllocator_.size())) { - gpuAllocator_.resize(deviceId + 1); - } - if (gpuAllocator_[deviceId] == nullptr) { - std::string name = - "gpu" + str::to_string(deviceId) + std::string("_pool"); - gpuAllocator_[deviceId] = - new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); - } - return gpuAllocator_[deviceId]; - } -} - -PoolAllocator* StorageEngine::getCpuAllocator() { - { - // if cpuAllocator_ has been constructed - ReadLockGuard guard(lock_); - if (cpuAllocator_ != nullptr) { - return cpuAllocator_; - } - } - - { - // Construct cpuAllocator_ - std::lock_guard guard(lock_); - if (cpuAllocator_ == nullptr) { - if (FLAGS_use_gpu) { - cpuAllocator_ = new PoolAllocator( - new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool"); - } else { - cpuAllocator_ = new PoolAllocator( - new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool"); - } - } - return cpuAllocator_; - } -} - -} // namespace paddle diff --git a/paddle/math/Storage.h b/paddle/math/Storage.h deleted file mode 100644 index 61a9aa2a07442d9e4ede80c961e17e079eb8b3ba..0000000000000000000000000000000000000000 --- a/paddle/math/Storage.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
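getGpuAllocator() and getCpuAllocator() above both follow the same two-phase lookup: try a read lock for the already-constructed case, then take an exclusive lock to construct the allocator lazily. A sketch of that pattern with standard-library primitives (the removed code uses paddle's RWLock/ReadLockGuard rather than std::shared_mutex, and a stand-in type is used here for PoolAllocator):

#include <memory>
#include <mutex>
#include <shared_mutex>

struct PoolAllocatorStub {};  // stand-in for PoolAllocator in this sketch

class LazyAllocator {
 public:
  PoolAllocatorStub* get() {
    {
      std::shared_lock<std::shared_mutex> guard(lock_);  // fast path: already built
      if (allocator_) return allocator_.get();
    }
    std::unique_lock<std::shared_mutex> guard(lock_);    // slow path: build once
    if (!allocator_) allocator_.reset(new PoolAllocatorStub());
    return allocator_.get();
  }

 private:
  std::shared_mutex lock_;
  std::unique_ptr<PoolAllocatorStub> allocator_;
};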
*/ - -#pragma once - -#include -#include -#include "PoolAllocator.h" -#include "paddle/utils/Locks.h" - -namespace paddle { - -/** - * @brief Storage manager for multiple devices. - */ -class StorageEngine { - public: - /** - * @return Storage singleton - */ - static StorageEngine* singleton(); - - /** - * @return return one gpu allocator by deviceId - */ - PoolAllocator* getGpuAllocator(int deviceId); - - /** - * @return return cpu allocator - */ - PoolAllocator* getCpuAllocator(); - - protected: - StorageEngine(); - ~StorageEngine(); - RWLock lock_; - std::vector gpuAllocator_; - PoolAllocator* cpuAllocator_; -}; - -} // namespace paddle diff --git a/paddle/math/TensorAssign.h b/paddle/math/TensorAssign.h deleted file mode 100644 index 7d4726ddba43202970c37dd1a08f842104b24ada..0000000000000000000000000000000000000000 --- a/paddle/math/TensorAssign.h +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/utils/Logging.h" - -namespace paddle { - -/** - * \brief Tensor Assign Expression(return by lazyAssign, - * and evaluated by AssignEvaluate) - */ -template -class TensorAssignOp { - public: - explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs) - : lhs_(lhs), rhs_(rhs) { -#ifndef __CUDA_ARCH__ - CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); - CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); - CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); -#endif - } - - INLINE void apply(const int i, const int j) { - lhs_.applyRef(i, j) = rhs_.apply(i, j); - } - INLINE void apply(const int index) { - lhs_.applyRef(index) = rhs_.apply(index); - } - - INLINE size_t getWidth() const { return lhs_.getWidth(); } - INLINE size_t getHeight() const { return rhs_.getHeight(); } - INLINE bool isContiguous() const { - return lhs_.isContiguous() && rhs_.isContiguous(); - } - INLINE bool useGpu() const { return lhs_.useGpu(); } - - private: - TensorApply lhs_; - TensorApply rhs_; -}; - -template -void AssignCpuEvaluate(int height, - int width, - bool isContiguous, - Assign&& assign, - AssignOp&&... args) { - if (isContiguous) { - int size = height * width; - for (int index = 0; index < size; index++) { - assign.apply(index); - __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...}; - } - } else { - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - assign.apply(i, j); - __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...}; - } - } - } -} - -#ifdef __NVCC__ -template -__global__ void AssignGpuEvaluate1(const int border, - Assign assign, - AssignOp... args) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < border) { - assign.apply(idx); - __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...}; - } -} - -template -__global__ void AssignGpuEvaluate2(const int height, - const int width, - Assign assign, - AssignOp... 
args) { - const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; - const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; - for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) { - for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) { - assign.apply(i, j); - __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...}; - } - } -} -#endif - -/** - * \brief Evaluate one or more TensorAssignOp objects. - * - * \note At least one assignment expression is required - */ -template -void AssignEvaluate(Assign&& assign, AssignOp&&... args) { - const bool useGpu_ = assign.useGpu(); - bool isContiguous_ = assign.isContiguous(); - const size_t height = assign.getHeight(); - const size_t width = assign.getWidth(); - - const int packSize = sizeof...(args); - const bool packUseGpu[] = {((args)).useGpu()...}; - const bool packIsContiguous[] = {((args)).isContiguous()...}; - const size_t packHeight[] = {((args)).getHeight()...}; - const size_t packWidth[] = {((args)).getWidth()...}; - - for (int i = 0; i < packSize; i++) { - CHECK_EQ(useGpu_, packUseGpu[i]); - CHECK_EQ(height, packHeight[i]); - CHECK_EQ(width, packWidth[i]); - isContiguous_ = isContiguous_ && packIsContiguous[i]; - } - - if (useGpu_) { -#ifdef __NVCC__ - if (isContiguous_) { - int size = height * width; - int blockSize = size <= 1024 ? size : 1024; - int gridSize = (size + 1024 - 1) / 1024; - AssignGpuEvaluate1<<>>( - size, assign, args...); - } else { - int blockSizeY = std::min(32, (int)height); - int blockSizeX = (32 / blockSizeY) * 32; - int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX); - int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY); - dim3 threads(blockSizeX, blockSizeY); - dim3 grid(gridSizeX, gridSizeY); - AssignGpuEvaluate2<<>>( - height, width, assign, args...); - } - - CHECK_SYNC("AssignEvaluate failed"); -#endif - } else { - AssignCpuEvaluate(height, width, isContiguous_, assign, args...); - } -} - -} // namespace paddle diff --git a/paddle/math/TensorEvaluate.h b/paddle/math/TensorEvaluate.h deleted file mode 100644 index 2a722016e777a131ef14636a6871d29d9b131044..0000000000000000000000000000000000000000 --- a/paddle/math/TensorEvaluate.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "hl_base.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -/** - * \brief The tensor cpu evaluate api. 
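AssignCpuEvaluate and the AssignGpuEvaluate kernels above all rely on the same pre-C++17 trick to evaluate every expression of a variadic pack at one index: expand the pack inside an initializer list. In isolation the idiom looks like this:

#include <cstdio>

// Apply each callable in the pack to 'index'. The initializer list forces
// evaluation of (fn(index), 0) for every pack element; the leading 0 keeps
// the array non-empty when the pack is empty.
template <typename... Fn>
void applyAll(int index, Fn&&... fn) {
  int dummy[] = {0, (fn(index), 0)...};
  (void)dummy;
}

int main() {
  applyAll(3,
           [](int i) { std::printf("first expression at %d\n", i); },
           [](int i) { std::printf("second expression at %d\n", i); });
  return 0;
}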
- */ -template -inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) { - TensorApply lhs_(lhs); - TensorApply rhs_(rhs); - CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); - CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); - CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); - - int height = lhs_.getHeight(); - int width = lhs_.getWidth(); - if (lhs_.isContiguous() && rhs_.isContiguous()) { - int size = height * width; - for (int index = 0; index < size; index++) { - lhs_.applyRef(index) = rhs_.apply(index); - } - } else { - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - lhs_.applyRef(i, j) = rhs_.apply(i, j); - } - } - } -} - -#ifdef __NVCC__ -template -__global__ void TensorElementWiseOp(LeftType lhs, - RightType rhs, - const int border) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < border) { - lhs.applyRef(idx) = rhs.apply(idx); - } -} - -template -__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) { - const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; - const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; - for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) { - for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) { - lhs.applyRef(i, j) = rhs.apply(i, j); - } - } -} - -/** - * \brief The tensor gpu evaluate api. - */ -template -inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) { - TensorApply lhs_(lhs); - TensorApply rhs_(rhs); - CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); - CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); - CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); - - int dimM = lhs_.getHeight(); - int dimN = lhs_.getWidth(); - - if (lhs_.isContiguous() && rhs_.isContiguous()) { - int size = dimM * dimN; - int blockSize = size <= 1024 ? size : 1024; - int gridSize = (size + 1024 - 1) / 1024; - TensorElementWiseOp<<>>( - lhs_, rhs_, size); - } else { - int blockSizeY = std::min(32, dimM); - int blockSizeX = (32 / blockSizeY) * 32; - int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); - int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); - dim3 threads(blockSizeX, blockSizeY); - dim3 grid(gridSizeX, gridSizeY); - TensorElementWiseOp<<>>(lhs_, rhs_); - } - - CHECK_SYNC("TensorGpuApply failed"); -} -#else -template -inline void TensorGpuApply(LeftType& lhs, RightType& rhs) { - LOG(FATAL) << "Since it is gcc compiled, " - "this calculation does not support GPU implementation."; -} -#endif - -} // namespace paddle diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h deleted file mode 100644 index f6da9adfca50e49ca260e20313c8979a38e1b06b..0000000000000000000000000000000000000000 --- a/paddle/math/TensorExpression.h +++ /dev/null @@ -1,446 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
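TensorGpuApply above (and AssignEvaluate earlier) pick CUDA launch dimensions with the same rule: contiguous data gets a one-dimensional launch of at most 1024 threads per block, everything else gets a capped 32x32 two-dimensional grid. The one-dimensional half of that rule, written out on its own:

// Launch shape used for contiguous tensors: one thread per element, blocks of
// up to 1024 threads, and ceil(size / 1024) blocks (sketch of the code above).
struct Launch1D {
  int gridSize;
  int blockSize;
};

Launch1D contiguousLaunch(int size) {
  Launch1D cfg;
  cfg.blockSize = size <= 1024 ? size : 1024;
  cfg.gridSize = (size + 1024 - 1) / 1024;  // ceiling division
  return cfg;
}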
*/ - -#pragma once -#include -#include -#include "hl_tensor_ops.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -template -class TensorConstant; -template -class TensorUnaryOp; -template -class TensorBinaryOp; -template -class TensorTernaryOp; - -template -class TensorAssignOp; - -/** - * \brief Tensor base class. - * - * This is the base class of all Tensor and Expression class. - */ -template -class TensorExpression { - public: - /** - * Element wise unary expression. - */ - template - const TensorUnaryOp unaryExpression( - const UnaryOp& op) const { - return TensorUnaryOp(op, derived()); - } - - const TensorUnaryOp, const Derived, T> operator+( - T p) const { - return unaryExpression(hppl::unary::add_scale(p)); - } - - const TensorUnaryOp, const Derived, T> operator-( - T p) const { - return unaryExpression(hppl::unary::sub_scale(p)); - } - - const TensorUnaryOp, const Derived, T> operator*( - T p) const { - return unaryExpression(hppl::unary::mul_scale(p)); - } - - const TensorUnaryOp, const Derived, T> operator/( - T p) const { - return unaryExpression(hppl::unary::div_scale(p)); - } - - const TensorUnaryOp, const Derived, T> operator-() const { - return unaryExpression(hppl::unary::neg()); - } - - const TensorUnaryOp, const Derived, T> exp() const { - return unaryExpression(hppl::unary::exp_op()); - } - - const TensorUnaryOp, const Derived, T> log() const { - return unaryExpression(hppl::unary::log_op()); - } - - const TensorUnaryOp, const Derived, T> sqrt() const { - return unaryExpression(hppl::unary::sqrt_op()); - } - - const TensorUnaryOp, const Derived, T> square() const { - return unaryExpression(hppl::unary::square()); - } - - const TensorUnaryOp, const Derived, T> reciprocal() - const { - return unaryExpression(hppl::unary::reciprocal()); - } - - const TensorUnaryOp, const Derived, T> abs() const { - return unaryExpression(hppl::unary::abs()); - } - - const TensorUnaryOp, const Derived, T> sign() const { - return unaryExpression(hppl::unary::sign()); - } - - const TensorUnaryOp, const Derived, T> pow(T p) const { - return unaryExpression(hppl::unary::pow_op(p)); - } - - const TensorUnaryOp, const Derived, T> min(T p) const { - return unaryExpression(hppl::unary::min(p)); - } - - const TensorUnaryOp, const Derived, T> max(T p) const { - return unaryExpression(hppl::unary::max(p)); - } - - const TensorUnaryOp, const Derived, T> operator==( - T p) const { - return unaryExpression(hppl::unary::cmp_eq(p)); - } - - const TensorUnaryOp, const Derived, T> operator!=( - T p) const { - return unaryExpression(hppl::unary::cmp_ne(p)); - } - - const TensorUnaryOp, const Derived, T> operator<=( - T p) const { - return unaryExpression(hppl::unary::cmp_le(p)); - } - - const TensorUnaryOp, const Derived, T> operator<( - T p) const { - return unaryExpression(hppl::unary::cmp_lt(p)); - } - - const TensorUnaryOp, const Derived, T> operator>=( - T p) const { - return unaryExpression(hppl::unary::cmp_ge(p)); - } - - const TensorUnaryOp, const Derived, T> operator>( - T p) const { - return unaryExpression(hppl::unary::cmp_gt(p)); - } - - const TensorUnaryOp, const Derived, T> operator&&( - T p) const { - return unaryExpression(hppl::unary::and_op(p)); - } - - const TensorUnaryOp, const Derived, T> operator||( - T p) const { - return unaryExpression(hppl::unary::or_op(p)); - } - - /** - * Element wise binary expression. 
- */ - template - const TensorBinaryOp - binaryExpression(const BinaryOp& op, const ExpressionType& expr) const { - return TensorBinaryOp( - op, derived(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator==(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::cmp_eq(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator!=(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::cmp_ne(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator<=(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::cmp_le(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator<(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::cmp_lt(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator>=(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::cmp_ge(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator>(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::cmp_gt(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator&&(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::and_op(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator||(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::or_op(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator+(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::add(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator-(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::sub(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator*(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::mul(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - operator/(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::div(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - min(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::min(), expr); - } - - template - const TensorBinaryOp, - const Derived, - const ExpressionType, - T> - max(const ExpressionType& expr) const { - return binaryExpression(hppl::binary::max(), expr); - } - - /** - * Element wise ternary expression. - * - * ternary conditional operator(?: operator). - * The conditional expression returns one of two values depending on - * the result of derived expression. - * If derived expression evaluates to true, then expression1 is evaluated. - * If derived expression evaluates to false, then expression2 is evaluated. 
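The ternary condition() expression documented above is the element-wise counterpart of C++'s ?: operator. Per element it reduces to the scalar sketch below, which is how the AdaMax update later in this patch computes an element-wise maximum via (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()):

#include <cmath>

// Scalar reduction of the element-wise condition() expression (sketch only).
inline float conditionScalar(bool predicate, float ifTrue, float ifFalse) {
  return predicate ? ifTrue : ifFalse;
}

// Per-element effect of the AdaMax norm update quoted in the lead-in above:
inline float adamaxNorm(float u, float g, float beta2) {
  return conditionScalar(beta2 * u > std::fabs(g), beta2 * u, std::fabs(g));
}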
- */ - template - const TensorTernaryOp - condition(const ExprType1& expr1, const ExprType2& expr2) const { - return TensorTernaryOp( - derived(), expr1, expr2); - } - - template - const TensorTernaryOp< - const Derived, - const TensorConstant, const Derived, T>, - const ExprType, - T> - condition(T p, const ExprType& expr) const { - return condition(constant(p), expr); - } - - template - const TensorTernaryOp< - const Derived, - const ExprType, - const TensorConstant, const Derived, T>, - T> - condition(const ExprType& expr, T p) const { - return condition(expr, constant(p)); - } - - const TensorTernaryOp< - const Derived, - const TensorConstant, const Derived, T>, - const TensorConstant, const Derived, T>, - T> - condition(T p1, T p2) const { - return condition(constant(p1), constant(p2)); - } - - /** - * return a TensorConstant. A TensorConstant object hold a constant value. - */ - const TensorConstant, const Derived, T> constant( - T p) const { - return TensorConstant, const Derived, T>( - hppl::unary::constant(p), derived()); - } - - /** - * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more - * TensorAssignOp objects. - */ - template - TensorAssignOp lazyAssign( - const ExpressionType& expr) const { - return TensorAssignOp(derived(), expr); - } - - protected: - const Derived& derived() const { return *static_cast(this); } -}; - -/** - * \brief Unary Operator Expression - */ -template -class TensorUnaryOp - : public TensorExpression, T> { - public: - explicit TensorUnaryOp(const OP op, const ExprType& expr) - : op_(op), expr_(expr) {} - - const OP op_; - const ExprType expr_; -}; - -/** - * \brief Binary Operator Expression - */ -template -class TensorBinaryOp - : public TensorExpression, T> { - public: - explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs) - : op_(op), lhs_(lhs), rhs_(rhs) {} - - const OP op_; - const LhsType lhs_; - const RhsType rhs_; -}; - -/** - * \brief Ternary Operator Expression - */ -template -class TensorTernaryOp : public TensorExpression< - TensorTernaryOp, - T> { - public: - explicit TensorTernaryOp(const ExprType1& expr1, - const ExprType2& expr2, - const ExprType3& expr3) - : expr1_(expr1), expr2_(expr2), expr3_(expr3) {} - - const ExprType1 expr1_; - const ExprType2 expr2_; - const ExprType3 expr3_; -}; - -/** - * \brief Constant Expression - */ -template -class TensorConstant - : public TensorExpression, T> { - public: - explicit TensorConstant(const OP op, const ExprType& expr) - : op_(op), expr_(expr) {} - - const OP op_; - const ExprType expr_; -}; - -/** - * \brief operator+ overload - * \return a unary operator expression - */ -template -const TensorUnaryOp, const Derived, T> operator+( - T p, const TensorExpression& expr) { - return expr + p; -} - -/** - * \brief operator* overload - * \return a unary operator expression - */ -template -const TensorUnaryOp, const Derived, T> operator*( - T p, const TensorExpression& expr) { - return expr * p; -} - -} // namespace paddle - -#include "TensorApply.h" -#include "TensorEvaluate.h" diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu deleted file mode 100644 index b844768d3b9fd05b5a0eada5e315b9e91588a4ee..0000000000000000000000000000000000000000 --- a/paddle/math/TrainingAlgorithmOp.cu +++ /dev/null @@ -1,356 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "BaseMatrix.h" -#include "TrainingAlgorithmOp.h" -#include "paddle/utils/Logging.h" - -#if __cplusplus > 199711L - -#include "TensorAssign.h" - -namespace paddle { - -void sparseMomentumApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& momU, - BaseMatrix& momV, - real alpha, - real beta, - real gamma, - real tau, - real learningRate) { - auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad); - auto expr2 = - momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad); - auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU + - ((real)1 / beta) * momV); - - AssignEvaluate(expr1, expr2, expr3); -} - -void adadeltaApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum, - BaseMatrix& accum_update, - BaseMatrix& lr, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate) { - auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square()); - auto expr2 = - lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt()); - auto expr3 = accum_update.lazyAssign(rou * accum_update + - ((real)1 - rou) * (grad * lr).square()); - auto expr4 = mom.lazyAssign(mom * momentum - - learningRate * lr * (grad + value * decayRate)); - auto expr5 = value.lazyAssign(value + mom); - - AssignEvaluate(expr1, expr2, expr3, expr4, expr5); -} - -void adagradApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum_buffer, - BaseMatrix& accum, - BaseMatrix& lr, - real epsilon, - real learningRate, - real momentum, - real decayRate) { - auto expr1 = accum.lazyAssign(accum + grad.square()); - auto expr2 = - lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign(mom * momentum - - learningRate * lr * (grad + value * decayRate)); - auto expr4 = value.lazyAssign(value + mom); - - AssignEvaluate(expr1, expr2, expr3, expr4); -} - -void rmspropApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& g, - BaseMatrix& f, - BaseMatrix& lr, - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad); - auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal()); - auto expr4 = mom.lazyAssign(mom * momentum - - learningRate * lr * (grad + value * decayRate)); - auto expr5 = value.lazyAssign(value + mom); - - if (firstTime) { - auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square()); - - AssignEvaluate(expr1, expr2, expr3, expr4, expr5); - } else { - auto expr1 = - g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square()); - - AssignEvaluate(expr1, expr2, expr3, expr4, expr5); - } -} - -void decayedAdagradApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum, - BaseMatrix& lr, - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal()); - auto expr3 = mom.lazyAssign(mom * momentum - 
- learningRate * lr * (grad + value * decayRate)); - auto expr4 = value.lazyAssign(value + mom); - - if (firstTime) { - auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square()); - - AssignEvaluate(expr1, expr2, expr3, expr4); - } else { - auto expr1 = accum.lazyAssign(accumulatedRou * accum + - ((real)1 - rou) * grad.square()); - - AssignEvaluate(expr1, expr2, expr3, expr4); - } -} - -void adamApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, // firse moment - BaseMatrix& v, // second moment - real beta1, - real beta2, - real beta1_power, - real beta2_power, - real epsilon, - real learningRate) { - real alpha = - learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); - - auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); - auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square()); - auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon)); - - AssignEvaluate(expr1, expr2, expr3); -} - -void adamaxApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, // firse moment - BaseMatrix& u, // weighted infinity norm - real beta1, - real beta2, - int64_t step, - real alpha) { - auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad); - auto expr2 = - u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs())); - auto expr3 = value.lazyAssign( - value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u)); - - AssignEvaluate(expr1, expr2, expr3); -} - -} // namespace paddle - -#else - -namespace paddle { - -void sparseMomentumApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& momU, - BaseMatrix& momV, - real alpha, - real beta, - real gamma, - real tau, - real learningRate) { - /** - * \alpha_t = \alpha_{t-1} / k - * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t) - * u_t = u_{t-1} - \alpha_t \gamma_t g_t - * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t - * \tau_t = \tau_{t-1} + \beta_t / \alpha_t - */ - momU -= (alpha * gamma * learningRate) * grad; - momV += (tau * alpha * gamma * learningRate) * grad; - value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV; -} - -void adadeltaApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum, - BaseMatrix& accum_update, - BaseMatrix& lr, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - accum = rou * accum + ((real)1 - rou) * grad.square(); - - // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon )) - lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt(); - - // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 - accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square(); - - mom = mom * momentum - learningRate * lr * (grad + value * decayRate); - value += mom; -} - -void adagradApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum_buffer, - BaseMatrix& accum, - BaseMatrix& lr, - real epsilon, - real learningRate, - real momentum, - real decayRate) { - accum += grad.square(); - lr = (accum_buffer + accum + epsilon).sqrt().reciprocal(); - mom = mom * momentum - learningRate * lr * (grad + value * decayRate); - value += mom; -} - -void rmspropApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& g, - BaseMatrix& f, - BaseMatrix& lr, - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - // E(g_t^2) 
= \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - if (firstTime) { - g = accumulatedRou * g + grad.square(); - } else { - g = accumulatedRou * g + ((real)1 - rou) * grad.square(); - } - - // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g - f = accumulatedRou * f + ((real)1 - rou) * grad; - - // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon ) - // Basiclly if the sign of the gradient changes more often, - // the learning rate will be decreased. - lr = (g - f.square() + epsilon).sqrt().reciprocal(); - - mom = mom * momentum - learningRate * lr * (grad + value * decayRate); - value += mom; -} - -void decayedAdagradApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum, - BaseMatrix& lr, - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - if (firstTime) { - accum = accumulatedRou * accum + grad.square(); - } else { - accum = accumulatedRou * accum + ((real)1 - rou) * grad.square(); - } - - // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) - // Basiclly if the bigger the magnitude gradient is, - // the smaller the learning rate will be. - lr = (accum + epsilon).sqrt().reciprocal(); - - mom = mom * momentum - learningRate * lr * (grad + value * decayRate); - value += mom; -} - -void adamApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, // firse moment - BaseMatrix& v, // second moment - real beta1, - real beta2, - real beta1_power, - real beta2_power, - real epsilon, - real learningRate) { - real alpha = - learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - mom = beta1 * mom + ((real)1 - beta1) * grad; - - // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 - v = beta2 * v + ((real)1 - beta2) * grad.square(); - - value -= (mom * alpha) / (v.sqrt() + epsilon); -} - -void adamaxApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, // firse moment - BaseMatrix& u, // weighted infinity norm - real beta1, - real beta2, - int64_t step, - real alpha) { - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - mom = beta1 * mom + ((real)1 - beta1) * grad; - - // u_t = max(\beta_2*u_{t-1}, abs(g_t)) - u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()); - - // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t - value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u); -} - -} // namespace paddle - -#endif diff --git a/paddle/math/TrainingAlgorithmOp.h b/paddle/math/TrainingAlgorithmOp.h deleted file mode 100644 index fe40fc2d36e796bd4be7b7fc1e12a6eafa5d4700..0000000000000000000000000000000000000000 --- a/paddle/math/TrainingAlgorithmOp.h +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
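Both implementations of adamApply above (the lazyAssign expression path and the plain BaseMatrix path) compute the same per-element step; as a scalar sketch, assuming real is float:

#include <cmath>

// One Adam step for a single parameter (sketch of the per-element math above).
void adamStepScalar(float& value, float grad, float& mom, float& v,
                    float beta1, float beta2,
                    float beta1_power, float beta2_power,
                    float epsilon, float learningRate) {
  const float alpha =
      learningRate * std::sqrt(1.0f - beta2_power) / (1.0f - beta1_power);
  mom = beta1 * mom + (1.0f - beta1) * grad;         // first moment m_t
  v = beta2 * v + (1.0f - beta2) * grad * grad;      // second moment v_t
  value -= mom * alpha / (std::sqrt(v) + epsilon);   // parameter update
}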
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "BaseMatrix.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -/** - * \brief Sparse Momentum optimizer. - */ -extern void sparseMomentumApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& momU, - BaseMatrix& momV, - real alpha, - real beta, - real gamma, - real tau, - real learningRate); - -/** - * \brief AdaDelta optimizer. - */ -extern void adadeltaApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& sum, - BaseMatrix& sum1, - BaseMatrix& mom, - BaseMatrix& lr, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate); - -/** - * \brief AdaGrad optimizer. - */ -extern void adagradApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& sum, - BaseMatrix& sum1, - BaseMatrix& mom, - BaseMatrix& lr, - real epsilon, - real learningRate, - real momentum, - real decayRate); - -/** - * \brief RMSProp optimizer. - */ -extern void rmspropApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& g, - BaseMatrix& f, - BaseMatrix& mom, - BaseMatrix& lr, - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime); - -/** - * \brief Decayed AdaGrad optimizer. - */ -extern void decayedAdagradApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& accum, - BaseMatrix& lr, - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime); - -/** - * \brief Adam optimizer. - */ -extern void adamApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, - BaseMatrix& v, - real beta1, - real beta2, - real beta1_power, - real beta2_power, - real epsilon, - real learningRate); - -/** - * \brief AdaMax optimizer. - */ -extern void adamaxApply(BaseMatrix& value, - BaseMatrix& grad, - BaseMatrix& mom, // firse moment - BaseMatrix& u, // weighted infinity norm - real beta1, - real beta2, - int64_t step, - real alpha); -} // namespace paddle diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp deleted file mode 100644 index 2a47ed7ef81a2e969757c244370cc346b13e1c03..0000000000000000000000000000000000000000 --- a/paddle/math/Vector.cpp +++ /dev/null @@ -1,1091 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
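rmspropApply, declared above and defined earlier in this patch, amounts per element to the following scalar sketch (accumulatedRou is taken as given, as in the removed code; real assumed to be float):

#include <cmath>

// Per-element sketch of the rmspropApply update declared above.
void rmspropStepScalar(float& value, float grad, float& mom, float& g, float& f,
                       float accumulatedRou, float rou, float epsilon,
                       float learningRate, float momentum, float decayRate,
                       bool firstTime) {
  g = firstTime ? accumulatedRou * g + grad * grad
                : accumulatedRou * g + (1.0f - rou) * grad * grad;  // E[g^2]
  f = accumulatedRou * f + (1.0f - rou) * grad;                     // E[g]
  const float lr = 1.0f / std::sqrt(g - f * f + epsilon);           // step size
  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
  value += mom;
}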
*/ - -#include "Vector.h" -#include "paddle/utils/Util.h" - -#include -#include "Matrix.h" -#include "hl_gpu.h" -#include "hl_matrix.h" -#include "hl_table_apply.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Thread.h" -#include "paddle/utils/ThreadLocal.h" - -namespace paddle { - -template -std::shared_ptr> VectorT::create(size_t size, bool useGpu) { - if (useGpu) { - return std::make_shared>(size); - } else { - return std::make_shared>(size); - } -} - -template -std::shared_ptr> VectorT::createParallelVector( - size_t size, bool useGpu, SyncThreadPool* pool) { - if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector && - size >= (size_t)FLAGS_enable_parallel_vector) { - return std::make_shared>( - size, pool ? pool : getGlobalSyncThreadPool()); - } else { - return create(size, useGpu); - } -} - -template -std::shared_ptr> VectorT::create(T* data, - size_t size, - bool useGpu) { - if (useGpu) { - return std::make_shared>(size, data); - } else { - return std::make_shared>(size, data); - } -} - -template -std::shared_ptr> VectorT::create(size_t size, - MemoryHandlePtr memoryHandle, - size_t offset) { - if (auto cpuMemHandle = - std::dynamic_pointer_cast(memoryHandle)) { - return std::make_shared>(size, cpuMemHandle, offset); - } else if (auto gpuMemHandle = - std::dynamic_pointer_cast(memoryHandle)) { - return std::make_shared>(size, gpuMemHandle, offset); - } else { - LOG(FATAL) << "Wrong"; - return NULL; - } -} - -template <> -MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { - LOG(FATAL) << "Wrong for real vector"; - return nullptr; -} - -template <> -MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { - size_t height = getSize(); - size_t width = idRange; - MatrixPtr mat = Matrix::createSparseMatrix( - height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu); - - CpuIVector cpuIds(height); - cpuIds.copyFrom(*this); - int* idData = cpuIds.getData(); - - for (decltype(height) i = 0; i < height; i++) { - const unsigned int id = idData[i]; - CHECK_LT(id, width); - mat->setRow(i, 1, &id, nullptr); - } - return mat; -} - -template <> -std::shared_ptr> VectorT::castToInt() { - std::shared_ptr> ret = IVector::create(this->getSize(), useGpu_); - if (useGpu_) { - hl_vector_cast2int(ret->getData(), this->getData(), this->getSize()); - } else { - for (size_t i = 0; i < getSize(); ++i) { - ret->getData()[i] = int(this->getData()[i]); - } - } - return ret; -} - -template -GpuVectorT::GpuVectorT(size_t size) - : VectorT(size, - std::make_shared(sizeof(T) * size), - 0, /* offset = 0 */ - true /* useGpu = true */) {} - -template -T GpuVectorT::getElement(size_t i) const { - T elem = 0; - hl_memcpy_device2host(&elem, const_cast(&this->getData()[i]), sizeof(T)); - return elem; -} -template -void GpuVectorT::setElement(size_t i, const T& value) { - hl_memcpy_host2device(&this->getData()[i], const_cast(&value), sizeof(T)); -} - -template -T* GpuVectorT::getPoint(const uint64_t beginPos) { - LOG(FATAL) << "Not implemented" << beginPos; - return NULL; -} - -template <> -int GpuVectorT::getAbsSum() { - LOG(FATAL) << "Not implemented"; - return 0; -} - -template <> -int GpuVectorT::getSum() { - LOG(FATAL) << "Not implemented"; - return 0; -} - -template <> -real GpuVectorT::getAbsSum() { - real* A = this->getData(); - real sum = 0; - hl_vector_abs_sum(A, &sum, this->getSize()); - return sum; -} - -template <> -real GpuVectorT::getSum() { - real* A = this->getData(); - real sum = 0; - hl_vector_sum(A, 
&sum, this->getSize()); - return sum; -} - -template <> -int GpuVectorT::getMax() { - CpuIVector cpuIVec = CpuIVector(this->getSize()); - copyTo(&cpuIVec); - return cpuIVec.getMax(); -} - -template <> -int GpuVectorT::getAbsMax() { - CpuIVector cpuIVec = CpuIVector(this->getSize()); - copyTo(&cpuIVec); - return cpuIVec.getAbsMax(); -} - -template -void GpuVectorT::isEqualTo(const VectorT& b, const T& value) { - BaseMatrixT::isEqualTo((BaseMatrixT&)b, value); -} - -template -void GpuVectorT::selectFrom(const VectorT& src, const VectorT& ids) { -#ifdef PADDLE_WITH_CUDA - hl_vector_select_from(this->getData(), - this->getSize(), - src.getData(), - src.getSize(), - ids.getData(), - ids.getSize()); -#endif -} - -template -real gpuRowFunc(Func f, GpuVector& v) { - static ThreadLocal>> local; - if (!*local) { - (*local).reset(new CpuVector(1)); - } - real* A = v.getData(); - f(A, (*local)->getData(), 1, v.getSize()); - return (*local)->getData()[0]; -} - -template <> -real GpuVectorT::getMax() { - return gpuRowFunc(hl_matrix_row_max, *this); -} - -template <> -real GpuVectorT::getAbsMax() { - return std::max(gpuRowFunc(hl_matrix_row_max, *this), - -gpuRowFunc(hl_matrix_row_min, *this)); -} - -template <> -int GpuVectorT::getMin() { - LOG(FATAL) << "Not implemented"; - return 0; -} - -template <> -real GpuVectorT::getMin() { - return gpuRowFunc(hl_matrix_row_min, *this); -} - -template -T GpuVectorT::get(size_t pos) { - T val = (T)0; - hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T)); - return val; -} - -template -void GpuVectorT::histogram(std::ostream& os, int type) { - LOG(FATAL) << "Not implemented"; -} - -template -void GpuVectorT::zeroMem() { - BaseMatrixT::zero(); -} - -template -void GpuVectorT::reset(const T& value) { - BaseMatrixT::assign(value); -} - -template -void GpuVectorT::fillSequence() { - LOG(FATAL) << "not implemented"; -} - -template -void GpuVectorT::copyFrom(const VectorT& src) { - src.copyTo(this); -} - -template -void GpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { - CHECK_EQ(src.getSize(), this->getSize()); - hl_memcpy_async((void*)this->getData(), - (void*)src.getData(), - sizeof(T) * this->getSize(), - stream); -} - -template -void GpuVectorT::copyFrom(const T* gpuSrc, size_t size) { - CHECK(gpuSrc != NULL); - CHECK_LE(size, this->size_); - - hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size); -} - -template -void GpuVectorT::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) { - CHECK(gpuSrc != NULL); - CHECK_LE(size, this->size_); - - hl_memcpy_async( - (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream); -} - -template -void GpuVectorT::copyTo(CpuVectorT* dest) const { - CHECK_EQ(this->getSize(), dest->getSize()); - - hl_memcpy_device2host((void*)dest->getData(), - (void*)this->getData(), - sizeof(T) * this->getSize()); -} - -template -void GpuVectorT::copyTo(GpuVectorT* dest) const { - CHECK_EQ(this->getSize(), dest->getSize()); - - hl_memcpy_device2device((void*)dest->getData(), - (void*)this->getData(), - sizeof(T) * this->getSize()); -} - -template <> -void GpuVectorT::rand() { - LOG(FATAL) << "Not implemented"; -} - -template <> -void GpuVectorT::print(std::ostream& os, size_t num) const { - IVectorPtr dest = IVector::create(this->size_, false); - hl_memcpy_device2host((void*)dest->getData(), - (void*)this->getData(), - sizeof(int) * this->getSize()); - dest->print(os, num); -} - -template <> -void GpuVectorT::print(std::ostream& os, size_t num) const { - VectorPtr dest = 
Vector::create(this->size_, false); - hl_memcpy_device2host((void*)dest->getData(), - (void*)this->getData(), - sizeof(int) * this->getSize()); - dest->print(os, num); -} - -template <> -void GpuVectorT::printOneElement(std::ostream& os, size_t idx) const { - LOG(FATAL) << "Not implemented"; -} - -template <> -void GpuVectorT::printOneElement(std::ostream& os, size_t idx) const { - LOG(FATAL) << "Not implemented"; -} - -template <> -void CpuVectorT::rand() { - LOG(FATAL) << "Not implemented"; -} -template <> -void GpuVectorT::rand(size_t classNum) { - LOG(FATAL) << "Not implemented"; -} - -template <> -void CpuVectorT::rand(size_t classNum) { - LOG(FATAL) << "Not implemented"; -} - -template <> -void GpuVectorT::rand() { - VectorPtr cPtr = Vector::create(this->size_, false); - cPtr->rand(); - - hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real)); -} - -template <> -void GpuVectorT::rand(size_t classNum) { - IVectorPtr cPtr = IVector::create(this->size_, false); - cPtr->rand(classNum); - - hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int)); -} - -template <> -void CpuVectorT::rand(size_t classNum) { - size_t size = this->getSize(); - int* data = this->getData(); - for (size_t i = 0; i < size; i++) { - data[i] = - std::min(classNum - 1, - size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum)); - } -} - -template <> -void CpuVectorT::rand() { - size_t size = this->getSize(); - real* data = this->getData(); - for (size_t i = 0; i < size; i++) { - data[i] = ::rand() * (1. / (double)RAND_MAX); - // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) * - // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 ); - } -} - -template -void CpuVectorT::randnorm(real, real) { - LOG(FATAL) << "Not implemented"; -} - -template -void CpuVectorT::uniform(real, real) { - LOG(FATAL) << "Not implemented"; -} - -template -void GpuVectorT::randnorm(real, real) { - LOG(FATAL) << "Not implemented"; -} - -template -void GpuVectorT::uniform(real, real) { - LOG(FATAL) << "Not implemented"; -} - -template <> -void CpuVectorT::randnorm(real mean, real std) { - size_t size = this->getSize(); - real* data = this->getData(); - unsigned int* seed = ThreadLocalRand::getSeed(); - auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); }; - for (size_t i = 0; i < size - 1; i += 2) { - real r1 = rand1(); - r1 = std::sqrt(-2 * std::log(r1)); - real r2 = rand1(); - data[i] = mean + std * r1 * cos(2 * M_PI * r2); - data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2); - } - real r1 = rand1(); - r1 = std::sqrt(-2 * std::log(r1)); - real r2 = rand1(); - data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2); -} - -template <> -void CpuVectorT::uniform(real left, real right) { - size_t size = this->getSize(); - real* data = this->getData(); - real range = right - left; - unsigned int* seed = ThreadLocalRand::getSeed(); - auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. 
+ RAND_MAX)); }; - for (size_t i = 0; i < size; ++i) { - data[i] = rand1() * range + left; - } -} - -template <> -void GpuVectorT::randnorm(real mean, real std) { - CpuVector cpuVec = CpuVector(this->getSize()); - cpuVec.randnorm(mean, std); - - hl_memcpy_host2device( - data_, cpuVec.getData(), this->getSize() * sizeof(real)); -} - -template <> -void GpuVectorT::uniform(real left, real right) { - CpuVector cpuVec = CpuVector(this->getSize()); - cpuVec.uniform(left, right); - - hl_memcpy_host2device( - data_, cpuVec.getData(), this->getSize() * sizeof(real)); -} - -template -CpuVectorT::CpuVectorT(size_t size) - : VectorT(size, - std::make_shared(sizeof(T) * size), - 0, /* offset = 0 */ - false /* useGpu = false */) {} - -template -CpuVectorT::CpuVectorT(const VectorT& src) - : VectorT(src.getSize(), - src.getMemoryHandle(), - 0, /* offset = 0 */ - false /* useGpu = false */) { - if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) { - this->memoryHandle_ = - std::make_shared(sizeof(T) * this->getSize()); - this->data_ = reinterpret_cast(this->memoryHandle_->getBuf()); - } - src.copyTo(this); -} - -template -T CpuVectorT::getAbsSum() { - const T* A = this->getData(); - size_t size = this->getSize(); - T sum = 0; - for (size_t i = 0; i < size; i++) { - sum += (A[i] > 0) ? A[i] : -A[i]; - } - return sum; -} - -// cannot use above version, due to precision issue of float -template <> -real CpuVectorT::getAbsSum() { - const real* A = this->getData(); - size_t size = this->getSize(); - double sum = 0; - for (size_t i = 0; i < size; i++) { - sum += (A[i] > 0) ? A[i] : -A[i]; - } - return sum; -} - -template -T CpuVectorT::getSum() { - const T* A = this->getData(); - size_t size = this->getSize(); - T sum = 0; - for (size_t i = 0; i < size; i++) { - sum += A[i]; - } - return sum; -} - -template <> -real CpuVectorT::getSum() { - const real* A = this->getData(); - size_t size = this->getSize(); - double sum = 0; - for (size_t i = 0; i < size; i++) { - sum += A[i]; - } - return sum; -} - -template -T CpuVectorT::get(size_t pos) { - return this->getData()[pos]; -} - -template -T CpuVectorT::getMax() { - const T* A = this->getData(); - size_t size = this->getSize(); - T res = A[0]; - for (size_t i = 1; i < size; i++) { - if (res < A[i]) res = A[i]; - } - return res; -} - -template -T CpuVectorT::getAbsMax() { - const T* A = this->getData(); - size_t size = this->getSize(); - T res = std::abs(A[0]); - for (size_t i = 1; i < size; i++) { - if (res < std::abs(A[i])) res = std::abs(A[i]); - } - return res; -} - -template -T CpuVectorT::getMin() { - const T* A = this->getData(); - size_t size = this->getSize(); - T res = A[0]; - for (size_t i = 1; i < size; i++) { - if (res > A[i]) res = A[i]; - } - return res; -} - -template -void CpuVectorT::isEqualTo(const VectorT& b, const T& value) { - size_t size = this->getSize(); - CHECK_EQ(b.getSize(), size); - - const T* B = b.getData(); - T* A = this->getData(); - for (size_t i = 0; i < size; i++) { - A[i] = (B[i] == value); - } -} - -template -void CpuVectorT::selectFrom(const VectorT& src, const VectorT& ids) { - size_t size = this->getSize(); - CHECK_EQ(ids.getSize(), size); - - const int* indices = ids.getData(); - const T* B = src.getData(); - T* A = this->getData(); - for (size_t i = 0; i < size; i++) { - int index = indices[i]; - CHECK_LT(index, (int)src.getSize()); - A[i] = B[index]; - } -} - -static int getSignAndExponentOfFloat(float a) { - uint32_t* pa = reinterpret_cast(&a); - return *pa >> 23; -} - -template -void 
CpuVectorT::histogram(std::ostream& os, int type) { - LOG(FATAL) << "Not implemented"; -} - -template <> -void CpuVectorT::histogram(std::ostream& os, int type) { - int counters[512]; - memset(counters, 0, sizeof(counters)); - int counterZero = 0; - - const real* A = this->getData(); - size_t size = this->getSize(); - for (size_t i = 0; i < size; i++) { - if (A[i] == 0.0f) { - ++counterZero; - } else { - ++counters[getSignAndExponentOfFloat(A[i])]; - } - } - - int64_t sum = 0; - float sizeNonZero = size - counterZero; - os << "zero:" << counterZero; - for (int i = 0; i < 256; i++) { - int counter = counters[i]; - if (counter) { - os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%"; - sum += counter * (i - 127); - } - } - for (int i = 0; i < 256; i++) { - int counter = counters[i + 256]; - if (counter) { - os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%"; - sum += counter * (i - 127); - } - } - os << ", nonzero_exponent_avg=" << sum / sizeNonZero; -} - -template -void CpuVectorT::zeroMem() { - memset(this->getData(), 0, sizeof(T) * this->getSize()); -} - -template -void CpuVectorT::reset(const T& value) { - T* A = this->getData(); - size_t size = this->getSize(); - for (size_t i = 0; i < size; i++) { - A[i] = value; - } -} - -template -void CpuVectorT::fillSequence() { - T* A = this->getData(); - size_t size = this->getSize(); - for (size_t i = 0; i < size; i++) { - A[i] = i; - } -} - -template -void CpuVectorT::copyFrom(const VectorT& src) { - src.copyTo(this); -} - -template -void CpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { - if (typeid(src) == typeid(GpuVectorT)) { - hl_memcpy_async((void*)this->getData(), - (void*)src.getData(), - sizeof(T) * this->getSize(), - stream); - // There is a need to add synchronization to ensure that the data is copied. - hl_stream_synchronize(stream); - } else { - src.copyTo(this); - } -} - -template -void CpuVectorT::copyFrom(const T* hostSrc, size_t size) { - CHECK(hostSrc != NULL); - CHECK_LE(size, this->size_); - memcpy(this->data_, hostSrc, sizeof(T) * size); -} - -template -void CpuVectorT::copyFrom(const T* hostSrc, - size_t size, - hl_stream_t stream) { - (void)stream; - - CHECK(hostSrc != NULL); - CHECK_LE(size, this->size_); - memcpy(this->data_, hostSrc, sizeof(T) * size); -} - -template -void CpuVectorT::copyTo(CpuVectorT* dest) const { - CHECK_EQ(this->getSize(), dest->getSize()); - memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize()); -} - -template -void CpuVectorT::copyTo(GpuVectorT* dest) const { - CHECK_EQ(this->getSize(), dest->getSize()); - hl_memcpy_host2device((void*)dest->getData(), - (void*)this->getData(), - sizeof(T) * this->getSize()); -} - -template <> -void CpuVectorT::print(std::ostream& os, size_t num) const { - size_t w = size_ < num ? size_ : num; - os << "["; - for (size_t i = 0; i < w; ++i) { - os << data_[i] << " "; - } - os << "]" << std::endl; -} - -template <> -void CpuVectorT::print(std::ostream& os, size_t num) const { - size_t w = size_ < num ? 
size_ : num; - os << "["; - for (size_t i = 0; i < w; ++i) { - os << (int)data_[i] << " "; - } - os << "]" << std::endl; -} - -template <> -void CpuVectorT::printOneElement(std::ostream& os, size_t idx) const { - CHECK_LT(idx, size_); - os << data_[idx] << ";"; -} - -template <> -void CpuVectorT::printOneElement(std::ostream& os, size_t idx) const { - CHECK_LT(idx, size_); - os << (int)data_[idx] << ";"; -} - -template -void ParallelCpuVectorT::parallelExec(ExecFunc func) { - LOG(FATAL) << "Not implemented"; -} - -template <> -void ParallelCpuVectorT::parallelExec(ExecFunc func) { - pool_->exec([this, func](int tid, size_t numThreads) { - auto interval = calcSplitArrayInterval( - this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); - // setup sub bufs - CpuVector subVec(0, nullptr); - subVec.subVecFrom(*this, interval); - func(subVec); - }); -} - -template -void ParallelCpuVectorT::exec(SyncThreadPool::JobFunc func) { - LOG(FATAL) << "Not implemented"; -} - -template <> -void ParallelCpuVectorT::exec(SyncThreadPool::JobFunc func) { - pool_->exec(func); -} - -template -CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) { - if (!useGpu) { - cpuVectorT_ = std::make_shared>(size); - } else { - gpuVectorT_ = std::make_shared>(size); - } - setSync(useGpu); -} - -template -CpuGpuVectorT::CpuGpuVectorT(const std::shared_ptr>& src) - : sync_(nullptr) { - bool useGpu = src->useGpu(); - if (useGpu) { - gpuVectorT_ = src; - } else { - cpuVectorT_ = src; - } - setSync(useGpu); -} - -template -CpuGpuVectorT::CpuGpuVectorT(size_t size, T* data, bool useGpu) - : sync_(nullptr) { - if (!useGpu) { - cpuVectorT_ = std::make_shared>(size, data); - setSync(DATA_AT_CPU); - } else { - gpuVectorT_ = std::make_shared>(size, data); - setSync(DATA_AT_GPU); - } -} - -template -std::shared_ptr> CpuGpuVectorT::create(size_t size, - bool useGpu) { - return std::make_shared>(size, useGpu); -} - -template -void CpuGpuVectorT::resize(size_t size, bool useGpu) { - if (useGpu) { - CHECK(gpuVectorT_) << "gpuVectorT_ is null"; - // If memoryHandle_ is nullptr, - // the data may be owned by the caller when it was constructed. - // It should not resize for this case. - if (gpuVectorT_->getMemoryHandle()) { - gpuVectorT_->resize(size); - } else { - CHECK_EQ(gpuVectorT_->getSize(), size); - } - } else { - CHECK(cpuVectorT_) << "cpuVectorT_ is null"; - // If memoryHandle_ is nullptr, - // the data may be owned by the caller when it was constructed. - // It should not resize for this case. 
- if (cpuVectorT_->getMemoryHandle()) { - cpuVectorT_->resize(size); - } else { - CHECK_EQ(cpuVectorT_->getSize(), size); - } - } - setSync(useGpu); -} - -template -void CpuGpuVectorT::resizeOrCreate(std::shared_ptr>& vec, - size_t size, - bool useGpu) { - if (vec) { - vec->resize(size, useGpu); - } else { - vec = create(size, useGpu); - } -} - -template -void CpuGpuVectorT::resizeOrCreate(size_t size, bool useGpu) { - if (useGpu && (!gpuVectorT_)) { - gpuVectorT_ = VectorT::create(size, true); - } else if ((!useGpu) && (!cpuVectorT_)) { - cpuVectorT_ = VectorT::create(size, false); - } else { - CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_)); - this->resize(size, useGpu); - } -} - -template -CpuGpuVectorT::CpuGpuVectorT(CpuGpuVectorT& src, - size_t offset, - size_t size) - : sync_(nullptr) { - CHECK_LE(offset + size, static_cast(src.getSize())); -#ifdef PADDLE_WITH_CUDA - SyncedFlag* flag = src.getSync(); - if (*flag == DATA_AT_CPU) { - src.copyToGpu(); // will set synchronous data between CPU and GPU - } else if (*flag == DATA_AT_GPU) { - src.copyToCpu(); // will set synchronous data between CPU and GPU - } -#endif - auto cMemHandle = (src.getVector(false))->getMemoryHandle(); - cpuVectorT_ = std::make_shared>( - size, std::dynamic_pointer_cast(cMemHandle), offset); -#ifdef PADDLE_WITH_CUDA - auto gMemHandle = (src.getVector(true))->getMemoryHandle(); - gpuVectorT_ = std::make_shared>( - size, std::dynamic_pointer_cast(gMemHandle), offset); - src.setSync(SYNCED); -#endif - setSync(src.getSync()); -} - -template -std::shared_ptr> CpuGpuVectorT::getVector( - bool useGpu) const { - auto* self = const_cast*>(this); - if (useGpu) { - self->copyToGpu(); - return std::const_pointer_cast>(gpuVectorT_); - } else { - self->copyToCpu(); - return std::const_pointer_cast>(cpuVectorT_); - } -} - -template -std::shared_ptr>& CpuGpuVectorT::getMutableVector(bool useGpu) { - setSync(useGpu); - if (useGpu) { - copyToGpu(); - return gpuVectorT_; - } else { - copyToCpu(); - return cpuVectorT_; - } -} - -template -const T* CpuGpuVectorT::getData(bool useGpu) const { - auto self = const_cast*>(this); - if (useGpu) { - self->copyToGpu(); - return gpuVectorT_->getData(); - } else { - self->copyToCpu(); - return cpuVectorT_->getData(); - } -} - -// Operation will change data and need to reset sync_ & syncFlag_. -#define MUTABLE_VECTOR_OP(OP, useGpu, args...) 
\ - do { \ - if (useGpu) { \ - copyToGpu(); \ - setSync(useGpu); \ - return gpuVectorT_->OP(args); \ - } else { \ - copyToCpu(); \ - setSync(useGpu); \ - return cpuVectorT_->OP(args); \ - } \ - } while (0) - -template -T* CpuGpuVectorT::getMutableData(bool useGpu) { - MUTABLE_VECTOR_OP(getData, useGpu); -} - -template -void CpuGpuVectorT::zeroMem(bool useGpu) { - MUTABLE_VECTOR_OP(zeroMem, useGpu); -} - -template -void CpuGpuVectorT::fillSequence(bool useGpu) { - MUTABLE_VECTOR_OP(fillSequence, useGpu); -} - -template -void CpuGpuVectorT::setElement(size_t i, const T& value, bool useGpu) { - MUTABLE_VECTOR_OP(setElement, useGpu, i, value); -} - -template -T CpuGpuVectorT::getElement(size_t i) const { - switch (*this->getSync()) { - case SYNCED: - case DATA_AT_CPU: - return cpuVectorT_->getElement(i); - break; - case DATA_AT_GPU: - return gpuVectorT_->getElement(i); - break; - default: - LOG(FATAL) << "Not support"; - break; - } -} - -template -void CpuGpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { - auto cVec = dynamic_cast*>(&src); - auto gVec = dynamic_cast*>(&src); - if (cVec) { - copyToCpu(cVec->getData(), cVec->getSize(), stream); - } else if (gVec) { - copyToGpu(gVec->getData(), gVec->getSize(), stream); - } else { - LOG(FATAL) << "Invalid type of src"; - } -} - -template -void CpuGpuVectorT::copyFrom(const T* data, size_t size, bool useGpu) { - if (useGpu) { - copyToGpu(data, size); - } else { - copyToCpu(data, size); - } -} - -template -void CpuGpuVectorT::copyFrom(const T* data, - size_t size, - hl_stream_t stream, - bool useGpu) { - if (useGpu) { - copyToGpu(data, size, stream); - } else { - copyToCpu(data, size, stream); - } -} - -template -void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, - size_t offset, - size_t size, - bool useGpu, - hl_stream_t stream) { - if (useGpu) { - VectorT::resizeOrCreate(gpuVectorT_, size, true); - gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream); - } else { - VectorT::resizeOrCreate(cpuVectorT_, size, false); - cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream); - } - setSync(useGpu); -} - -template -void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, hl_stream_t stream) { - switch (*src.getSync()) { - case DATA_AT_CPU: - copyFrom(*(src.getVector(false)), stream); - break; - case DATA_AT_GPU: - copyFrom(*(src.getVector(true)), stream); - break; - case SYNCED: - copyFrom(*(src.getVector(false)), stream); - copyFrom(*(src.getVector(true)), stream); - setSync(SYNCED); - break; - default: - LOG(FATAL) << "Not support"; - break; - } -} - -template -void CpuGpuVectorT::copyToCpu() { - switch (*this->getSync()) { - case DATA_AT_GPU: - CHECK(gpuVectorT_); - this->resizeOrCreate(gpuVectorT_->getSize(), false); - cpuVectorT_->copyFrom(*gpuVectorT_); - setSync(SYNCED); - break; - case DATA_AT_CPU: - case SYNCED: - CHECK(cpuVectorT_); - break; - default: - LOG(FATAL) << "Not support"; - break; - } -} - -template -void CpuGpuVectorT::copyToGpu() { - switch (*this->getSync()) { - case DATA_AT_CPU: - CHECK(cpuVectorT_); - this->resizeOrCreate(cpuVectorT_->getSize(), true); - gpuVectorT_->copyFrom(*cpuVectorT_); - setSync(SYNCED); - break; - case DATA_AT_GPU: - case SYNCED: - CHECK(gpuVectorT_); - break; - default: - LOG(FATAL) << "Not support"; - break; - } -} - -template class VectorT; -template class VectorT; -template class CpuVectorT; -template class CpuVectorT; -template class GpuVectorT; -template class GpuVectorT; -template class CpuGpuVectorT; -template class CpuGpuVectorT; - -} // namespace paddle diff --git 
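// A minimal standalone sketch of the synced-flag pattern used by CpuGpuVectorT
// above: one copy of the data per device plus a three-state flag
// (DATA_AT_CPU / DATA_AT_GPU / SYNCED), a lazy copy before each access, and a
// downgrade to single-residency on mutation. std::vector stands in for both the
// host and the "device" buffer here; every name below is illustrative only and
// not a Paddle API.
#include <cstdio>
#include <vector>

enum class Synced { AtCpu, AtGpu, Both };

struct SyncedBuffer {
  std::vector<float> cpu, gpu;  // "gpu" is only a stand-in for device memory
  Synced state = Synced::AtCpu;

  const float* data(bool useGpu) {  // read access: synchronize first
    if (useGpu && state == Synced::AtCpu) { gpu = cpu; state = Synced::Both; }
    if (!useGpu && state == Synced::AtGpu) { cpu = gpu; state = Synced::Both; }
    return useGpu ? gpu.data() : cpu.data();
  }
  float* mutableData(bool useGpu) {  // write access: synchronize, then mark the
    data(useGpu);                    // written side as the only valid copy
    state = useGpu ? Synced::AtGpu : Synced::AtCpu;
    return useGpu ? gpu.data() : cpu.data();
  }
};

int main() {
  SyncedBuffer v;
  v.cpu = {1, 2, 3};
  v.mutableData(true)[0] = 7;             // only the "GPU" copy is current now
  std::printf("%f\n", v.data(false)[0]);  // forces a copy back; prints 7
}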
a/paddle/math/Vector.h b/paddle/math/Vector.h deleted file mode 100644 index 964b42cae52af9b487ab17103bc5e999514e4dd1..0000000000000000000000000000000000000000 --- a/paddle/math/Vector.h +++ /dev/null @@ -1,726 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include - -#include "BaseMatrix.h" -#include "MemoryHandle.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Thread.h" - -namespace paddle { - -template -class GpuVectorT; -template -class CpuVectorT; - -template -class BaseVector; - -class SyncThreadPool; - -class Matrix; - -template -class BaseVector : public BaseMatrixT { - public: - BaseVector(size_t size, T* data, bool useGpu) - : BaseMatrixT(1, size, data, false, useGpu), size_(this->width_) {} - - ~BaseVector() {} - - protected: - size_t& size_; -}; - -/** - * Copy or assignemnt constructor will share the data as opposed to making a - * copy of the original data. To make a copy of the orinal data, use copyFrom() - * instead. - */ -template -class VectorT : public BaseVector { - protected: - VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu) - : BaseVector(size, - reinterpret_cast(memoryHandle->getBuf()) + offset, - useGpu) { - memoryHandle_ = memoryHandle; - } - - // data is still owned by the caller. - // data should be valid during the life of this vector. - // Caller is responsible for release the memory. - VectorT(size_t size, T* data, bool useGpu) - : BaseVector(size, data, useGpu) {} - - public: - virtual ~VectorT() {} - - static std::shared_ptr> create(size_t size, bool useGpu); - - static std::shared_ptr> create(T* data, size_t size, bool useGpu); - - static std::shared_ptr> create(size_t size, - MemoryHandlePtr memoryHandle, - size_t offset = 0); - - // owner can set SyncThreadPool, - // if not set, will use globalSyncThreadPool, - // which can be used in main thread only. - static std::shared_ptr> createParallelVector( - size_t size, bool useGpu, SyncThreadPool* pool = nullptr); - - size_t getSize() const { return this->size_; } - const T* getData() const { return this->data_; } - T* getData() { return this->data_; } - - virtual void zeroMem() = 0; - // set all elements to value - virtual void reset(const T& value) = 0; - // fill data by 0, 1, 2, ... - virtual void fillSequence() = 0; - - MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } - - /** - * resizing to a big vector will not preserve old values. 
- */ - void resize(size_t newSize) { - if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) { - memoryHandle_ = newMemory(newSize * sizeof(T)); - this->data_ = reinterpret_cast(memoryHandle_->getBuf()); - } - this->size_ = newSize; - } - - static void resizeOrCreate(std::shared_ptr>& vec, - size_t size, - bool useGpu) { - if (vec) { - vec->resize(size); - } else { - vec = create(size, useGpu); - } - } - - virtual MemoryHandlePtr newMemory(size_t size) = 0; - - /** - * form sub vector from *src*, shallow copy - */ - void subVecFrom(const VectorT& src, size_t start, size_t size) { - CHECK_EQ(BaseVector::useGpu_, src.useGpu_); - CHECK_LT(start, src.size_); - CHECK_LE(start + size, src.size_); - - BaseVector::size_ = size; - BaseVector::data_ = const_cast(src.data_) + start; - } - - std::shared_ptr> subVec(size_t start, size_t size) { - CHECK_LE(start + size, static_cast(getSize())); - return VectorT::create(getData() + start, size, BaseVector::useGpu_); - } - - /** - * form sub vector from *src*, shallow copy - */ - void subVecFrom(const T* src, size_t start, size_t size) { - BaseVector::size_ = size; - BaseVector::data_ = const_cast(src) + start; - } - - /** - * form sub vector from *src*, shallow copy - * in *interval* [interval.first, interval.second) - */ - void subVecFrom(const VectorT& src, std::pair interval) { - subVecFrom(src, interval.first, interval.second - interval.first); - } - - /** - * convert the vector to a sparse one_hot matrix of width idRange - * only applies to IVector - */ - std::shared_ptr toOneHotSparseMatrix(size_t idRange, bool useGpu); - - /** - * @brief cast vector of "real" elements to "int" elements. - * - * @note: float -> int must be casted, or you'll get wrong data. - */ - std::shared_ptr> castToInt(); - - /** - * This function will crash if the size of src and dest is different. - */ - virtual void copyFrom(const VectorT& src) = 0; - - /** - * If GpuVector, this function is an asynchronous interface, - * will push the copy-task to the specifed-stream and return immediately. - * - * If CpuVector, this function is an synchronous interface, - * same as the copyFrom(const VectorT& src). 
- */ - virtual void copyFrom(const VectorT& src, hl_stream_t stream) = 0; - - /** - * copy size elements from src - * - * If this is GpuVector, src can be cpu or gpu memory - * - * If this is CpuVector, src is assumed to be cpu memory - */ - virtual void copyFrom(const T* src, size_t size) = 0; - - /** - * copy size elements from src - * - * If this is GpuVector, src can be cpu or gpu memory - * - * If this is CpuVector, src is assumed to be cpu memory, - */ - virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0; - - /** - * exec a func in single/multi thread - */ - virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); } - - /// Get the buffer point with beginPos - virtual T* getPoint(const uint64_t beginPos) = 0; - - /// Get the value for the i'th element - virtual T getElement(size_t i) const = 0; - virtual void setElement(size_t i, const T& value) = 0; - - //---------- math operations ---------------- - - // sum of the absolute value of each elements - virtual T getAbsSum() = 0; - - virtual T getSum() = 0; - virtual T getMax() = 0; - virtual T getAbsMax() = 0; - virtual T getMin() = 0; - - /// element-wise calc: this = (b == value) - virtual void isEqualTo(const VectorT& b, const T& value) = 0; - - /// select elements indexed by *ids* from vector *src* - virtual void selectFrom(const VectorT& src, const VectorT& ids) = 0; - - enum HistogramType { - HISTOGRAM_EXPONENT = 0, - }; - - /** - * @brief print histogram of vector values - * - * @note only exponent histogram supported currently - */ - virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0; - - /// generate uniform random value for each element - virtual void rand() = 0; - /** - * generate uniform random value for each element, - * data range is from 0 to (classes - 1). - */ - virtual void rand(size_t classes) = 0; - - /** - * Debug use only. Very inefficient for GPU vector. - * get the value at pos. - */ - virtual T get(size_t pos) = 0; - - /** - * generate univariate Gaussian distributed random numbers - * with given mean and standardDeviation. - */ - virtual void randnorm(real mean, real standardDeviation) = 0; - - /** - * generate uniform distributed random numbers - * with given range. - */ - virtual void uniform(real left, real right) = 0; - - /// print the first "num" elements of the Vector - virtual void print(std::ostream& os, size_t num) const = 0; - - /// print the "idx" element of the Vector - virtual void printOneElement(std::ostream& os, size_t idx) const = 0; - - template - void operator=(const ExpressionType& expr) { - if (BaseVector::useGpu_) { - TensorGpuApply(*this, expr); - } else { - TensorCpuApply(*this, expr); - } - } - - protected: - friend class GpuVectorT; - friend class CpuVectorT; - virtual void copyTo(CpuVectorT* dest) const = 0; - virtual void copyTo(GpuVectorT* dest) const = 0; - MemoryHandlePtr memoryHandle_; -}; - -template -std::ostream& operator<<(std::ostream& os, const VectorT& vec) { - vec.print(os, vec.getSize()); - return os; -} - -template -class GpuVectorT : public VectorT { - public: - explicit GpuVectorT(size_t size); - GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset) - : VectorT(size, memHandle, offset, true) {} - - // data is still owned by the caller. - // data should be valid during the life of this vector. - // Caller is responsible for release the memory. 
- GpuVectorT(size_t size, T* data) : VectorT(size, data, true) {} - - virtual MemoryHandlePtr newMemory(size_t size) { - return std::make_shared(size); - } - virtual void zeroMem(); - virtual void reset(const T& value); - virtual void fillSequence(); - - virtual void copyFrom(const T* src, size_t size); - virtual void copyFrom(const T* src, size_t size, hl_stream_t stream); - virtual void copyFrom(const VectorT& src); - virtual void copyFrom(const VectorT& src, hl_stream_t stream); - virtual T getElement(size_t i) const; - virtual void setElement(size_t i, const T& value); - virtual T* getPoint(const uint64_t beginPos); - - virtual T getAbsSum(); - virtual T getSum(); - virtual T getMax(); - virtual T getAbsMax(); - virtual T getMin(); - virtual void isEqualTo(const VectorT& b, const T& value); - virtual void selectFrom(const VectorT& src, const VectorT& ids); - virtual void histogram(std::ostream& os, int type); - virtual void rand(); - virtual void rand(size_t classes); - virtual void randnorm(real mean, real standardDeviation); - virtual void uniform(real left, real right); - virtual T get(size_t pos); - virtual void print(std::ostream& os, size_t num) const; - virtual void printOneElement(std::ostream& os, size_t idx) const; - - template - void operator=(const ExpressionType& expr) { - TensorGpuApply(*this, expr); - } - - protected: - virtual void copyTo(CpuVectorT* dest) const; - virtual void copyTo(GpuVectorT* dest) const; -}; - -template -class CpuVectorT : public VectorT { - public: - explicit CpuVectorT(size_t size); - CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset) - : VectorT(size, memoryHandle, offset, false) {} - - // data is still owned by the caller. - // data should be valid during the life of this vector. - // Caller is responsible for release the memory. 
- CpuVectorT(size_t size, T* data) : VectorT(size, data, false) {} - - /** - * If src is a CpuVector, the new CpuVector will share the data with src - * - * If src is a GpuVector, the new CpuVector will copy data from src - */ - explicit CpuVectorT(const VectorT& src); - - virtual MemoryHandlePtr newMemory(size_t size) { - return std::make_shared(size); - } - - virtual void zeroMem(); - virtual void reset(const T& value); - virtual void fillSequence(); - virtual void copyFrom(const T* src, size_t size); - virtual void copyFrom(const T* src, size_t size, hl_stream_t stream); - virtual void copyFrom(const VectorT& src); - virtual void copyFrom(const VectorT& src, hl_stream_t stream); - virtual void copyTo(CpuVectorT* dest) const; - virtual void copyTo(GpuVectorT* dest) const; - - /// Get the buffer point with beginPos - virtual T* getPoint(const uint64_t beginPos) { - return this->getData() + beginPos; - } - - virtual T getElement(size_t i) const { return this->getData()[i]; } - virtual void setElement(size_t i, const T& value) { - this->getData()[i] = value; - } - - virtual T getAbsSum(); - virtual T getSum(); - virtual T getMax(); - virtual T getAbsMax(); - virtual T getMin(); - virtual void isEqualTo(const VectorT& b, const T& value); - virtual void selectFrom(const VectorT& src, const VectorT& ids); - virtual void histogram(std::ostream& os, int type); - virtual void rand(); - virtual void rand(size_t classes); - virtual void randnorm(real mean, real standardDeviation); - virtual void uniform(real left, real right); - virtual T get(size_t pos); - virtual void print(std::ostream& os, size_t num) const; - virtual void printOneElement(std::ostream& os, size_t idx) const; - - template - void operator=(const ExpressionType& expr) { - TensorCpuApply(*this, expr); - } -}; - -template -class ParallelCpuVectorT : public CpuVectorT { - public: - ParallelCpuVectorT(size_t size, SyncThreadPool* pool) - : CpuVectorT(size), pool_(pool) {} - - virtual void zeroMem() { - parallelExec([](CpuVectorT& vec) { vec.CpuVectorT::zeroMem(); }); - } - virtual void randnorm(real mean, real standardDeviation) { - parallelExec([=](CpuVectorT& vec) { - vec.CpuVectorT::randnorm(mean, standardDeviation); - }); - } - virtual void uniform(real left, real right) { - parallelExec( - [=](CpuVectorT& vec) { vec.CpuVectorT::uniform(left, right); }); - } - - virtual void exec(SyncThreadPool::JobFunc jobFunc); - - private: - typedef std::function& vec)> ExecFunc; - void parallelExec(ExecFunc func); - SyncThreadPool* pool_; -}; - -/** - * A class to do conversion between CpuVector and GpuVector automatically. - */ -template -class CpuGpuVectorT { - public: - /** - * @brief An enum type of SyncedFlag using to - * mark data memory is in CPU or GPU. - * - * DATA_AT_CPU: data is located in CPU. - * - * DATA_AT_GPU: data is located in GPU. - * - * SYNCED: data is located in CPU and GPU simultaneously. - */ - enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 }; - - /** - * @brief A constructor, create cpuVectorT_ or gpuVectorT_. - * - * @param[in] size data size. - * @param[in] useGpu use gpu or not. - */ - explicit CpuGpuVectorT(size_t size, bool useGpu); - - /** - * @brief A constructor, create CpuGpuVectorT by VectorT. - * - * If src is CpuVector, cpuVectorT_ is shared data with src. - * - * If src is GpuVector, gpuVectorT_ is shared data with src. - */ - explicit CpuGpuVectorT(const std::shared_ptr>& src); - - /** - * @brief A constructor. 
- * - * If useGpu is true, data should be located in device and - * create gpuVectorT_ with data. - * - * If useGpu is false, data should be located in host and - * create cpuVectorT_ with data. - * - * @note Data is owned by the caller and should be valid during - * the life of this vector. - * Caller is responsible for release the memory. - */ - CpuGpuVectorT(size_t size, T* data, bool useGpu); - - CpuGpuVectorT(CpuGpuVectorT& src, size_t offset, size_t size); - - virtual ~CpuGpuVectorT() {} - - static std::shared_ptr> create(size_t size, bool useGpu); - - /** - * @brief resize vector. - * - * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU, - * - * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU. - */ - void resize(size_t size, bool useGpu); - - /** - * @brief resize or create CpuGpuVectorT. - */ - static void resizeOrCreate(std::shared_ptr>& vec, - size_t size, - bool useGpu); - - /** - * @brief return a const cpuVectorT_ or gpuVectorT_. - * - * If useGpu is true, return gpuVectorT_. - * - * If useGpu is false, return cpuVectorT_. - * - * @note Caller should not change the data. - * If caller changes const attribute, - * should set syncFlag_. - */ - std::shared_ptr> getVector(bool useGpu) const; - - /** - * @brief return a const cpuVectorT_ or gpuVectorT_. - * - * @note: This interface will change syncFlag_, so if you will - * not change the data, you should call getVector. - */ - std::shared_ptr>& getMutableVector(bool useGpu); - - /** - * @brief return const T* data. - * - * If useGpu is true, return device data. - * - * If useGpu is false, return host data. - */ - const T* getData(bool useGpu) const; - - // TODO(yuyang18): Make getData more c++ style. - // inline T* getData(bool useGpu) { - // return getMutableData(useGpu); - // } - - T* getMutableData(bool useGpu); - - /** - * If useGpu is true, gpuVectorT_->Op(). - * - * If useGpu is false, cpuVectorT_->Op(). - * - * Op is zeroMem, fillSequence, ... - */ - void zeroMem(bool useGpu); - void fillSequence(bool useGpu); - void setElement(size_t i, const T& value, bool useGpu); - - /** - * @brief return i-th element. - */ - T getElement(size_t i) const; - - /** - * @brief return vector size. - */ - size_t getSize() const { - size_t size = 0; - switch (*sync_) { - case SYNCED: - case DATA_AT_CPU: - size = cpuVectorT_->getSize(); - break; - case DATA_AT_GPU: - size = gpuVectorT_->getSize(); - break; - default: - LOG(FATAL) << "Not support"; - break; - } - return size; - } - - /// copy data to cpuVectorT_. - inline void copyToCpu(const T* data, size_t size) { - this->resizeOrCreate(size, false); - cpuVectorT_->copyFrom(data, size); - setSync(DATA_AT_CPU); - } - /// copy data to cpuVectorT_ using specifed-stream. - inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) { - this->resizeOrCreate(size, false); - cpuVectorT_->copyFrom(data, size, stream); - setSync(DATA_AT_CPU); - } - - /// copy data to gpuVectorT_. - inline void copyToGpu(const T* data, size_t size) { - this->resizeOrCreate(size, true); - gpuVectorT_->copyFrom(data, size); - setSync(DATA_AT_GPU); - } - /// copy data to gpuVectorT_ using specifed-stream. - inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) { - this->resizeOrCreate(size, true); - gpuVectorT_->copyFrom(data, size, stream); - setSync(DATA_AT_GPU); - } - - /** - * @brief copy from src using specifed-stream. - * - * If src is CpuVectorT, copy to cpuVectorT_. - * - * If src is GpuVectorT, copy to gpuVectorT_. 
- */ - void copyFrom(const VectorT& src, hl_stream_t stream); - - /** - * @brief copy data. - * - * If useGpu is false, copy host data to cpuVectorT_. - * - * If useGpu is true, copy device data to gpuVectorT_. - * - * @note data address should consistent with useGpu. - */ - void copyFrom(const T* data, size_t size, bool useGpu); - void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu); - - /** - * @brief copy from (src + offset) using specifed-stream. - */ - void copyFrom(CpuGpuVectorT& src, - size_t offset, - size_t size, - bool useGpu, - hl_stream_t stream); - - /** - * @brief copy from src using specifed-stream. - */ - void copyFrom(CpuGpuVectorT& src, hl_stream_t stream); - - /** - * @brief return sync_. - */ - inline SyncedFlag* getSync() const { return sync_; } - - /** - * @brief set sync_. - */ - inline void setSync(SyncedFlag* sync) { sync_ = sync; } - - inline void setSync(SyncedFlag syncFlag) { - if (sync_) { - *sync_ = syncFlag; - } else { - syncFlag_ = syncFlag; - sync_ = &syncFlag_; - } - } - - inline void setSync(bool useGpu) { - SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU; - setSync(flag); - } - - protected: - void resizeOrCreate(size_t size, bool useGpu); - - /** - * @brief copy between cpuVectorT_ and gpuVectorT_. - * - * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing. - * - * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_ - * and set syncFlag_ to SYNCED. - */ - void copyToCpu(); - - /** - * @brief copy between cpuVectorT_ and gpuVectorT_. - * - * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing. - * - * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_ - * and set syncFlag_ to SYNCED. - */ - void copyToGpu(); - - /// host pointer. - std::shared_ptr> cpuVectorT_; - /// device pointer. - std::shared_ptr> gpuVectorT_; - /// specify current data address. - SyncedFlag syncFlag_; - SyncedFlag* sync_; -}; - -typedef VectorT Vector; -typedef CpuVectorT CpuVector; -typedef GpuVectorT GpuVector; - -typedef VectorT IVector; -typedef CpuVectorT CpuIVector; -typedef GpuVectorT GpuIVector; - -typedef std::shared_ptr VectorPtr; -typedef std::shared_ptr CpuVectorPtr; -typedef std::shared_ptr GpuVectorPtr; - -typedef std::shared_ptr IVectorPtr; -typedef std::shared_ptr CpuIVectorPtr; -typedef std::shared_ptr GpuIVectorPtr; - -typedef CpuGpuVectorT CpuGpuVector; -typedef CpuGpuVectorT ICpuGpuVector; -typedef std::shared_ptr CpuGpuVectorPtr; -typedef std::shared_ptr ICpuGpuVectorPtr; - -} // namespace paddle diff --git a/paddle/math/tests/OriginalOptimizerApi.h b/paddle/math/tests/OriginalOptimizerApi.h deleted file mode 100644 index e30d784b232dd7d477877d3f7c90cd185357328c..0000000000000000000000000000000000000000 --- a/paddle/math/tests/OriginalOptimizerApi.h +++ /dev/null @@ -1,201 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/math/Vector.h" -#include "paddle/utils/GlobalConstants.h" - -using namespace paddle; // NOLINT - -void SparseMomentumParameterOptimizer(const VectorPtr vecs[], - real alpha, - real beta, - real gamma, - real tau, - real learningRate) { - vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], - -alpha * gamma * learningRate); - vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], - tau * alpha * gamma * learningRate); - vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], - tau / beta + 1.0 / alpha, - *vecs[PARAMETER_MOMENTUM_VT], - 1.0 / beta); -} - -void AdagradParameterOptimizer(const VectorPtr vecs[], - real epsilon, - real learningRate, - real momentum, - real decayRate) { - vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT], - 1.0f); - vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM], - *vecs[PARAMETER_GRADIENT_SQURESUM1]); - vecs[PARAMETER_LEARNING_RATE]->add(epsilon); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void AdaDeltaParameterOptimizer(const VectorPtr vecs[], - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou); - - // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) - vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], - *vecs[PARAMETER_GRADIENT_SQURESUM], - epsilon, - epsilon); - vecs[PARAMETER_LEARNING_RATE]->sqrt2(); - - // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 - vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( - *vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_LEARNING_RATE], - rou, - 1.0f - rou); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void RMSPropParameterOptimizer(const VectorPtr vecs[], - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou); - - // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g - vecs[PARAMETER_GRADIENT_SQURESUM1]->add( - *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou); - - // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon ) - // Basiclly if the sign of the gradient changes more often, - // the learning rate will be decreased. 
- vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]); - vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1], - -1.0f); - vecs[PARAMETER_LEARNING_RATE]->add(epsilon); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void DecayedAdagradParameterOptimizer(const VectorPtr vecs[], - real accumulatedRou, - real rou, - real epsilon, - real learningRate, - real momentum, - real decayRate, - bool firstTime) { - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou); - - // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) - // Basiclly if the bigger the magnitude gradient is, - // the smaller the learning rate will be. - vecs[PARAMETER_LEARNING_RATE]->assign(epsilon); - vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], - learningRate, - momentum, - decayRate); -} - -void AdamParameterOptimizer(const VectorPtr vecs[], - real beta1, - real beta2, - real beta1_power, - real beta2_power, - real epsilon, - real learningRate) { - Vector* m = vecs[PARAMETER_MOMENTUM].get(); - Vector* g = vecs[PARAMETER_GRADIENT].get(); - Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get(); - Vector* theta = vecs[PARAMETER_VALUE].get(); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - m->add(*g, beta1, 1 - beta1); - - // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 - g->square2(); - v->add(*g, beta2, 1 - beta2); - - // tmp = m_t / ( \sqrt{v_t} + \epsilon ) - // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp - g->sqrt2(*v); - g->dotDiv(*m, *g, 0., epsilon); - real alpha = - learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); - theta->add(*theta, 1.0, *g, -alpha); -} - -void AdamaxParameterOptimizer( - const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) { - Vector* m = vecs[PARAMETER_MOMENTUM].get(); - Vector* g = vecs[PARAMETER_GRADIENT].get(); - Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get(); - Vector* theta = vecs[PARAMETER_VALUE].get(); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - m->add(*g, beta1, 1 - beta1); - - // u_t = max(\beta_2*u_{t-1}, abs(g_t)) - u->mulScalar(beta2); - g->abs2(); - u->max2(*u, *g); - - // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t - g->dotDiv(*m, *u); - real learningRate = alpha / (1 - std::pow(beta1, step)); - theta->add(*theta, 1.0, *g, -learningRate); -} diff --git a/paddle/math/tests/PerfUtils.h b/paddle/math/tests/PerfUtils.h deleted file mode 100644 index bee2351e2fb80f9ccef670535c92485389f0c51a..0000000000000000000000000000000000000000 --- a/paddle/math/tests/PerfUtils.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
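// A minimal sketch of the Adam update implemented in AdamParameterOptimizer
// above, using plain arrays instead of Paddle vectors; all names here are
// illustrative only. The update follows the formulas in the original comments:
//   m_t = b1*m + (1-b1)*g
//   v_t = b2*v + (1-b2)*g^2
//   theta -= lr * sqrt(1-b2^t)/(1-b1^t) * m_t / (sqrt(v_t) + eps)
#include <cmath>
#include <cstdio>

void adamStep(float* theta, float* m, float* v, const float* g, int n,
              float lr, float b1, float b2, float b1p, float b2p, float eps) {
  // b1p/b2p are beta1^t and beta2^t, tracked by the caller across steps.
  const float alpha = lr * std::sqrt(1.f - b2p) / (1.f - b1p);
  for (int i = 0; i < n; ++i) {
    m[i] = b1 * m[i] + (1.f - b1) * g[i];
    v[i] = b2 * v[i] + (1.f - b2) * g[i] * g[i];
    theta[i] -= alpha * m[i] / (std::sqrt(v[i]) + eps);
  }
}

int main() {
  float theta[1] = {1.f}, m[1] = {0.f}, v[1] = {0.f}, g[1] = {0.5f};
  adamStep(theta, m, v, g, 1, 1e-3f, 0.9f, 0.999f, 0.9f, 0.999f, 1e-8f);
  std::printf("%f\n", theta[0]);  // 0.999 after one step
}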
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// Performance Check -#ifdef PADDLE_DISABLE_TIMER - -#define EXPRESSION_PERFORMANCE(expression) expression; - -#else - -#include "paddle/utils/Stat.h" -using namespace paddle; // NOLINT - -#define EXPRESSION_PERFORMANCE(expression) \ - do { \ - char expr[30]; \ - strncpy(expr, #expression, 30); \ - if (expr[29] != '\0') { \ - expr[27] = '.'; \ - expr[28] = '.'; \ - expr[29] = '\0'; \ - } \ - expression; \ - for (int i = 0; i < 20; i++) { \ - REGISTER_TIMER(expr); \ - expression; \ - } \ - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \ - << *globalStat.getStat(expr); \ - globalStat.reset(); \ - } while (0) - -#endif diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h deleted file mode 100644 index 40ac04ef5d4baa0239bb03b04c3a6cce0fcac5a5..0000000000000000000000000000000000000000 --- a/paddle/math/tests/TensorCheck.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -/** - * This file provides a TensorCheck template function, which can be used to - * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on. 
- */ - -#include -#include "paddle/math/Matrix.h" - -namespace autotest { - -using paddle::Matrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using paddle::VectorT; -using paddle::CpuVectorT; -using paddle::GpuVectorT; - -class AssertEqual { - public: - AssertEqual(real err = 0) : err_(err) {} - - inline bool operator()(real a, real b) { - if (err_ == 0) { - if (a != b) { - return false; - } - } else { - if (std::fabs(a - b) > err_) { - if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) { - return false; - } - } - } - - return true; - } - - private: - real err_; -}; - -template -class CopyToCpu; - -template <> -class CopyToCpu { - public: - explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {} - const CpuMatrix& copiedArg() const { return arg_; } - - private: - const CpuMatrix& arg_; -}; - -template <> -class CopyToCpu { - public: - explicit CopyToCpu(const GpuMatrix& arg) - : arg_(arg.getHeight(), arg.getWidth()) { - arg_.copyFrom(arg); - } - CpuMatrix& copiedArg() { return arg_; } - - private: - CpuMatrix arg_; -}; - -template <> -class CopyToCpu { - public: - explicit CopyToCpu(const Matrix& arg) - : arg_(arg.getHeight(), arg.getWidth()) { - arg_.copyFrom(arg); - } - CpuMatrix& copiedArg() { return arg_; } - - private: - CpuMatrix arg_; -}; - -template -class CopyToCpu> { - public: - explicit CopyToCpu(const CpuVectorT& arg) : arg_(arg) {} - const CpuVectorT& copiedArg() const { return arg_; } - - private: - const CpuVectorT& arg_; -}; - -template -class CopyToCpu> { - public: - explicit CopyToCpu(const GpuVectorT& arg) : arg_(arg.getSize()) { - arg_.copyFrom(arg); - } - CpuVectorT& copiedArg() { return arg_; } - - private: - CpuVectorT arg_; -}; - -template -class CopyToCpu> { - public: - explicit CopyToCpu(const VectorT& arg) : arg_(arg.getSize()) { - arg_.copyFrom(arg); - } - CpuVectorT& copiedArg() { return arg_; } - - private: - CpuVectorT arg_; -}; - -template -void TensorCheck(AssertEq compare, - const CpuMatrix& matrix1, - const CpuMatrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - real a = data1[i * width + j]; - real b = data2[i * width + j]; - if (!compare(a, b)) { - count++; - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -template -void TensorCheck(AssertEq compare, - const CpuVectorT& vector1, - const CpuVectorT& vector2) { - CHECK(vector1.getSize() == vector2.getSize()); - - const T* data1 = vector1.getData(); - const T* data2 = vector2.getData(); - size_t size = vector1.getSize(); - int count = 0; - for (size_t i = 0; i < size; i++) { - real a = data1[i]; - real b = data2[i]; - if (!compare(a, b)) { - count++; - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different elements."; -} - -template -void TensorCheck(AssertEq compare, - const Tensor1& tensor1, - const Tensor2& tensor2) { - TensorCheck(compare, - CopyToCpu(tensor1).copiedArg(), - CopyToCpu(tensor2).copiedArg()); -} - -template -void TensorCheck(AssertEq compare, real args1, real args2) { - EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1 - << ", args2 = " << args2; -} - -template -void TensorCheck(AssertEq compare, size_t args1, size_t args2) { - EXPECT_EQ(args1, args2) << "[Test error] args1 = " << 
args1 - << ", args2 = " << args2; -} - -template -void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) { - AssertEqual compare(0); - TensorCheck(compare, - CopyToCpu(tensor1).copiedArg(), - CopyToCpu(tensor2).copiedArg()); -} - -template -void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) { -#ifndef PADDLE_TYPE_DOUBLE - AssertEqual compare(1e-3); -#else - AssertEqual compare(1e-10); -#endif - TensorCheck(compare, - CopyToCpu(tensor1).copiedArg(), - CopyToCpu(tensor2).copiedArg()); -} - -} // namespace autotest diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h deleted file mode 100644 index e1966ec8a74747960420ec80fdfbb957f7cf177f..0000000000000000000000000000000000000000 --- a/paddle/math/tests/TestUtils.h +++ /dev/null @@ -1,294 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -/** - * This file provides a AutoCompare calss to simplify the comparison - * of CPU and GPU member functions. - * - * This takes two steps - * 1. Construct an AutoCompare object. - * When constructing an AutoCompare object, you can set the err argument - * to specify the maximum error for CPU and GPU functions. - * - * 2. Use the template functions cmpWithArg or cmpWithoutArg. - * A. [cmpWithArg] Requires the caller construct the cpu arguments. - * - * AutoCompare test; - * Init Argument arg1,arg2... - * test.cmpWithArg(function, arg1, arg2....) - * - * B. [cmpWithoutArg] The caller do not need construct arguments. - * If matrix used in these functions arguments is the same size. - * Such as the element wise function and the aggregate function - * defined in the BaseMatrix.cpp. - * - * AutoCompare test; - * test.cmpWithoutArg(function, height, width) - */ - -#include -#include "TensorCheck.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" - -namespace autotest { - -using paddle::BaseMatrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using paddle::CpuIVector; -using paddle::GpuIVector; -using paddle::CpuSparseMatrix; -using paddle::GpuSparseMatrix; - -template -class ReplaceType { - public: - typedef T1 type; -}; - -template <> -class ReplaceType { - public: - typedef CpuMatrix type; -}; - -template <> -class ReplaceType { - public: - typedef GpuMatrix type; -}; - -template <> -class ReplaceType { - public: - typedef CpuMatrix type; -}; - -template <> -class ReplaceType { - public: - typedef GpuMatrix type; -}; - -// construct a argument -template -T construct(int height, int width); - -template <> -float construct(int height, int width) { - return 0.5; -} - -template <> -double construct(int height, int width) { - return 0.5; -} - -template <> -size_t construct(int height, int width) { - size_t offset = std::rand() % (height < width ? 
height : width); - return offset; -} - -template <> -CpuMatrix construct(int height, int width) { - CpuMatrix a(height, width); - return a; -} - -template <> -GpuMatrix construct(int height, int width) { - GpuMatrix a(height, width); - return a; -} - -// init a argument -template -void init(T& v) { - return; -} - -template <> -void init(CpuMatrix& v) { - v.randomizeUniform(); -} - -template <> -void init(GpuMatrix& v) { - v.randomizeUniform(); -} - -// init a tuple which contains a set of arguments. -template -inline typename std::enable_if::type initTuple( - std::tuple& t) {} - -template - inline typename std::enable_if < - I::type initTuple(std::tuple& t) { - init(std::get(t)); - initTuple(t); -} - -// copy a argument, copy src to dest -template -void copy(T1& dest, T2& src) { - dest = src; -} - -template <> -void copy(GpuMatrix& dest, CpuMatrix& src) { - dest.copyFrom(src); -} - -// copy a tuple, copy src to dest -template -inline typename std::enable_if::type copyTuple( - std::tuple& dest, std::tuple& src) {} - -template - inline typename std::enable_if < - I::type copyTuple(std::tuple& dest, - std::tuple& src) { - copy(std::get(dest), std::get(src)); - copyTuple(dest, src); -} - -// call member function -template -R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) { - return (obj.*f)(args...); -} - -template -class ReturnType { - public: - typedef T type; -}; - -template <> -class ReturnType { - public: - typedef GpuMatrix type; -}; - -template <> -class ReturnType { - public: - typedef GpuIVector type; -}; - -template <> -class ReturnType { - public: - typedef GpuSparseMatrix type; -}; - -template -typename ReturnType::type autoArgs(T& v) { - return v; -} - -template <> -GpuMatrix autoArgs(CpuMatrix& v) { - GpuMatrix a(v.getHeight(), v.getWidth()); - a.copyFrom(v); - return a; -} - -template <> -GpuIVector autoArgs(CpuIVector& v) { - GpuIVector a(v.getSize()); - a.copyFrom(v); - return a; -} - -template <> -GpuSparseMatrix autoArgs(CpuSparseMatrix& v) { - GpuSparseMatrix a(v.getHeight(), - v.getWidth(), - v.getElementCnt(), - v.getValueType(), - v.getFormat()); - a.copyFrom(v, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - return a; -} - -class AutoCompare { - public: - /** - * err is the allowed calculation error. - * The smaller the value of err, - * the stricter the comparison is between CPU and GPU calculations. - */ - AutoCompare(size_t height, size_t width, real err = 1e-3) - : cpu(height, width), gpu(height, width), compare(err) { - init(cpu); - copy(gpu, cpu); - } - - template - void cmpWithArg(R (C::*f)(FArgs...), Args&&... 
args) { - static_assert(sizeof...(FArgs) == sizeof...(Args), - "size of parameter packs are not equal"); - call(cpu, f, args...); - call(gpu, f, autoArgs(args)...); - - TensorCheck(compare, cpu, gpu); - } - - template - void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) { - static_assert(sizeof...(I) == sizeof...(Args), - "size of parameter packs are not equal"); - (void)height; - (void)width; - auto tuple1 = std::make_tuple( - construct>::type>::type, - CpuMatrix>::type>(height, width)...); - - auto tuple2 = std::make_tuple( - construct>::type>::type, - GpuMatrix>::type>(height, width)...); - - initTuple(tuple1); - copyTuple(tuple2, tuple1); - - call(cpu, f, std::get(tuple1)...); - call(gpu, f, std::get(tuple2)...); - - TensorCheck(compare, cpu, gpu); - } - - protected: - CpuMatrix cpu; - GpuMatrix gpu; - AssertEqual compare; -}; - -} // namespace autotest diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp deleted file mode 100644 index 84bc1c1d9e0a8368a69c1e53a63056eb45b9239f..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_Allocator.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" -#define private public -#include "paddle/math/Allocator.h" -#include "paddle/math/MemoryHandle.h" -#include "paddle/math/PoolAllocator.h" - -using namespace paddle; // NOLINT - -template -void testPoolAllocator() { - PoolAllocator* pool = - new PoolAllocator(new Allocator(), /* sizeLimit */ 1024); - - /* alloc from system memory */ - void* ptr1 = pool->alloc(10); - void* ptr2 = pool->alloc(200); - void* ptr3 = pool->alloc(200); - pool->free(ptr1, 10); - pool->free(ptr2, 200); - pool->free(ptr3, 200); - pool->printAll(); - EXPECT_EQ((size_t)2, pool->pool_.size()); - EXPECT_EQ((size_t)1, pool->pool_[10].size()); - EXPECT_EQ((size_t)2, pool->pool_[200].size()); - EXPECT_EQ(ptr1, pool->pool_[10][0]); - EXPECT_EQ(ptr2, pool->pool_[200][0]); - EXPECT_EQ(ptr3, pool->pool_[200][1]); - - /* alloc from pool */ - void* ptr4 = pool->alloc(10); - void* ptr5 = pool->alloc(200); - pool->printAll(); - EXPECT_EQ((size_t)0, pool->pool_[10].size()); - EXPECT_EQ((size_t)1, pool->pool_[200].size()); - EXPECT_EQ(ptr1, ptr4); - EXPECT_EQ(ptr3, ptr5); - pool->free(ptr4, 10); - pool->free(ptr5, 200); - - /* alloc size > sizeLimit */ - void* ptr6 = pool->alloc(1024); - pool->free(ptr6, 1024); - EXPECT_LE((size_t)1024, pool->poolMemorySize_); - - void* ptr7 = pool->alloc(1); - EXPECT_EQ((size_t)0, pool->poolMemorySize_); - EXPECT_EQ((size_t)0, pool->pool_.size()); - pool->free(ptr7, 1); - - delete pool; -} - -TEST(Allocator, Pool) { - testPoolAllocator(); -#ifdef PADDLE_WITH_CUDA - testPoolAllocator(); -#endif -} - -TEST(MemoryHandle, Cpu) { - for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) { - CpuMemoryHandle handle(size); - EXPECT_LE(handle.getSize(), handle.getAllocSize()); 
- } - - void* ptr1; - void* ptr2; - { - CpuMemoryHandle handle(256); - ptr1 = handle.getBuf(); - } - { - CpuMemoryHandle handle(256); - ptr2 = handle.getBuf(); - } - EXPECT_EQ(ptr1, ptr2); -} - -#ifdef PADDLE_WITH_CUDA -TEST(MemoryHandle, Gpu) { - int numGpu = hl_get_device_count(); - - /* alloc from system memory */ - void* ptr3[numGpu]; - void* ptr4[numGpu]; - for (int i = 0; i < numGpu; i++) { - SetDevice device(i); - GpuMemoryHandle handle1(30); - GpuMemoryHandle handle2(30); - GpuMemoryHandle handle3(4000); - GpuMemoryHandle handle4(500); - ptr3[i] = handle3.getBuf(); - ptr4[i] = handle4.getBuf(); - } - - /* alloc from pool */ - for (int i = 0; i < numGpu; i++) { - SetDevice device(i); - GpuMemoryHandle handle1(30); - GpuMemoryHandle handle3(4000); - GpuMemoryHandle handle4(500); - EXPECT_EQ(ptr3[i], handle3.getBuf()); - EXPECT_EQ(ptr4[i], handle4.getBuf()); - } -} -#endif diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp deleted file mode 100644 index 6f7beb60c8f535d51b18c4984b89d1972f4c82bd..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_BaseMatrix.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA -/** - * This test file use autotest::AutoCompare and cmpWithoutArg to compares the - * implementation of CPU and GPU member function in - * BaseMatrix.cpp and Matrix.cpp. 
- */ - -#include -#include "TestUtils.h" -#include "paddle/math/BaseMatrix.h" - -using paddle::BaseMatrix; -using paddle::Matrix; -using autotest::AutoCompare; - -// Test all void (BaseMatrix::*)() function -TEST(BaseMatrix, void) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)()) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg(f, height, width); - }; - - compare(&BaseMatrix::neg); - compare(&BaseMatrix::exp2); - compare(&BaseMatrix::log2); - compare(&BaseMatrix::sqrt2); - compare(&BaseMatrix::square2); - compare(&BaseMatrix::reciprocal2); - compare(&BaseMatrix::abs2); - compare(&BaseMatrix::sign2); - compare(&BaseMatrix::zero); - compare(&BaseMatrix::one); - } - } -} - -// Test all void (BaseMatrix::*)(real) function -TEST(BaseMatrix, real) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(real)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0>(f, height, width); - }; - - compare(&BaseMatrix::pow2); - compare(&BaseMatrix::subScalar); - compare(&BaseMatrix::mulScalar); - compare(&BaseMatrix::divScalar); - compare(&BaseMatrix::assign); - compare(&BaseMatrix::add); - compare(&BaseMatrix::biggerThanScalar); - compare(&BaseMatrix::downClip); - } - } -} - -// Test all void (BaseMatrix::*)(BaseMatrix&) function -TEST(BaseMatrix, BaseMatrix) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0>(f, height, width); - }; - - compare(&BaseMatrix::assign); - compare(&BaseMatrix::add); - compare(&BaseMatrix::relu); - compare(&BaseMatrix::reluDerivative); - compare(&BaseMatrix::softrelu); - compare(&BaseMatrix::softreluDerivative); - compare(&BaseMatrix::brelu); - compare(&BaseMatrix::breluDerivative); - compare(&BaseMatrix::square2); - compare(&BaseMatrix::squareDerivative); - compare(&BaseMatrix::tanh); - compare(&BaseMatrix::tanhDerivative); - compare(&BaseMatrix::reciprocal2); - compare(&BaseMatrix::reciprocalDerivative); - compare(&BaseMatrix::abs2); - compare(&BaseMatrix::absDerivative); - compare(&BaseMatrix::sigmoid); - compare(&BaseMatrix::sigmoidDerivative); - compare(&BaseMatrix::expDerivative); - compare(&BaseMatrix::sign2); - compare(&BaseMatrix::exp2); - compare(&BaseMatrix::log2); - compare(&BaseMatrix::sqrt2); - compare(&BaseMatrix::dotMul); - compare(&BaseMatrix::dotMulSquare); - compare(&BaseMatrix::dotSquareMul); - compare(&BaseMatrix::addColVector); - compare(&BaseMatrix::addRowVector); - compare(&BaseMatrix::mulRowVector); - compare(&BaseMatrix::divRowVector); - compare(&BaseMatrix::mulColVector); - compare(&BaseMatrix::divColVector); - compare(&BaseMatrix::addP2P); - compare(&BaseMatrix::invSqrt); - } - } -} - -// Test all void (BaseMatrix::*)(real, real) function -TEST(BaseMatrix, real_real) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(real, real)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0, 1>(f, height, width); - }; - - compare(&BaseMatrix::add); - compare(&BaseMatrix::clip); - } - } -} - -// Test all void (BaseMatrix::*)(BaseMatrix&, real) function -TEST(BaseMatrix, 
BaseMatrix_real) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0, 1>(f, height, width); - }; - - compare(&BaseMatrix::addBias); - compare(&BaseMatrix::add); - compare(&BaseMatrix::sub); - compare(&BaseMatrix::pow2); - compare(&BaseMatrix::addScalar); - compare(&BaseMatrix::subScalar); - compare(&BaseMatrix::mulScalar); - compare(&BaseMatrix::divScalar); - compare(&BaseMatrix::scalarDiv); - compare(&BaseMatrix::addSquare); - compare(&BaseMatrix::isEqualTo); - } - } -} - -// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function -TEST(BaseMatrix, BaseMatrix_BaseMatrix) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - auto compare = [height, - width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) { - AutoCompare test(height, width, 1e-5); - test.cmpWithoutArg<0, 1>(f, height, width); - }; - - compare(&BaseMatrix::softCrossEntropy); - compare(&BaseMatrix::softCrossEntropyBp); - compare(&BaseMatrix::binaryLabelCrossEntropy); - compare(&BaseMatrix::binaryLabelCrossEntropyBp); - compare(&BaseMatrix::sub); - compare(&BaseMatrix::add2); - compare(&BaseMatrix::dotMul); - compare(&BaseMatrix::dotDiv); - compare(&BaseMatrix::logisticRegressionLoss); - compare(&BaseMatrix::logisticRegressionLossBp); - compare(&BaseMatrix::biggerThan); - compare(&BaseMatrix::max2); - compare(&BaseMatrix::dotMulSquare); - compare(&BaseMatrix::dotSquareSquare); - } - } -} - -void TestEelementWise(size_t height, size_t width) { - AutoCompare rowScale(height, width); - rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width); - - AutoCompare rowDotMul(height, width); - rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width); - - AutoCompare binaryClassificationError(height, width); - binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>( - &BaseMatrix::binaryClassificationError, height, width); - - AutoCompare sumOfSquaresBp(height, width); - sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width); -} - -void TestAggregateToRow(size_t height, size_t width) { - AutoCompare maxCols(1, width); - maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width); - - AutoCompare minCols(1, width); - minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width); - - AutoCompare addDotMulVMM(1, width); - addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width); - - AutoCompare sumCols(1, width); - sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width); - - AutoCompare collectBias(1, width); - collectBias.cmpWithoutArg<0, 1>( - static_cast(&Matrix::collectBias), - height, - width); -} - -void TestAggregateToCol(size_t height, size_t width) { - AutoCompare maxRows(height, 1); - maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width); - - AutoCompare minRows(height, 1); - minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width); - - AutoCompare sumRows(height, 1); - sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width); - - AutoCompare sumOfSquares(height, 1); - sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width); -} - -TEST(BaseMatrix, Other) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - TestEelementWise(height, width); - TestAggregateToRow(height, width); - 
TestAggregateToCol(height, width); - } - } -} - -#endif diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp deleted file mode 100644 index 395541a76ae5e5497fdaa8b4870e421cbf62608a..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_CpuGpuVector.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA - -#include -#include "paddle/math/Vector.h" -#include "paddle/utils/Util.h" -#include "test_matrixUtil.h" - -using namespace paddle; // NOLINT - -TEST(CpuGpuVector, getData) { - size_t size = 500; - hl_stream_t stream(HPPL_STREAM_DEFAULT); - CpuVectorPtr cpuVec = std::make_shared(size); - GpuVectorPtr gpuVec = std::make_shared(size); - cpuVec->uniform(0.0, 10.0); - gpuVec->copyFrom(*cpuVec, stream); - hl_stream_synchronize(stream); - - CpuGpuVectorPtr vec = std::make_shared(gpuVec); - auto a = vec->getData(false); - auto b = cpuVec->getData(); - hl_stream_synchronize(stream); - checkDataEqual(a, b, size); -} - -TEST(CpuGpuVector, subCreate) { - size_t size1 = 1024; - size_t offset = 100; - size_t size2 = 500; - hl_stream_t stream(HPPL_STREAM_DEFAULT); - CpuGpuVectorPtr v1 = std::make_shared(size1, /*useGpu*/ false); - auto vec = v1->getMutableVector(false); - vec->uniform(0.0, 10.0); - auto v2 = std::make_shared(*v1, offset, size2); - CHECK_EQ(*v1->getSync(), *v2->getSync()); - - // check subVec equal - checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); - - CpuVectorPtr v1Check = std::make_shared(size1); - CpuVectorPtr v2Check = std::make_shared(size2); - v1Check->copyFrom(*(v1->getVector(true)), stream); - v2Check->copyFrom(*(v2->getVector(true)), stream); - hl_stream_synchronize(stream); - - checkDataEqual(v2->getData(false), v2Check->getData(), size2); - checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); - - CpuVectorPtr noise = std::make_shared(size2); - noise->uniform(0.0, 1.0); - auto v = v2->getMutableVector(false); // will change header - // add noise to subVec - v->add(*noise); - - // check v1_cpu_data == v2_cpu_data - checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2); - - v1Check->copyFrom(*(v1->getVector(true)), stream); - v2Check->copyFrom(*(v2->getVector(true)), stream); - hl_stream_synchronize(stream); - - // check v1_gpu_data == v2_gpu_data - checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2); -} - -#endif diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp deleted file mode 100644 index 72256cb9d4c93159418d27c7ca0d4f8b9a412a64..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_ExecViaCpu.cpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include "paddle/math/SparseMatrix.h" - -using namespace paddle; // NOLINT - -const int height = 10; -const int width = 16; - -real f(Matrix& mat1, - const Matrix& mat2, - IVector& vec1, - const IVector& vec2, - real scalar) { - CHECK(!mat1.useGpu()); - CHECK(!mat2.useGpu()); - CHECK(!vec1.useGpu()); - CHECK(!vec2.useGpu()); - mat1.copyFrom(mat2); - vec1.copyFrom(vec2); - - return scalar; -} - -class Functor { - public: - real operator()(Matrix& mat1, - const Matrix& mat2, - IVector& vec1, - const IVector& vec2, - real scalar) { - a_ = f(mat1, mat2, vec1, vec2, scalar); - return a_; - } - - private: - real a_; -}; - -template -void testWrapper(F&& f) { - MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false); - MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false); - - IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false); - IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false); - - const real scalar = 1.23456; - - MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true); - MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true); - IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true); - IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true); - - cpumat2->randomizeUniform(); - cpuvec2->rand(width); - gpumat2->copyFrom(*cpumat2); - gpuvec2->copyFrom(*cpuvec2); - - real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456); - EXPECT_EQ(ret, scalar); - cpumat1->copyFrom(*gpumat1); - cpuvec1->copyFrom(*gpuvec1); - - for (int i = 0; i < height; ++i) { - EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i)); - for (int j = 0; j < width; ++j) { - EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j)); - } - } - gpumat1->resize(height, 1); - execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1); - - cpumat1->resize(height, 1); - cpumat1->selectElements(*cpumat2, *cpuvec1); - for (int i = 0; i < height; ++i) { - EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0)); - } -} - -#ifdef PADDLE_WITH_CUDA -TEST(ExecViaCpu, test1) { - testWrapper(f); - testWrapper(&f); - - auto lambda = [](Matrix& mat1, - const Matrix& mat2, - IVector& vec1, - const IVector& vec2, - real scalar) -> real { - return f(mat1, mat2, vec1, vec2, scalar); - }; - LOG(INFO) << "lambda is_class=" << std::is_class::value - << " is_function=" << std::is_function::value; - testWrapper(lambda); - - Functor functor; - testWrapper(functor); -} -#endif diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp deleted file mode 100644 index d87fdcda9edc8644301b7fe77f4c0c751d5a774a..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_FPException.cpp +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/** - * This test is about floating point calculation exception. - * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions. - * - * Some exceptions occur in the middle of a set of formulas, - * that can be circumvented by some tricks. - * For example, - * calculate tanh - * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 - * - * If the result of (-2 * a) is too large, - * a FE_OVERFLOW exception occurs when calculating exp. - * But the result of tanh is no overflow problem, - * so we can add some tricks to prevent exp calculate an excessive value. - * - */ - -#include -#include "paddle/math/Matrix.h" -#include "paddle/utils/Common.h" - -using namespace paddle; // NOLINT - -void SetTensorValue(Matrix& matrix, real value) { - int height = matrix.getHeight(); - int width = matrix.getWidth(); - int stride = matrix.getStride(); - real* data = matrix.getData(); - for (int i = 0; i < height; i++) { - int j = rand() % width; // NOLINT - if (typeid(matrix) == typeid(CpuMatrix)) { - data[i * stride + j] = value; - } else if (typeid(matrix) == typeid(GpuMatrix)) { - hl_memcpy(&data[i * stride + j], &value, sizeof(real)); - } else { - LOG(FATAL) << "should not reach here"; - } - } -} - -template -void testTanh(real illegal) { - MatrixPtr A = std::make_shared(10, 10); - MatrixPtr B = std::make_shared(10, 10); - A->randomizeUniform(); - B->randomizeUniform(); - - SetTensorValue(*A, illegal); - - A->tanh(*B); -} - -template -void testSigmoid(real illegal) { - MatrixPtr A = std::make_shared(10, 10); - MatrixPtr B = std::make_shared(10, 10); - A->randomizeUniform(); - B->randomizeUniform(); - - SetTensorValue(*A, illegal); - - A->sigmoid(*B); -} - -TEST(fp, overflow) { - for (auto illegal : {-90.0, 90.0}) { - LOG(INFO) << " illegal=" << illegal; - testTanh(illegal); - testSigmoid(illegal); - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - - feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); - return RUN_ALL_TESTS(); -} diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp deleted file mode 100644 index 828159660bae1ad1c0b56fd7202f0357549877ca..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_GpuProfiler.cpp +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_CUDA - -#include -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/testing/TestUtil.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - real a = data1[i * width + j]; - real b = data2[i * width + j]; - if (fabs(a - b) > err) { - if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { - count++; - } - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void testBilinearFwdBwd(int numSamples, - int imgSizeH, - int imgSizeW, - int channels) { - int inWidth = imgSizeH * imgSizeW * channels; - int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; - real ratioH = 0.5; - real ratioW = 0.5; - - // forward - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); - - input->randomizeUniform(); - inputGpu->copyFrom(*input); - - { - // nvprof: GPU Proflier - REGISTER_GPU_PROFILER("testBilinearFwdBwd"); - target->bilinearForward(*input, - imgSizeH, - imgSizeW, - 2 * imgSizeH, - 2 * imgSizeW, - channels, - ratioH, - ratioW); - targetGpu->bilinearForward(*inputGpu, - imgSizeH, - imgSizeW, - 2 * imgSizeH, - 2 * imgSizeW, - channels, - ratioH, - ratioW); - } - - // check - targetCheck->copyFrom(*targetGpu); - MatrixCheckErr(*target, *targetCheck); - - // backward - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - MatrixPtr targetCheckGrad = - CpuMatrix::create(numSamples, inWidth, false, false); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->bilinearBackward(*targetGrad, - 2 * imgSizeH, - 2 * imgSizeW, - imgSizeH, - imgSizeW, - channels, - ratioH, - ratioW); - inputGpuGrad->bilinearBackward(*targetGpuGrad, - 2 * imgSizeH, - 2 * imgSizeW, - imgSizeH, - imgSizeW, - channels, - ratioH, - ratioW); - - // check - targetCheckGrad->copyFrom(*inputGpuGrad); - MatrixCheckErr(*inputGrad, *targetCheckGrad); -} - -TEST(Profiler, testBilinearFwdBwd) { - auto numSamples = 10; - auto channels = 16; - auto imgSize = 64; - { - // nvprof: GPU Proflier - REGISTER_GPU_PROFILER("testBilinearFwdBwd"); - // Paddle built-in timer - REGISTER_TIMER_INFO( - "testBilinearFwdBwd", - "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); - testBilinearFwdBwd(numSamples, imgSize, imgSize, channels); - } - globalStat.printAllStatus(); -} - -int 
main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - - // nvprof: GPU Proflier - REGISTER_GPU_PROFILER( - "RecursiveProfilingTest", - "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); - - return RUN_ALL_TESTS(); -} - -#endif diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp deleted file mode 100644 index e38de853e03874be3fd3582f7b39b1d490886d78..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_RowBuffer.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/math/RowBuffer.h" - -TEST(RowBuffer, testAutoGrow) { - paddle::RowBuffer buf(128); - ASSERT_EQ(128UL, buf.getWidth()); - ASSERT_TRUE(buf.isAutoGrowth()); - buf.resize(2); - ASSERT_EQ(2UL, buf.getRowCount()); - for (size_t i = 0; i < buf.getWidth() * 2; ++i) { - buf.data()[i] = i; - } - for (size_t i = 0; i < buf.getRowCount(); ++i) { - for (size_t j = 0; j < buf.getWidth(); ++j) { - ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); - } - } - - auto data = buf.getWithAutoGrowth(2); - for (size_t i = 0; i < buf.getWidth(); ++i) { - data[i] = i; - } - - ASSERT_EQ(3UL, buf.getRowCount()); - for (size_t i = 0; i < buf.getRowCount() - 1; ++i) { - for (size_t j = 0; j < buf.getWidth(); ++j) { - ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); - } - } - for (size_t i = 0; i < buf.getWidth(); ++i) { - ASSERT_NEAR(i, buf.get(2)[i], 1e-5); - } -} - -TEST(RowBuffer, testWithMemBuf) { - paddle::CpuMemHandlePtr mem = - std::make_shared(128 * 2 * sizeof(real)); - paddle::RowBuffer buf(mem, 128); - ASSERT_TRUE(!buf.isAutoGrowth()); - ASSERT_EQ(2UL, buf.getRowCount()); - for (size_t i = 0; i < buf.getWidth() * 2; ++i) { - buf.data()[i] = i; - } - for (size_t i = 0; i < buf.getRowCount(); ++i) { - for (size_t j = 0; j < buf.getWidth(); ++j) { - ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5); - } - } - - ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*"); -} diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp deleted file mode 100644 index b692679436ee7bd3b8c4a675e969e15b065cc534..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/math/SIMDFunctions.h" -#include "paddle/utils/Util.h" - -#include - -#include -#include -#include -#include - -#include -#include - -static constexpr size_t VECTOR_LEN = 3072; -static constexpr size_t BATCH_SIZE = 64; -static constexpr size_t ALIGN = 32; -static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0"); -static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0"); -static constexpr float EPSILON = 1e-5; -static std::mt19937 RandomEngine(time(0)); - -inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, - size_t align = ALIGN) { - float* ptr; - CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); - return std::unique_ptr(ptr); -} - -inline static std::unique_ptr NewRandomVector(size_t len = VECTOR_LEN, - size_t align = ALIGN) { - std::uniform_real_distribution dist(-100.0f, 100.0f); - auto generator = std::bind(dist, RandomEngine); - auto retv = NewVector(len, align); - std::generate_n(retv.get(), len, generator); - return retv; -} - -TEST(SIMDFunction, addTo) { - typedef std::function AddToMethodType; - - AddToMethodType naive = paddle::simd::naive::addTo; - AddToMethodType simd = paddle::simd::addTo; - - auto A = NewRandomVector(); - auto B = NewRandomVector(); - - auto ACopy = NewVector(); - memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float)); - - naive(A.get(), B.get(), VECTOR_LEN); - simd(ACopy.get(), B.get(), VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(A[i], ACopy[i], EPSILON); - } -} - -TEST(SIMDFunction, batchAddTo) { - auto A = NewRandomVector(); - auto ACopy = NewVector(); - memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN); - - std::vector> B; - for (size_t i = 0; i < BATCH_SIZE; ++i) { - B.emplace_back(NewRandomVector()); - } - std::unique_ptr BRaw(new float*[BATCH_SIZE]); - for (size_t i = 0; i < BATCH_SIZE; ++i) { - BRaw[i] = B[i].get(); - } - - typedef std::function - BatchAddToMethodType; - - BatchAddToMethodType naive = paddle::simd::naive::batchAddTo; - BatchAddToMethodType simd = paddle::simd::batchAddTo; - - naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); - simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(A[i], ACopy[i], EPSILON); - } -} - -TEST(SIMDFunction, colMax) { - auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE); - auto naiveResult = NewVector(BATCH_SIZE); - auto simdResult = NewVector(BATCH_SIZE); - - typedef std::function ColMaxMethodType; - ColMaxMethodType naive = paddle::simd::naive::colMax; - ColMaxMethodType simd = paddle::simd::colMax; - - naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); - simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN); - - for (size_t i = 0; i < BATCH_SIZE; ++i) { - ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON); - } -} - -TEST(SIMDFunction, decayL1_WithLR) { - auto dest = NewRandomVector(); - auto src = NewRandomVector(); - auto lr = NewRandomVector(); - auto lambda = 0.23f; - - auto simd_dest = NewVector(); - memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); - - typedef std::function - DecayL1MethodType; - - DecayL1MethodType naive = []( - float* d, float* s, float* lr, float l, size_t len) { - paddle::simd::naive::decayL1(d, s, lr, l, len); - }; - - DecayL1MethodType simd = []( - float* d, float* s, float* lr, float l, size_t len) { - paddle::simd::decayL1(d, s, lr, l, len); - }; - - naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN); - simd(simd_dest.get(), src.get(), 
lr.get(), lambda, VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); - } -} - -TEST(SIMDFunction, decayL1_WithoutLR) { - auto dest = NewRandomVector(); - auto src = NewRandomVector(); - auto lambda = 0.23; - - auto simd_dest = NewVector(); - memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN); - - typedef std::function DecayL1MethodType; - - DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) { - paddle::simd::naive::decayL1(d, s, l, len); - }; - - DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) { - paddle::simd::decayL1(d, s, l, len); - }; - - naive(dest.get(), src.get(), lambda, VECTOR_LEN); - simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN); - - for (size_t i = 0; i < VECTOR_LEN; ++i) { - ASSERT_NEAR(dest[i], simd_dest[i], EPSILON); - } -} diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp deleted file mode 100644 index dbcbeb8d506cf22c026bb7299bf7f71de488cb4a..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_SparseMatrix.cpp +++ /dev/null @@ -1,565 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "test_matrixUtil.h" - -using namespace paddle; // NOLINT - -TEST(Matrix, CopyCpuMatrixToSparseMatrix) { - const size_t HEIGHT = 20; - const size_t WIDTH = 10; - const size_t WIDTH_TEST = 15; - MatrixPtr testMatrix( - new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 5, FLOAT_VALUE, SPARSE_CSR)); - MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH)); - testCpuMatrix->randomizeUniform(); - testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT); - MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST)); - mulCpuMatrix->randomizeUniform(); - MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)), - ret2(new CpuMatrix(HEIGHT, WIDTH_TEST)); - ret1->zeroMem(); - ret2->zeroMem(); - ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0); - ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0); - checkMatrixEqual(ret1, ret2); -} - -struct MatrixPara { - size_t height; - size_t width; - bool trans; - bool sparse; - size_t nnz; - SparseFormat format; -}; - -#ifdef PADDLE_WITH_CUDA -void test_sparse_matrix_mul(MatrixPara paraA, - MatrixPara paraB, - MatrixPara paraC) { - // for cpu sparse matrix mul - MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h; - // for gpu sparse matrix mul - MatrixPtr gpuMatrixA, gpuMatrixB, gpuMatrixC; - // for cpu dense matrix mul - MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC; - - if (paraA.sparse) { - cpuMatrixA = Matrix::createSparseMatrix(paraA.height, - paraA.width, - paraA.nnz, - FLOAT_VALUE, - paraA.format, - paraA.trans, - false); - gpuMatrixA = Matrix::createSparseMatrix(paraA.height, - paraA.width, - paraA.nnz, - FLOAT_VALUE, - paraA.format, - paraA.trans, - true); - } else { - cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); - gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, 
true); - } - cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); - - if (paraB.sparse) { - cpuMatrixB = Matrix::createSparseMatrix(paraB.height, - paraB.width, - paraB.nnz, - FLOAT_VALUE, - paraB.format, - paraB.trans, - false); - gpuMatrixB = Matrix::createSparseMatrix(paraB.height, - paraB.width, - paraB.nnz, - FLOAT_VALUE, - paraB.format, - paraB.trans, - true); - } else { - cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); - gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true); - } - cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); - - if (paraC.sparse) { - cpuMatrixC = Matrix::createSparseMatrix(paraC.height, - paraC.width, - paraC.nnz, - FLOAT_VALUE, - paraC.format, - paraC.trans, - false); - gpuMatrixC = Matrix::createSparseMatrix(paraC.height, - paraC.width, - paraC.nnz, - FLOAT_VALUE, - paraC.format, - paraC.trans, - true); - gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height, - paraC.width, - paraC.nnz, - FLOAT_VALUE, - paraC.format, - paraC.trans, - false); - } else { - cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false); - gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true); - gpuMatrixC_d2h = - Matrix::create(paraC.height, paraC.width, paraC.trans, false); - } - cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false); - - /*matrix init*/ - hl_stream_t stream(HPPL_STREAM_1); - cpuMatrixA->randomizeUniform(); - cpuMatrixB->randomizeUniform(); - cpuMatrixC->randomizeUniform(); - - gpuMatrixA->copyFrom(*cpuMatrixA, stream); - gpuMatrixB->copyFrom(*cpuMatrixB, stream); - gpuMatrixC->copyFrom(*cpuMatrixC, stream); - - cpuDenseA->copyFrom(*cpuMatrixA); - cpuDenseB->copyFrom(*cpuMatrixB); - cpuDenseC->copyFrom(*cpuMatrixC); - - hl_stream_synchronize(stream); - - /*matrix mul*/ - cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0); - gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0); - cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0); - - gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream); - hl_stream_synchronize(stream); - - /*check result*/ - if (paraC.sparse) { - checkSMatrixEqual( - std::dynamic_pointer_cast(cpuMatrixC), - std::dynamic_pointer_cast(gpuMatrixC_d2h)); - checkSMatrixEqual2Dense( - std::dynamic_pointer_cast(cpuMatrixC), - std::dynamic_pointer_cast(cpuDenseC)); - } else { - checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h); - checkMatrixEqual(cpuMatrixC, cpuDenseC); - } -} - -TEST(Matrix, SparseMatrixMul) { - const size_t DIM_M = 4; - const size_t DIM_N = 4; - const size_t DIM_K = 8; - const size_t NNZ = 5; - for (auto format : {SPARSE_CSC, SPARSE_CSR}) { - std::string str_format = format == SPARSE_CSC ? 
"CSC" : "CSR"; - LOG(INFO) << "test dense mul " << str_format; - test_sparse_matrix_mul( - {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format}, - {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format}, - {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}); - - LOG(INFO) << "test dense mul " << str_format << " trans"; - test_sparse_matrix_mul( - {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format}, - {DIM_N, DIM_K, /*trans*/ true, /*sparse*/ true, NNZ, format}, - {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}); - - LOG(INFO) << "test dense mul dense 2 " << str_format; - test_sparse_matrix_mul( - {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format}, - {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}, - {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format}); - - LOG(INFO) << "test denseT mul dense 2 " << str_format; - test_sparse_matrix_mul( - {DIM_K, DIM_M, /*trans*/ true, /*sparse*/ false, NNZ, format}, - {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format}, - {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format}); - } -} - -TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) { - const size_t HEIGHT = 20; - const size_t WIDTH = 10; - const size_t WIDTH_TEST = 15; - MatrixPtr testMatrix( - new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 2, FLOAT_VALUE, SPARSE_CSR)); - MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH)); - testCpuMatrix->randomizeUniform(); - testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT); - - MatrixPtr testGpuMatrix = testMatrix->clone(HEIGHT, WIDTH, true); - hl_stream_t gpuStream(HPPL_STREAM_3); - testGpuMatrix->copyFrom(*testMatrix, gpuStream); - hl_stream_synchronize(gpuStream); - - MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST)); - mulCpuMatrix->randomizeUniform(); - MatrixPtr mulGpuMatrix(new GpuMatrix(WIDTH, WIDTH_TEST)); - mulGpuMatrix->copyFrom(*mulCpuMatrix); - MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)); - MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST)); - ret1->zeroMem(); - ret2->zeroMem(); - ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0); - ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0); - checkMatrixEqual(ret1, ret2); -} - -#endif - -TEST(Matrix, SparseMatrixTranspose) { - for (auto height : {10, 50, 100}) { - for (auto width : {10, 50, 100}) { - auto nnz = height * width; - for (auto valueType : {FLOAT_VALUE, NO_VALUE}) { - for (auto format : {SPARSE_CSR, SPARSE_CSC}) { - for (auto sparseRate : {0.1, 0.2, 0.5}) { - MatrixPtr matA = Matrix::createSparseMatrix( - height, width, size_t(nnz * sparseRate), valueType, format); - MatrixPtr matB(new CpuSparseMatrix( - width, height, size_t(nnz * sparseRate), valueType, format)); - matA->randomizeUniform(); - matA->transpose(matB, false); - - /*dense matrix transpose*/ - CpuMatrixPtr matC(new CpuMatrix(height, width)); - matC->copyFrom(*matA); - MatrixPtr matD(new CpuMatrix(width, height)); - matC->transpose(matD, false); - - /*check result*/ - checkSMatrixEqual2Dense( - std::dynamic_pointer_cast(matB), - std::dynamic_pointer_cast(matD)); - } - } - } - } - } -} - -TEST(Matrix, CpuSparseMatrixSubMatrix) { - const size_t HEIGHT = 10; - const size_t WIDTH = 10; - const size_t NNZ = HEIGHT * WIDTH; - for (auto valueType : {FLOAT_VALUE, NO_VALUE}) { - size_t startRow = 3; - size_t rowNum = 2; - real sparseRate = 0.1; - /*sparse matrix init and get subMatrix*/ - CpuSparseMatrixPtr matA = std::make_shared( - HEIGHT, WIDTH, size_t(NNZ * sparseRate), valueType, SPARSE_CSR); - 
matA->randomizeUniform(); - CpuSparseMatrixPtr matB = std::dynamic_pointer_cast( - matA->subMatrix(startRow, rowNum)); - - int start = matA->getRows()[startRow]; - int end = matA->getRows()[startRow + rowNum]; - - /*compare two matrix*/ - ASSERT_EQ(matB->getElementCnt(), size_t(end - start)); - if (valueType == FLOAT_VALUE) { - for (size_t i = 0; i < matB->getElementCnt(); i++) { - ASSERT_FLOAT_EQ(matB->getValue()[start + i], - matA->getValue()[start + i]); - } - } - - for (size_t i = 0; i < matB->getElementCnt(); i++) { - ASSERT_EQ(matB->getCols()[start + i], matA->getCols()[start + i]); - } - for (size_t i = 0; i < rowNum; i++) { - ASSERT_EQ(matB->getRows()[i], matA->getRows()[startRow + i]); - } - } -} - -void sparseValid( - int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) { - CHECK_EQ(nnz, size_t(major[majorLen - 1])); - CHECK_EQ(nnz, minorLen); - for (size_t i = 0; i < majorLen - 1; i++) { - EXPECT_LE(major[i], major[i + 1]); - for (int j = major[i]; j < major[i + 1] - 1; j++) { - EXPECT_LE(minor[j], minor[j + 1]); - } - } -} - -TEST(Matrix, CpuSparseMatrixRandUniform) { - const size_t HEIGHT = 5; - const size_t WIDTH = 10; - const size_t NNZ = HEIGHT * WIDTH; - int* major = nullptr; - int* minor = nullptr; - size_t majorLen = 0; - size_t minorLen = 0; - size_t nnz = 0; - for (auto valueType : {NO_VALUE, FLOAT_VALUE}) { - for (auto format : {SPARSE_CSR, SPARSE_CSC}) { - CpuSparseMatrixPtr matA = std::make_shared( - HEIGHT, WIDTH, size_t(NNZ * 0.1), valueType, format); - matA->randomizeUniform(); - nnz = matA->getElementCnt(); - if (format == SPARSE_CSR) { - majorLen = matA->getHeight() + 1; - minorLen = matA->getElementCnt(); - major = matA->getRows(); - minor = matA->getCols(); - } else { - majorLen = matA->getWidth() + 1; - minorLen = matA->getElementCnt(); - major = matA->getCols(); - minor = matA->getRows(); - } - sparseValid(major, minor, nnz, majorLen, minorLen); - } - } -} - -TEST(Matrix, CpuSparseMatrixCopyFrom) { - size_t height = 10; - size_t width = 8; - int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 30, 32}; - sparse_non_value_t data[32]; - for (size_t i = 0; i < 32; i++) { - data[i].col = ::rand() % width; - } - CpuSparseMatrixPtr mat = std::make_shared( - height, width, 32, NO_VALUE, SPARSE_CSR, false); - mat->copyFrom(indices, data); - - /*compare indices*/ - size_t sum = 0; - CHECK_EQ(sum, size_t(mat->getRows()[0])); - for (size_t i = 1; i < height + 1; i++) { - sum += indices[i] - indices[i - 1]; - CHECK_EQ(sum, size_t(mat->getRows()[i])); - } - CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0])); - for (size_t i = 0; i < mat->getElementCnt(); i++) { - CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col)); - } -} - -TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { - size_t height = 10; - size_t width = 8; - int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32}; - sparse_float_value_t data[32]; - int value[32] = { - 1, // row_0 : 1 - 5, 3, 1, 6, // row_1 : 4 - 0, 1, 2, 3, // row_3 : 4 - 4, 5, 6, 7, // row_4 : 4 - 2, 3, // row_5 : 2 - 3, 5, // row_6 : 2 - 0, 1, // row_7 : 2 - 0, 1, 2, 3, 4, 5, 6, 7, // row_8 : 8 - 2, 4, 7, 3, 1 // row_9 : 5 - }; - for (size_t i = 0; i < 32; i++) { - data[i].col = value[i]; - data[i].value = float(value[i]); - } - CpuSparseMatrixPtr mat = std::make_shared( - height, width, 32, FLOAT_VALUE, SPARSE_CSR, false); - mat->copyFrom(indices, data); - - /*compare indices*/ - size_t sum = 0; - CHECK_EQ(sum, size_t(mat->getRows()[0])); - for (size_t i = 1; i < height + 1; i++) { - sum += 
indices[i] - indices[i - 1]; - CHECK_EQ(sum, size_t(mat->getRows()[i])); - } - CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0])); - for (size_t i = 0; i < mat->getElementCnt(); i++) { - CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col)); - } - - size_t trimedWidth = 4; - int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19}; - sparse_float_value_t trimedData[19]; - int trimedValue[19] = { - 1, // row_0 : 1 - 3, - 1, // row_1 : 2 - 0, - 1, - 2, - 3, // row_3 : 4 - 2, - 3, // row_5 : 2 - 3, // row_6 : 1 - 0, - 1, // row_7 : 2 - 0, - 1, - 2, - 3, // row_8 : 4 - 2, - 3, - 1 // row_9 : 3 - }; - for (size_t i = 0; i < 19; i++) { - trimedData[i].col = trimedValue[i]; - trimedData[i].value = float(trimedValue[i]); - } - CpuSparseMatrixPtr matA = std::make_shared( - height, trimedWidth, 19, FLOAT_VALUE, SPARSE_CSR, false); - matA->copyFrom(trimedIndices, trimedData); - - /*compare indices*/ - sum = 0; - CHECK_EQ(sum, size_t(matA->getRows()[0])); - for (size_t i = 1; i < height + 1; i++) { - sum += trimedIndices[i] - trimedIndices[i - 1]; - CHECK_EQ(sum, size_t(matA->getRows()[i])); - } - CHECK_EQ(matA->getElementCnt(), - size_t(trimedIndices[height] - trimedIndices[0])); - for (size_t i = 0; i < matA->getElementCnt(); i++) { - CHECK_EQ(size_t(matA->getCols()[i]), size_t(trimedData[i].col)); - } - - CpuSparseMatrixPtr matB = std::make_shared( - height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, false); - matB->trimFrom(*mat); - checkSMatrixEqual2(matA, matB); - -#ifdef PADDLE_WITH_CUDA - GpuSparseMatrixPtr matC = std::make_shared( - height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true); - matC->trimFrom(*mat); - - CpuSparseMatrixPtr matD = - std::make_shared(height, - trimedWidth, - matC->getElementCnt(), - FLOAT_VALUE, - SPARSE_CSR, - false); - matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - checkSMatrixEqual2(matA, matD); -#endif -} - -TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { - size_t height = 8; - size_t width = 10; - int indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32}; - int value[32] = { - 1, // col_0 : 1 - 5, 3, 1, 6, // col_1 : 4 - 0, 1, 2, 3, // col_3 : 4 - 4, 5, 6, 7, // col_4 : 4 - 2, 3, // col_5 : 2 - 3, 5, // col_6 : 2 - 0, 1, // col_7 : 2 - 0, 1, 2, 3, 4, 5, 6, 7, // col_8 : 8 - 2, 4, 7, 3, 1 // col_9 : 5 - }; - std::vector rows(value, value + 32); - std::vector cols(indices, indices + 11); - std::vector values(value, value + 32); - CpuSparseMatrixPtr mat = std::make_shared( - height, width, 32, FLOAT_VALUE, SPARSE_CSC, false); - mat->copyFrom(rows, cols, values); - - /*compare indices*/ - size_t sum = 0; - CHECK_EQ(sum, size_t(mat->getCols()[0])); - for (size_t i = 1; i < width + 1; i++) { - sum += indices[i] - indices[i - 1]; - CHECK_EQ(sum, size_t(mat->getCols()[i])); - } - CHECK_EQ(mat->getElementCnt(), size_t(indices[width] - indices[0])); - for (size_t i = 0; i < mat->getElementCnt(); i++) { - CHECK_EQ(size_t(mat->getRows()[i]), size_t(value[i])); - } - - size_t trimedWidth = 5; - int trimedIndices[6] = {0, 1, 5, 5, 9, 13}; - int trimedValue[13] = { - 1, // col_0 : 1 - 5, - 3, - 1, - 6, // col_1 : 4 - 0, - 1, - 2, - 3, // col_3 : 4 - 4, - 5, - 6, - 7 // col_4 : 4 - }; - std::vector rowsA(trimedValue, trimedValue + 13); - std::vector colsA(trimedIndices, trimedIndices + 6); - std::vector valuesA(trimedValue, trimedValue + 13); - CpuSparseMatrixPtr matA = std::make_shared( - height, trimedWidth, 13, FLOAT_VALUE, SPARSE_CSC, false); - matA->copyFrom(rowsA, colsA, valuesA); - - 
/*compare indices*/ - sum = 0; - CHECK_EQ(sum, size_t(matA->getCols()[0])); - for (size_t i = 1; i < trimedWidth + 1; i++) { - sum += trimedIndices[i] - trimedIndices[i - 1]; - CHECK_EQ(sum, size_t(matA->getCols()[i])); - } - CHECK_EQ(matA->getElementCnt(), - size_t(trimedIndices[trimedWidth] - trimedIndices[0])); - for (size_t i = 0; i < matA->getElementCnt(); i++) { - CHECK_EQ(size_t(matA->getRows()[i]), size_t(rowsA[i])); - } - - CpuSparseMatrixPtr matB = std::make_shared( - height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, false); - matB->trimFrom(*mat); - checkSMatrixEqual2(matA, matB); - -#ifdef PADDLE_WITH_CUDA - GpuSparseMatrixPtr matC = std::make_shared( - height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true); - matC->trimFrom(*mat); - - CpuSparseMatrixPtr matD = - std::make_shared(height, - trimedWidth, - matC->getElementCnt(), - FLOAT_VALUE, - SPARSE_CSC, - false); - matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - checkSMatrixEqual2(matA, matD); -#endif -} diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu deleted file mode 100644 index acb2da86d0f41d12fced97d1ddaf5be00959fb82..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_Tensor.cu +++ /dev/null @@ -1,1162 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "TensorCheck.h" -#include "paddle/math/Matrix.h" - -using paddle::Matrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using paddle::CpuVector; -using paddle::GpuVector; -using paddle::CpuIVector; -using paddle::GpuIVector; -using autotest::TensorCheckEqual; -using autotest::TensorCheckErr; - -#define INIT_UNARY(A1, A2) \ - Tensor A1(height, width); \ - Tensor A2(height, width); \ - A1.randomizeUniform(); \ - A2.copyFrom(A1) -#define INIT_BINARY(A1, A2, B) \ - INIT_UNARY(A1, A2); \ - Tensor B(height, width); \ - B.randomizeUniform() -#define INIT_TERNARY(A1, A2, B, C) \ - INIT_BINARY(A1, A2, B); \ - Tensor C(height, width); \ - C.randomizeUniform() -#define INIT_QUATERNARY(A1, A2, B, C, D) \ - INIT_TERNARY(A1, A2, B, C); \ - Tensor D(height, width); \ - D.randomizeUniform() - -template -struct TestUnaryMatrix { - typedef std::function UnaryFunc; - - explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_UNARY(A1, A2); - testUnaryFunc(A1, A2); - } - } - } -}; - -template -struct TestBinaryMatrix { - typedef std::function BinaryFunc; - - explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_BINARY(A1, A2, B); - testBinaryFunc(A1, A2, B); - } - } - } -}; - -template -struct TestTernaryMatrix { - typedef std::function - TernaryFunc; - - explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_TERNARY(A1, A2, B, C); - testTernaryFunc(A1, A2, B, C); - } - } - } -}; - -template -struct TestQuaternaryMatrix { - typedef std::function - QuaternaryFunc; - - explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { - LOG(INFO) << " height=" << height << " width=" << width; - INIT_QUATERNARY(A1, A2, B, C, D); - testQuaternaryFunc(A1, A2, B, C, D); - } - } - } -}; - -template -struct TestUnaryVectorT { - typedef std::function UnaryFunc; - - explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) { - for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) { - LOG(INFO) << " size=" << size; - Tensor A1(size); - Tensor A2(size); - if (typeid(T) == typeid(real)) { - A1.rand(); - } else { - A1.rand(1000); - } - A2.copyFrom(A1); - testUnaryFunc(A1, A2); - } - } -}; - -void SetTensorValue(Matrix& matrix, real value) { - int height = matrix.getHeight(); - int width = matrix.getWidth(); - int stride = matrix.getStride(); - real* data = matrix.getData(); - for (int i = 0; i < height; i++) { - int j = rand() % width; // NOLINT - if (typeid(matrix) == typeid(CpuMatrix)) { - data[i * stride + j] = value; - } else if (typeid(matrix) == typeid(GpuMatrix)) { - hl_memcpy(&data[i * stride + j], &value, sizeof(real)); - } else { - } - } -} - -template -void testTensorAddScalar(Tensor& A1, Tensor& A2) { - real p1 = 2.5; - real p2 = 3.0; - A1.add(p1); // a += p - A2 += p1; - TensorCheckEqual(A1, A2); - - A1.add(p1, p2); // a = a * p1 + p2 - A2 = A2 * p1 + p2; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSubScalar(Tensor& A1, Tensor& A2) { - real p = 2.5; - 
A1.subScalar(p); // a -= p - A2 -= p; - TensorCheckEqual(A1, A2); -} - -template -void testTensorMulScalar(Tensor& A1, Tensor& A2) { - real p = 2.5; - A1.mulScalar(p); // a *= p - A2 *= p; - TensorCheckEqual(A1, A2); - - real learningRate = 0.7f; - real decayRate = 1.2f; - A1.applyL2(learningRate, decayRate); - A2 = A2 * (1.0f / (1.0f + learningRate * decayRate)); - TensorCheckEqual(A1, A2); -} - -template -void testTensorDivScalar(Tensor& A1, Tensor& A2) { - real p = 2.5; - A1.divScalar(p); // a /= p - A2 /= p; - TensorCheckEqual(A1, A2); -} - -template -void testTensorNeg(Tensor& A1, Tensor& A2) { - A1.neg(); // a = -a - A2 = -A2; - TensorCheckEqual(A1, A2); -} - -template -void testTensorAbs(Tensor& A1, Tensor& A2) { - A1.abs2(); // a = a > 0 ? a : -a - A2 = A2.abs(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSquare(Tensor& A1, Tensor& A2) { - A1.square2(); // a = a * a - A2 = A2.square(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocal(Tensor& A1, Tensor& A2) { - A1.reciprocal2(); // a = 1.0f / a - A2 = A2.reciprocal(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSign(Tensor& A1, Tensor& A2) { - A1.sign2(); // a = (a > 0) - (a < 0) - A2 = A2.sign(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorAssign(Tensor& A1, Tensor& A2) { - A1.assign(1.5); // a = p - A2 = A2.constant(1.5); - TensorCheckEqual(A1, A2); - - A1.one(); // a = 1 - A2 = A2.constant(1.0); - TensorCheckEqual(A1, A2); - - A1.zero(); // a = 0 - A2 = A2.constant(0.0); - TensorCheckEqual(A1, A2); -} - -template -void testUnaryBaseOp(Tensor& A1, Tensor& A2) { - testTensorAddScalar(A1, A2); - testTensorSubScalar(A1, A2); - testTensorMulScalar(A1, A2); - testTensorDivScalar(A1, A2); - testTensorNeg(A1, A2); - testTensorAbs(A1, A2); - testTensorSquare(A1, A2); - testTensorReciprocal(A1, A2); - testTensorSign(A1, A2); - testTensorAssign(A1, A2); -} - -template -void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { - A1.add(2); // a += p - A2 += 2; - TensorCheckEqual(A1, A2); - - A1.add(3, 2); // a = a * p1 + p2 - A2 = A2 * 3 + 2; - TensorCheckEqual(A1, A2); - - testTensorNeg(A1, A2); - testTensorAbs(A1, A2); -} - -TEST(Unary, BaseOp) { - TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); - TestUnaryVectorT testCpuVector(testUnaryBaseOp); - TestUnaryVectorT testCpuIVector( - testUnaryBaseOpInt); - -#ifdef PADDLE_WITH_GPU - TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); - TestUnaryVectorT testGpuVector(testUnaryBaseOp); - TestUnaryVectorT testGpuIVector( - testUnaryBaseOpInt); -#endif -} - -template -void testTensorExp(Tensor& A1, Tensor& A2) { - A1.exp2(); // a = exp(a) - A2 = A2.exp(); - TensorCheckErr(A1, A2); -} - -template -void testTensorLog(Tensor& A1, Tensor& A2) { - A1.log2(); // a = log(a) - A2 = A2.log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSqrt(Tensor& A1, Tensor& A2) { - A1.sqrt2(); // a = sqrt(a) - A2 = A2.sqrt(); - TensorCheckErr(A1, A2); -} - -template -void testTensorPow(Tensor& A1, Tensor& A2) { - A1.pow2(3.2); // a = pow(a, p) - A2 = A2.pow(3.2); - TensorCheckErr(A1, A2); -} - -template -void testUnayrMathOp(Tensor& A1, Tensor& A2) { - testTensorExp(A1, A2); - testTensorLog(A1, A2); - testTensorSqrt(A1, A2); - testTensorPow(A1, A2); -} - -TEST(Unary, MathOp) { - TestUnaryMatrix testCpu(testUnayrMathOp); - -#ifdef PADDLE_WITH_GPU - TestUnaryMatrix testGpu(testUnayrMathOp); -#endif -} - -template -void testTensorClip(Tensor& A1, Tensor& A2) { - real p1 = 0.003f; - real p2 = 0.877f; - A1.clip(p1, p2); // a = a < p1 ? p1 : (a > p2 ? 
p2 : a) - // A2 = A2.min(0.877f).max(0.003f); - A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2)); - TensorCheckEqual(A1, A2); -} - -template -void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { - real p = 0.5f; - A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f - A2 = (A2 > p).condition((real)1.0, (real)0.0); - TensorCheckEqual(A1, A2); -} - -template -void testTensorapplyL1(Tensor& A1, Tensor& A2) { - /** - * T lambda = p; - * a = (a > lambda) ? (a - lambda) - * : (a < -lambda) ? (a + lambda) : 0 - * - * p = learningRate * decayRate; - */ - real learningRate = 0.7f; - real decayRate = 0.6f; - A1.applyL1(learningRate, decayRate); - A2 = (A2 > (learningRate * decayRate)) - .condition( - (A2 - (learningRate * decayRate)), - (A2 < -(learningRate * decayRate)) - .condition((A2 + (learningRate * decayRate)), (real)0.0)); - TensorCheckEqual(A1, A2); -} - -template -void testUnayrCompareOp(Tensor& A1, Tensor& A2) { - testTensorClip(A1, A2); - testTensorBiggerThanScalar(A1, A2); - - A1.randomizeUniform(); - A1.subScalar(0.5f); - A2.copyFrom(A1); - testTensorapplyL1(A1, A2); -} - -TEST(Unary, CompareOp) { - TestUnaryMatrix testCpu(testUnayrCompareOp); - -#ifdef PADDLE_WITH_GPU - TestUnaryMatrix testGpu(testUnayrCompareOp); -#endif -} - -template -void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { - real p1 = 2.5; - real p2 = 3.2; - A1.add(B); // a += b - A2 += B; - TensorCheckEqual(A1, A2); - - A1.add(B, p1); // a += b * p - A2 += B * p1; - TensorCheckEqual(A1, A2); - - A1.add(B, p1, p2); // a = p1 * a + p2 * b - A2 = A2 * p1 + B * p2; - TensorCheckEqual(A1, A2); - - A1.addScalar(B, p1); // a = b + p - A2 = B + p1; - TensorCheckEqual(A1, A2); - - A1.addSquare(B, p1); // a += p * b * b - A2 += B.constant(p1) * B * B; - TensorCheckEqual(A1, A2); - - A1.decayAddSquare(B, p1, p2); // a = p1 * a + p2 * b * b - A2 = A2 * p1 + B.constant(p2) * B * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 2.5; - A1.sub(B); // a -= b - A2 -= B; - TensorCheckEqual(A1, A2); - - A1.sub(B, p); // a -= b * p - A2 -= B * p; - TensorCheckEqual(A1, A2); - - A1.subScalar(B, p); // a = b - p - A2 = B - p; - TensorCheckEqual(A1, A2); -} - -template -void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 2.5; - A1.mulScalar(B, p); // a = b * p - A2 = B * p; - TensorCheckEqual(A1, A2); - - A1.dotMulSquare(B); // a *= b * b - A2 *= B * B; - TensorCheckEqual(A1, A2); - - A1.dotSquareMul(B); // a = a * a * b - A2 = A2 * A2 * B; - TensorCheckEqual(A1, A2); - - A1.dotMul(B); // a *= b - A2 *= B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 2.5; - A1.divScalar(B, p); // a = b / p - A2 = B / p; - TensorCheckEqual(A1, A2); - - A1.scalarDiv(B, p); // a = p / b - A2 = B.constant(p) / B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { - A1.assign(B); // a = b - A2 = B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { - B.square2(A1); // b = a * a - A2 = B.square(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.squareDerivative(B); // a *= 2.0 * b - A2 = A2 * (real)2.0 * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { - B.reciprocal2(A1); // b = 1.0f / a - A2 = B.reciprocal(); - TensorCheckEqual(A1, A2); - - real p1 = 0.58; - real p2 = 
0.32; - A1.reciprocal2(B, p1, p2); // a = 1 / (p1 * b + p2) - A2 = (B * p1 + p2).reciprocal(); - TensorCheckEqual(A1, A2); - - real learningRate = 0.7f; - real decayRate = 1.2f; - A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) - A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B) - .reciprocal(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.reciprocalDerivative(B); // a *= -b * b - A2 *= (-B) * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { - B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f - A2 = B.sign(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { - B.abs2(A1); // b = a > 0.0f ? a : -a - A2 = B.abs(); - TensorCheckEqual(A1, A2); -} - -template -void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { - testTensorAdd(A1, A2, B); - testTensorSub(A1, A2, B); - testTensorMul(A1, A2, B); - testTensorDiv(A1, A2, B); - testTensorSquare(A1, A2, B); - testTensorSquareDerivative(A1, A2, B); - testTensorReciprocal(A1, A2, B); - testTensorReciprocalDerivative(A1, A2, B); - testTensorAbs(A1, A2, B); - testTensorSign(A1, A2, B); - testTensorAssign(A1, A2, B); -} - -TEST(Binary, BaseOp) { - TestBinaryMatrix testCpu(testBinaryBaseOp); - -#ifdef PADDLE_WITH_GPU - TestBinaryMatrix testGpu(testBinaryBaseOp); -#endif -} - -template -void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { - // a = exp(b) - A1.exp2(B); - A2 = B.exp(); - TensorCheckErr(A1, A2); -} - -template -void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.expDerivative(B); // a *= b - A2 *= B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { - // a = log(b) - A1.log2(B); - A2 = B.log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { - // a = sqrt(b) - A1.sqrt2(B); - A2 = B.sqrt(); - TensorCheckErr(A1, A2); -} - -template -void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { - // a = 1.0f / sqrt(b) - A1.invSqrt(B); - A2 = B.sqrt().reciprocal(); - TensorCheckErr(A1, A2); -} - -template -void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { - A1.pow2(B, 2.5f); // a = pow(b, p) - A2 = B.pow(2.5f); - TensorCheckErr(A1, A2); -} - -template -void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { - /* - * const T THRESHOLD = 40.0; - * b = log(1.0 + - * exp((a > THRESHOLD) ? THRESHOLD - * : ((a < -THRESHOLD) ? (-THRESHOLD) : a))) - */ - B.softrelu(A1); - - real THRESHOLD = 40.0; - A2 = (B.constant(1.0f) + - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)) - .exp()) - .log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - /* - * const T THRESHOLD = 40.0; - * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) - * ? THRESHOLD - * : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); - */ - A1.softreluDerivative(B); - real THRESHOLD = 40.0; - A2 = A2 * - (B.constant(1.0f) - - (B.constant(-1.0f) * - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))) - .exp()); - TensorCheckErr(A1, A2); -} - -template -void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { - /* - const T THRESHOLD_MIN = -40.0; - const T THRESHOLD_MAX = 13.0; - T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN - : ((a > THRESHOLD_MAX) ? 
THRESHOLD_MAX : a); - b = 1.0f / (1.0f + exp(-tmp))) - */ - B.sigmoid(A1); - - const real THRESHOLD_MIN = -40.0; - const real THRESHOLD_MAX = 13.0; - auto tmp = (B < THRESHOLD_MIN) - .condition(THRESHOLD_MIN, - (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); - A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.sigmoidDerivative(B); // a *= b * (1 - b) - A2 *= B * (B.constant(1.0f) - B); - TensorCheckEqual(A1, A2); -} - -template -void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { - B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 - A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; - TensorCheckErr(A1, A2); -} - -template -void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.tanhDerivative(B); // a *= 1 - b * b - A2 *= B.constant(1.0f) - B * B; - TensorCheckEqual(A1, A2); -} - -template -void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { - real p1 = 2.5; - real p2 = 3.1; - // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) - B.scaledTanh(A1, p1, p2); - A2 = B.constant(p1) * - (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) - - (real)1.0); - TensorCheckErr(A1, A2); -} - -template -void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - real p1 = 2.5; - real p2 = 3.1; - // a *= (p2 / p1) * (p1 * p1 - b * b)); - A1.scaledTanhDerivative(B, p1, p2); - A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B)); - TensorCheckEqual(A1, A2); -} - -template -void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { - testTensorTanhDerivative(A1, A2, B); - testTensorScaledTanhDerivative(A1, A2, B); - testTensorSigmoidDerivative(A1, A2, B); - testTensorExpDerivative(A1, A2, B); - testTensorScaledTanh(A1, A2, B); - testTensorTanh(A1, A2, B); - testTensorExp(A1, A2, B); - testTensorLog(A1, A2, B); - testTensorSqrt(A1, A2, B); - testTensorInvSqrt(A1, A2, B); - testTensorPow(A1, A2, B); - - testTensorSoftrelu(A1, A2, B); - testTensorSoftreluDerivative(A1, A2, B); - testTensorSigmoid(A1, A2, B); -} - -TEST(Binary, MathOp) { - TestBinaryMatrix testCpu(testBinaryMathOp); - -#ifdef PADDLE_WITH_GPU - TestBinaryMatrix testGpu(testBinaryMathOp); -#endif -} - -template -void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { - B.relu(A1); // b = a > 0.0f ? a : 0.0f - A2 = (B > (real)0.0f).condition(B, (real)0.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) - A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); - TensorCheckEqual(A1, A2); -} - -template -void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { - /* - * b = a > p1 ? a : p1 - * b = b < p2 ? b : p2 - * int p1 = 0, p2 = 24; - */ - SetTensorValue(B, 32.0f); - B.brelu(A1); - auto tmp = (B > (real)0.0f).condition(B, (real)0.0f); - A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - SetTensorValue(B, 32.0f); - /* - * a *= (b > p1 && b < p2) ? 1.0 : 0.0 - * int p1 = 0, p2 = 24; - */ - A1.breluDerivative(B); - A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { - A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? 
-a : 0 - A2 = (B > (real)0.0f) - .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f)); - TensorCheckEqual(A1, A2); -} - -template -void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { - real p = 0.613; - SetTensorValue(B, p); - A1.isEqualTo(B, p); // a = (b == p) - A2 = (B == p); - TensorCheckEqual(A1, A2); -} - -template -void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { - /** - * T lambda = p * b; - * a = (a > lambda) ? (a - lambda) - * : (a < -lambda) ? (a + lambda) : 0 - * - * p = learningRate * decayRate; - */ - real learningRate = 0.7f; - real decayRate = 0.6f; - A1.applyL1(B, learningRate, decayRate); - auto lambda = B.constant(learningRate * decayRate) * B; - A2 = (A2 > lambda) - .condition((A2 - lambda), - (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); - TensorCheckEqual(A1, A2); -} - -template -void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { - B.subScalar(0.5f); - SetTensorValue(B, 0.0f); - testTensorReluDerivative(A1, A2, B); - - A1.randomizeUniform(); - A2.copyFrom(A1); - testTensorBreluDerivative(A1, A2, B); - - testTensorAbsDerivative(A1, A2, B); - testTensorRelu(A1, A2, B); - testTensorBrelu(A1, A2, B); - testTensorIsEqualTo(A1, A2, B); -} - -TEST(Binary, CompareOp) { - TestBinaryMatrix testCpu(testBinaryCompareOp); - -#ifdef PADDLE_WITH_GPU - TestBinaryMatrix testGpu(testBinaryCompareOp); -#endif -} - -template -void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.add(B, C); // a = b + c - A2 = B + C; - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - real p3 = 3.8; - A1.add(B, p1, C, p2); // a = p1 * b + p2 * c - A2 = B * p1 + C * p2; - TensorCheckEqual(A1, A2); - - A1.add2(B, C); // a = a + b + c - A2 = A2 + B + C; - TensorCheckEqual(A1, A2); - - A1.add2(B, C, p1, p2, p3); // a = p1 * a + p2 * b + p3 * c - A2 = A2 * p1 + B * p2 + C * p3; - TensorCheckEqual(A1, A2); - - A1.decayAddSquareMul(B, C, p1, p2); // a = p1 * a + p2 * b * b * c * c - A2 = A2 * p1 + B.constant(p2) * B * B * C * C; - TensorCheckEqual(A1, A2); -} - -template -void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.sub(B, C); // a = b - c - A2 = B - C; - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - A1.sub(B, p1, C, p2); // a = p1 * b - p2 * c - A2 = B * p1 - C * p2; - TensorCheckEqual(A1, A2); -} - -template -void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.dotMul(B, C); // a = b * c - A2 = B * C; - TensorCheckEqual(A1, A2); - - A1.dotMulSquare(B, C); // a = b * c * c - A2 = B * C * C; - TensorCheckEqual(A1, A2); - - A1.dotSquareSquare(B, C); // a = b * b * c * c - A2 = B * B * C * C; - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - - /* - * T tmp = p1 * b + p2 * c; - * a *= tmp * tmp - */ - A1.dotMulSquareSum(B, C, p1, p2); - auto tmp = B * p1 + C * p2; - A2 *= tmp * tmp; - TensorCheckEqual(A1, A2); - - /* - * T tmp = p1 * b + p2 * c; - * a = tmp * tmp - */ - A1.dotSquareSum(B, C, p1, p2); - auto tmp2 = B * p1 + C * p2; - A2 = tmp2 * tmp2; - TensorCheckEqual(A1, A2); - - // a *= p1 * b + p2 * c - A1.dotMulSum(B, C, p1, p2); - A2 *= B * p1 + C * p2; - TensorCheckEqual(A1, A2); - - // a = p1 * a + p2 * b * c - A1.addDotMul(B, C, p1, p2); - A2 = A2 * p1 + B.constant(p2) * B * C; - TensorCheckEqual(A1, A2); -} - -template -void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.dotDiv(B, C); // a = (b == 0.0) ? 
0.0 : b / c - A2 = (B == (real)0.0).condition((real)0.0, B / C); - TensorCheckEqual(A1, A2); - - real p1 = 1.5; - real p2 = 2.5; - A1.dotDiv(B, C, p1, p2); // a = (b + p1) / (c + p2) - A2 = (B + p1) / (C + p2); - TensorCheckEqual(A1, A2); -} - -template -void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - real p1 = 1.5; - real p2 = 2.5; - real p3 = 3.5; - A1.reciprocalSum(B, C, p1, p2, p3); // a = 1 / (p1 * b + p2 * c + p3) - A2 = (B * p1 + C * p2 + p3).reciprocal(); - TensorCheckEqual(A1, A2); -} - -template -void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) - A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); - TensorCheckErr(A1, A2); -} - -template -void testTensorSoftCrossEntropyBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - A1.softCrossEntropyBp(B, C); // a += (b - c) / (b * (1 - b)) - A2 += (B - C) / (B * (B.constant(1.0f) - B)); - TensorCheckEqual(A1, A2); -} - -template -void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - testTensorAdd(A1, A2, B, C); - testTensorSub(A1, A2, B, C); - testTensorMul(A1, A2, B, C); - testTensorDiv(A1, A2, B, C); - testTensorReciprocal(A1, A2, B, C); - testTensorSoftCrossEntropyBp(A1, A2, B, C); - - testTensorSoftCrossEntropy(A1, A2, B, C); -} - -TEST(Ternary, BaseOp) { - TestTernaryMatrix testCpu(testTernaryBaseOp); - -#ifdef PADDLE_WITH_GPU - TestTernaryMatrix testGpu(testTernaryBaseOp); -#endif -} - -template -void testTensorBinaryLabelCrossEntropy(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) - A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log())); - TensorCheckErr(A1, A2); -} - -template -void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) - A1.binaryLabelCrossEntropyBp(B, C); - A2 += (C > (real)0.5) - .condition((B.constant(-1.0f) / B), - (B.constant(1.0f) - B).reciprocal()); - TensorCheckErr(A1, A2); -} - -template -void testTensorLogisticRegressionLoss(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - SetTensorValue(B, 50.0f); - SetTensorValue(B, -50.0f); - /** - * const T THRESHOLD = 40.0; - * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) - * ? -THRESHOLD - * : b; - * a = log(1 + exp(x)) - c * x - */ - A1.logisticRegressionLoss(B, C); - real THRESHOLD = 40.0; - auto tmp = - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); - A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; - TensorCheckErr(A1, A2); -} - -template -void testTensorLogisticRegressionLossBp(Tensor& A1, - Tensor& A2, - Tensor& B, - Tensor& C) { - SetTensorValue(B, 50.0f); - SetTensorValue(B, -50.0f); - /** - * const T THRESHOLD = 40.0; - * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) - * ? -THRESHOLD - * : b; - * x = exp(x); a = x / (1 + x) - c - */ - A1.logisticRegressionLossBp(B, C); - real THRESHOLD = 40.0; - auto tmp = - (B > THRESHOLD) - .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); - auto tmp2 = tmp.exp(); - A2 = tmp2 / (C.constant(1.0) + tmp2) - C; - TensorCheckErr(A1, A2); -} - -template -void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.biggerThan(B, C); // a = (b > c) ? 
1.0f : 0.0f - A2 = (B > C).condition((real)1.0f, (real)0.0f); - TensorCheckEqual(A1, A2); -} - -template -void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - A1.max2(B, C); // a = (b > c) ? b : c - A2 = (B > C).condition(B, C); - TensorCheckEqual(A1, A2); -} - -template -void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { - testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); - testTensorBinaryLabelCrossEntropy(A1, A2, B, C); - testTensorBiggerThan(A1, A2, B, C); - testTensorMax(A1, A2, B, C); - - testTensorLogisticRegressionLoss(A1, A2, B, C); - testTensorLogisticRegressionLossBp(A1, A2, B, C); -} - -TEST(Ternary, CompareOp) { - TestTernaryMatrix testCpu(testTernaryCompareOp); - -#ifdef PADDLE_WITH_GPU - TestTernaryMatrix testGpu(testTernaryCompareOp); -#endif -} - -template -void testQuaternaryAdd( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d - // A2 = B * 1.5f + C * 2.5f + D * 3.5f; - // TensorCheckEqual(A1, A2); - - /* - * T tmp = p1 * b + p2 * c + p3 * d; - * a += tmp * tmp - */ - real p1 = 1.5f; - real p2 = 2.5f; - real p3 = 3.5f; - A1.addSquareSum(B, C, D, p1, p2, p3); - auto tmp = B * p1 + C * p2 + D * p3; - A2 += tmp * tmp; - TensorCheckEqual(A1, A2); -} - -TEST(Quaternary, BaseOp) { - TestQuaternaryMatrix testCpu(testQuaternaryAdd); - -#ifdef PADDLE_WITH_GPU - TestQuaternaryMatrix testGpu(testQuaternaryAdd); -#endif -} - -template -void testTensorBiggerThan( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); - A1.biggerThan(B, C, D); - A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5)) - .condition((real)1.0, (real)0.0); - TensorCheckEqual(A1, A2); -} - -template -void testTensorRankLoss( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - /** - * const T THRESHOLD = 40.0; a = b - c; - * a = (a > THRESHOLD) - * ? THRESHOLD - * : ((a < -THRESHOLD) ? (-THRESHOLD) : a); - * a = log(1 + exp(a)) - a * d - */ - A1.rankLoss(B, C, D); - - real THRESHOLD = 40.0; - auto tmp = B - C; - auto tmp2 = - (tmp > THRESHOLD) - .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); - A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; - - TensorCheckErr(A1, A2); -} - -template -void testTensorRankLossBp( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - /** - * const T THRESHOLD = 40.0; a = b - c; - * a = (a > THRESHOLD) - * ? THRESHOLD - * : ((a < -THRESHOLD) ? 
(-THRESHOLD) : a); - * a = exp(a); a = (a / (1 + a) - d) - */ - A1.rankLossBp(B, C, D); - real THRESHOLD = 40.0; - auto tmp = B - C; - auto tmp2 = - (tmp > THRESHOLD) - .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); - auto tmp3 = tmp2.exp(); - A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; - - TensorCheckErr(A1, A2); -} - -template -void testQuaternaryCompareOp( - Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) { - testTensorBiggerThan(A1, A2, B, C, D); - testTensorRankLoss(A1, A2, B, C, D); - testTensorRankLossBp(A1, A2, B, C, D); -} - -TEST(Quaternary, CompareOp) { - TestQuaternaryMatrix testCpu(testQuaternaryCompareOp); - -#ifdef PADDLE_WITH_GPU - TestQuaternaryMatrix testGpu(testQuaternaryCompareOp); -#endif -} diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp deleted file mode 100644 index fb58d26734cab5d7d7bbbbe1cf8a920e4195b4bb..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_TrainingAlgorithm.cpp +++ /dev/null @@ -1,461 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "OriginalOptimizerApi.h" -#include "PerfUtils.h" -#include "TensorCheck.h" -#include "paddle/math/TrainingAlgorithmOp.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT - -#ifndef PADDLE_TYPE_DOUBLE -DEFINE_double(max_diff, 1e-5, "max diff allowed"); -#else -DEFINE_double(max_diff, 1e-13, "max diff allowed"); -#endif - -class SetMaxDiff { - public: - explicit SetMaxDiff(double max_diff) { - max_diff_ = FLAGS_max_diff; - FLAGS_max_diff = max_diff; - } - ~SetMaxDiff() { FLAGS_max_diff = max_diff_; } - - private: - double max_diff_; -}; - -#define COPY_VECTOR_TO_CPU(cpuVec, vector) \ - do { \ - if (vector->useGpu()) { \ - cpuVec = Vector::create(vector->getSize(), false); \ - cpuVec->copyFrom(*vector); \ - } else { \ - cpuVec = vector; \ - } \ - } while (0) - -int VectorCheckErr(const Vector& vector1, const Vector& vector2) { - CHECK(vector1.getSize() == vector2.getSize()); - - const real* data1 = vector1.getData(); - const real* data2 = vector2.getData(); - size_t size = vector1.getSize(); - int count = 0; - for (size_t i = 0; i < size; i++) { - real a = data1[i]; - real b = data2[i]; - if (fabs(a - b) > FLAGS_max_diff) { - if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) { - count++; - } - } - } - - return count; -} - -int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) { - VectorPtr tmp1; - VectorPtr tmp2; - COPY_VECTOR_TO_CPU(tmp1, vector1); - COPY_VECTOR_TO_CPU(tmp2, vector2); - return VectorCheckErr(*tmp1, *tmp2); -} - -#ifdef PADDLE_DISABLE_TIMER - -#define CHECK_VECTORPTR(vector1, vector2) \ - EXPECT_EQ(VectorCheckErr(vector1, vector2), 0) - -#else - -#define CHECK_VECTORPTR(vector1, vector2) - -#endif - -typedef std::function testMatrixFunc; - -void testCase(testMatrixFunc matrixFunc) { -#ifdef PADDLE_WITH_CUDA - for (auto useGpu : {false, true}) { -#else - for (auto useGpu : {false}) { 
-#endif - for (auto size : {1, - 32, - 64, - 128, - 512, - 1024, - 4096, - 32768, - 65536, - 131072, - 262144, - 524288, - 1048576, - 2097152}) { - LOG(INFO) << " size=" << size << " useGpu=" << useGpu; - matrixFunc(size, useGpu); - } - } -} - -#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \ - vec1[type] = Vector::create(size, useGpu); \ - vec2[type] = Vector::create(size, useGpu); \ - vec1[type]->rand(); \ - vec2[type]->copyFrom(*vec1[type]); - -void testAdagrad(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(AdagradParameterOptimizer( - bufs1, epsilon, learningRate, momentum, decayRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(adagradApply(value, - grad, - mom, - accum_buffer, - accum, - lr, - epsilon, - learningRate, - momentum, - decayRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], - bufs2[PARAMETER_GRADIENT_SQURESUM1]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, Adagrad) { testCase(testAdagrad); } - -void testAdaDelta(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - real rou = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer( - bufs1, rou, epsilon, learningRate, momentum, decayRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(adadeltaApply(value, - grad, - mom, - accum, - accum_update, - lr, - rou, - 
epsilon, - learningRate, - momentum, - decayRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], - bufs2[PARAMETER_GRADIENT_SQURESUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], - bufs2[PARAMETER_GRADIENT_SQURESUM1]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, AdaDelta) { testCase(testAdaDelta); } - -template -void testRMSProp(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - /* make sure 'g - f.square()' greater than 0 */ - bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0); - bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom( - *bufs1[PARAMETER_GRADIENT_SQURESUM]); - - real rou = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - real accumulatedRou = rou; - - EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(rmspropApply(value, - grad, - mom, - sum, - sum1, - lr, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], - bufs2[PARAMETER_GRADIENT_SQURESUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], - bufs2[PARAMETER_GRADIENT_SQURESUM1]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, RMSProp) { - testCase(testRMSProp); - testCase(testRMSProp); -} - -template -void testDecayedAdagrad(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); - - real rou = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - real momentum = (real)rand() / (real)RAND_MAX; // NOLINT - real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT - real accumulatedRou = rou; - - if (isFirstTime) { - 
bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); - bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); - } - - EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; - - EXPRESSION_PERFORMANCE(decayedAdagradApply(value, - grad, - mom, - sum, - lr, - accumulatedRou, - rou, - epsilon, - learningRate, - momentum, - decayRate, - isFirstTime)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], - bufs2[PARAMETER_GRADIENT_SQURESUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], - bufs2[PARAMETER_LEARNING_RATE]); -} - -TEST(Training, DecayedAdagrad) { - testCase(testDecayedAdagrad); - testCase(testDecayedAdagrad); -} - -void testAdam(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu); - - real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT - real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT - real beta1_power = (real)rand() / (real)RAND_MAX; // NOLINT - real beta2_power = (real)rand() / (real)RAND_MAX; // NOLINT - real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(AdamParameterOptimizer( - bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM]; - - EXPRESSION_PERFORMANCE(adamApply(value, - grad, - mom, - v, - beta1, - beta2, - beta1_power, - beta2_power, - epsilon, - learningRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM], - bufs2[PARAMETER_SECOND_MOMENTUM]); -} - -TEST(Training, Adam) { testCase(testAdam); } - -void testAdamax(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu); - - real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT - real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT - real alpha = (real)rand() / (real)RAND_MAX; // NOLINT - int64_t step = 2; - - EXPRESSION_PERFORMANCE( - AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; - BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]; - - EXPRESSION_PERFORMANCE( - adamaxApply(value, grad, mom, u, beta1, 
beta2, step, alpha)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); - CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM], - bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]); -} - -TEST(Training, Adamax) { -#ifndef PADDLE_TYPE_DOUBLE - SetMaxDiff diff(1e-4); -#endif - testCase(testAdamax); -} - -void testSparseMomentum(size_t size, bool useGpu) { - VectorPtr bufs1[NUM_PARAMETER_TYPES]; - VectorPtr bufs2[NUM_PARAMETER_TYPES]; - INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu); - INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu); - - real alpha = (real)rand() / (real)RAND_MAX; // NOLINT - real beta = (real)rand() / (real)RAND_MAX; // NOLINT - real gamma = (real)rand() / (real)RAND_MAX; // NOLINT - real tau = (real)rand() / (real)RAND_MAX; // NOLINT - real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT - - EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer( - bufs1, alpha, beta, gamma, tau, learningRate)); - - BaseMatrix& value = *bufs2[PARAMETER_VALUE]; - BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; - BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT]; - BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT]; - - EXPRESSION_PERFORMANCE(sparseMomentumApply( - value, grad, momU, momV, alpha, beta, gamma, tau, learningRate)); - - CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]); - CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]); -} - -TEST(Training, SparseMomentum) { testCase(testSparseMomentum); } diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu deleted file mode 100644 index cbd74bbfe33270f351632b58d7e89f8e60d15b83..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_lazyAssign.cu +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "PerfUtils.h" -#include "TensorCheck.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/TensorAssign.h" - -using paddle::BaseMatrix; -using paddle::CpuMatrix; -using paddle::GpuMatrix; -using autotest::TensorCheckEqual; -using autotest::TensorCheckErr; - -typedef std::function testMatrixFunc; -void testMatrixCase(testMatrixFunc matrixFunc) { - for (auto height : {1}) { - for (auto width : {1, - 32, - 64, - 128, - 512, - 1024, - 4096, - 32768, - 65536, - 131072, - 262144, - 524288, - 1048576, - 2097152, - 4194304, - 8388608}) { - matrixFunc(height, width); - } - } -} - -template -void testLazyAssign(int height, int width) { - Tensor A1(height, width); - Tensor A2(height, width); - Tensor B(height, width); - Tensor C(height, width); - Tensor D(height, width); - A1.randomizeUniform(); - B.randomizeUniform(); - C.randomizeUniform(); - D.randomizeUniform(); - A2.copyFrom(A1); - - EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;); - - EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C); - auto expr2 = A2.lazyAssign(A2 * D); - AssignEvaluate(expr1, expr2);); - - TensorCheckErr(A1, A2); -} - -TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign); } - -#ifdef PADDLE_WITH_GPU -TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign); } -#endif - -template -void sgdUpdateTensor( - Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) { - C = C * p2 - D * (B + A * p3) * p1; - A += C; -} - -void sgdUpdateLazyAssign(BaseMatrix& A, - BaseMatrix& B, - BaseMatrix& C, - BaseMatrix& D, - real p1, - real p2, - real p3) { - auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1); - auto expr2 = A.lazyAssign(A + C); - AssignEvaluate(expr1, expr2); -} - -template -void testSgdUpdate(int height, int width) { - Tensor A1(height, width); - Tensor A2(height, width); - Tensor A3(height, width); - A1.randomizeUniform(); - A2.copyFrom(A1); - A3.copyFrom(A1); - - Tensor B(height, width); - B.randomizeUniform(); - - Tensor C1(height, width); - Tensor C2(height, width); - Tensor C3(height, width); - C1.randomizeUniform(); - C2.copyFrom(C1); - C3.copyFrom(C1); - - Tensor D(height, width); - D.randomizeUniform(); - - real p1 = 0.2; - real p2 = 0.3; - real p3 = 0.5; - - /** - * c = p2 * c - p1 * (b + p3 * a); - * a = a + c; - */ - // BaseMatrix API - EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3);); - - // Tensor expression - EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3)); - - // lazyAssign - EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3)); - - TensorCheckErr(A1, A2); - TensorCheckErr(A1, A3); - TensorCheckErr(C1, C2); - TensorCheckErr(C1, C3); -} - -TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate); } - -#ifdef PADDLE_WITH_GPU -TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate); } -#endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp deleted file mode 100644 index e45ddd433faf18dbcd647b305db3a36d38c90825..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_matrixCompare.cpp +++ /dev/null @@ -1,1698 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA -/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when -/// only cpu version. - -#include -#include "TensorCheck.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" -#include "paddle/testing/TestUtil.h" -#include "paddle/utils/DynamicLoader.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT -using autotest::TensorCheckEqual; -using autotest::TensorCheckErr; - -void testMatrixMaxSequence(int batchSize, int inputDim) { - // forward - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - int newBatchSize = cpuSequence->getSize() - 1; - MatrixPtr cpuOutput = std::make_shared(newBatchSize, inputDim); - MatrixPtr gpuOutput = std::make_shared(newBatchSize, inputDim); - cpuOutput->zero(); - gpuOutput->zero(); - - IVectorPtr cpuIndex = nullptr; - IVectorPtr gpuIndex = nullptr; - IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false); - IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true); - cpuIndex->zeroMem(); - gpuIndex->zeroMem(); - - cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex); - gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex); - - TensorCheckEqual(*cpuOutput, *gpuOutput); - TensorCheckEqual(*cpuIndex, *gpuIndex); - - // backward - MatrixPtr cpuOutputGrad = std::make_shared(newBatchSize, inputDim); - MatrixPtr gpuOutputGrad = std::make_shared(newBatchSize, inputDim); - cpuOutputGrad->randomizeUniform(); - gpuOutputGrad->copyFrom(*cpuOutputGrad); - - MatrixPtr cpuInputGrad = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInputGrad = std::make_shared(batchSize, inputDim); - cpuInputGrad->randomizeUniform(); - gpuInputGrad->copyFrom(*cpuInputGrad); - - cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex); - gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex); - - TensorCheckEqual(*cpuInputGrad, *gpuInputGrad); -} - -TEST(Matrix, maxSequence) { - for (auto batchSize : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - for (auto inputDim : {1, 7, 131}) { // prime numbers close to 1, 8, 128 - VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; - testMatrixMaxSequence(batchSize, inputDim); - } - } -} - -void testMatrixGetSum(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - -#ifndef PADDLE_TYPE_DOUBLE - int x = log10(height * width); - real err = 1e-6 * pow(10, x); -#else - real err = 1e-8; -#endif - - real cpuSum = cpuInput->getSum(); - real gpuSum = gpuInput->getSum(); - - EXPECT_LE(fabs(cpuSum - 
gpuSum), err); -} - -void testMatrixGetMinMax(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - real cpuMin = cpuInput->getMin(); - real gpuMin = gpuInput->getMin(); - real cpuMax = cpuInput->getMax(); - real gpuMax = gpuInput->getMax(); - - EXPECT_EQ(cpuMin, gpuMin); - EXPECT_EQ(cpuMax, gpuMax); -} - -void testMatrixZeroAtOffset(int height, int width) { - MatrixPtr cpuA = std::make_shared(height, width); - MatrixPtr gpuA = std::make_shared(height, width); - MatrixPtr cpuTest = std::make_shared(height, width); - - cpuA->randomizeUniform(); - gpuA->copyFrom(*cpuA); - cpuTest->copyFrom(*cpuA); - - int columnOffset = rand() % width; // NOLINT we just use rand() for test. - int numColumns = rand() % (width - columnOffset); // NOLINT - - if (numColumns == 0) return; - - cpuA->zeroAtOffset(columnOffset, numColumns); - gpuA->zeroAtOffset(columnOffset, numColumns); - - /* cpuTest */ - real* a = cpuTest->getData() + columnOffset; - for (int64_t i = 0; i < height; ++i) { - for (int64_t j = 0; j < numColumns; ++j) { - a[i * width + j] = 0; - } - } - - TensorCheckEqual(*cpuA, *gpuA); - TensorCheckEqual(*cpuA, *cpuTest); -} - -void testMatrixDeepSwap(int height, int width) { - MatrixPtr cpuA = std::make_shared(height, width); - MatrixPtr cpuB = std::make_shared(height, width); - MatrixPtr cpuCopyA = std::make_shared(height, width); - MatrixPtr cpuCopyB = std::make_shared(height, width); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuCopyA->copyFrom(*cpuA); - cpuCopyB->copyFrom(*cpuB); - - // swap matrix cpuA and cpuB - cpuA->deepSwap(*cpuB); - - TensorCheckEqual(*cpuA, *cpuCopyB); - TensorCheckEqual(*cpuB, *cpuCopyA); -} - -void testMatrixTranspose(int height, int width) { - MatrixPtr cpu = std::make_shared(height, width); - MatrixPtr gpu = std::make_shared(height, width); - MatrixPtr cpuT = std::make_shared(width, height); - MatrixPtr gpuT = std::make_shared(width, height); - - cpu->randomizeUniform(); - gpu->copyFrom(*cpu); - cpu->transpose(cpuT, false); - gpu->transpose(gpuT, true); - - TensorCheckEqual(*cpuT, *gpuT); -} - -void testMatrixRotate(int height, int width) { - MatrixPtr cpu = std::make_shared(height, width); - MatrixPtr gpu = std::make_shared(height, width); - MatrixPtr cpuR = std::make_shared(width, height); - MatrixPtr gpuR = std::make_shared(width, height); - - cpu->randomizeUniform(); - gpu->copyFrom(*cpu); - - cpu->rotate(cpuR, false, true); - gpu->rotate(gpuR, true, true); - TensorCheckEqual(*cpuR, *gpuR); - - cpu->rotate(cpuR, true, false); - gpu->rotate(gpuR, false, false); - TensorCheckEqual(*cpuR, *gpuR); -} - -void testMatrixInverse(int height) { - MatrixPtr cpu = std::make_shared(height, height); - MatrixPtr gpu = std::make_shared(height, height); - MatrixPtr cpuI = std::make_shared(height, height); - MatrixPtr gpuI = std::make_shared(height, height); - - /* Make matrix well conditioned: cpu * cpuT + Identity */ - cpu->randomizeUniform(); - MatrixPtr cpuT = cpu->getTranspose(); - MatrixPtr outputCheck = std::make_shared(height, height); - outputCheck->mul(*cpu, *cpuT); - cpu->setDiag(1.0); - cpu->add(*outputCheck); - - gpu->copyFrom(*cpu); - cpu->inverse(cpuI, true); - gpu->inverse(gpuI, false); - - TensorCheckErr(*cpuI, *gpuI); - - outputCheck->mul(*cpu, *cpuI); - cpu->setDiag(1.0); - TensorCheckErr(*cpu, *outputCheck); -} - -TEST(Matrix, unary) { - for (auto height : {1, 3, 11, 73, 128, 200, 330}) { - for 
(auto width : {1, 3, 32, 100, 512, 1000, 3210}) { - VLOG(3) << " height=" << height << " width=" << width; - - testMatrixDeepSwap(height, width); - testMatrixZeroAtOffset(height, width); - testMatrixGetSum(height, width); - testMatrixTranspose(height, width); - testMatrixRotate(height, width); - } -#ifdef LAPACK_FOUND - // inverse matrix - testMatrixInverse(height); -#else - LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK" - << "support so we cannot test matrix inverse. To test " - << "matrix inverse, please install LAPACKE " - << "and MKL/Openblas, and re-build PaddlePaddle."; -#endif - } -} - -void testMatrixSoftmax(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr cpuOutput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - MatrixPtr gpuOutput = std::make_shared(height, width); - - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - cpuOutput->zero(); - gpuOutput->zero(); - cpuInput->softmax(*cpuOutput); - gpuInput->softmax(*gpuOutput); - - TensorCheckErr(*cpuOutput, *gpuOutput); -} - -void testSequenceSoftmax(int batchSize) { - // forward - int inputDim = 1; - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence); - gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence); - - TensorCheckErr(*cpuInput, *gpuInput); -} - -void testMatrixSoftmaxThreshold(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr cpuOutput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - MatrixPtr gpuOutput = std::make_shared(height, width); - - cpuInput->randomizeUniform(); - cpuInput->getData()[0] = 100.0; - gpuInput->copyFrom(*cpuInput); - cpuOutput->zero(); - gpuOutput->zero(); - cpuInput->softmax(*cpuOutput); - gpuInput->softmax(*gpuOutput); - - MatrixPtr outputCheck = std::make_shared(height, width); - outputCheck->copyFrom(*gpuOutput); - // check output zero - int cpuCount = 0; - int gpuCount = 0; - auto zeroNum = [](MatrixPtr out, int& count) { - for (size_t i = 0; i < out->getHeight(); i++) { - for (size_t j = 0; j < out->getWidth(); j++) { - if (out->getElement(i, j) == 0) count++; - } - } - }; - zeroNum(cpuOutput, cpuCount); - zeroNum(outputCheck, gpuCount); - EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0"; - EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0"; -} - -void testMatrixSoftmaxBp(int height, int width) { - MatrixPtr cpuInput = std::make_shared(height, width); - MatrixPtr cpuOutput = std::make_shared(height, width); - MatrixPtr gpuInput = std::make_shared(height, width); - MatrixPtr gpuOutput = std::make_shared(height, width); - - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - cpuOutput->randomizeUniform(); - gpuOutput->copyFrom(*cpuOutput); - gpuOutput->softmaxBackward(*gpuInput); - - MatrixPtr sftMaxSum = std::make_shared(height, 1); - MatrixPtr sftMaxDot = std::make_shared(height, width); - sftMaxDot->dotMul(*cpuOutput, *cpuInput); - sftMaxSum->colMerge(*sftMaxDot); - cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum); - - TensorCheckErr(*cpuOutput, *gpuOutput); -} - -TEST(Matrix, 
softmax) { - for (auto height : {1, 3, 131}) { // prime numbers close to 1, 4, 127 - for (auto width : {1, 17, 251}) { // prime numbers close to 1, 16, 256 - VLOG(3) << " height=" << height << " width=" << width; - - testMatrixSoftmax(height, width); - testMatrixSoftmaxBp(height, width); - testMatrixSoftmaxThreshold(height, width); - } - testSequenceSoftmax(height); - } -} - -void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) { - MatrixPtr cpuTable = std::make_shared(tableSize, inputDim); - MatrixPtr gpuTable = std::make_shared(tableSize, inputDim); - cpuTable->randomizeUniform(); - gpuTable->copyFrom(*cpuTable); - - IVectorPtr cpuIds; - IVectorPtr gpuIds; - cpuIds = VectorT::create(numSamples, false); - gpuIds = VectorT::create(numSamples, true); - cpuIds->rand(tableSize); - gpuIds->copyFrom(*cpuIds); - - MatrixPtr cpuOutput = std::make_shared(numSamples, inputDim); - MatrixPtr gpuOutput = std::make_shared(numSamples, inputDim); - cpuOutput->randomizeUniform(); - gpuOutput->copyFrom(*cpuOutput); - - cpuOutput->addToRows(*cpuTable, *cpuIds); - gpuOutput->addToRows(*gpuTable, *gpuIds); - - TensorCheckErr(*cpuTable, *gpuTable); -} - -TEST(Matrix, tableProjection) { - for (auto numSamples : {10, 100, 1000, 10000, 80000}) { - for (auto tableSize : {10, 100}) { - for (auto inputDim : {20, 50}) { - VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize - << " inputDim=" << inputDim; - testMatrixAddToRows(numSamples, tableSize, inputDim); - } - } - } -} - -void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { - int heightA = transa == false ? dimM : dimK; - int widthA = transa == false ? dimK : dimM; - int heightB = transb == false ? dimK : dimN; - int widthB = transb == false ? dimN : dimK; - int heightC = dimM; - int widthC = dimN; - - MatrixPtr cpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr cpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr cpuC = std::make_shared(heightC, widthC); - MatrixPtr gpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr gpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr gpuC = std::make_shared(heightC, widthC); - - real alpha = 1.5; - real beta = 2.0; - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuC->randomizeUniform(); - gpuA->copyFrom(*cpuA); - gpuB->copyFrom(*cpuB); - gpuC->copyFrom(*cpuC); - - cpuC->mul(*cpuA, *cpuB, alpha, beta); - gpuC->mul(*gpuA, *gpuB, alpha, beta); - - TensorCheckErr(*cpuC, *gpuC); -} - -void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { - int heightA = transa == false ? dimM : dimK; - int widthA = transa == false ? dimK : dimM; - int heightB = transb == false ? dimK : dimN; - int widthB = transb == false ? 
dimN : dimK; - int heightC = dimM; - int widthC = dimN; - - MatrixPtr cpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr cpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr cpuC = std::make_shared(heightC, widthC); - MatrixPtr gpuA = std::make_shared(heightA, widthA, transa); - MatrixPtr gpuB = std::make_shared(heightB, widthB, transb); - MatrixPtr gpuC = std::make_shared(heightC, widthC); - - real alpha = 1.5; - real beta = 2.0; - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuC->randomizeUniform(); - gpuA->copyFrom(*cpuA); - gpuB->copyFrom(*cpuB); - gpuC->copyFrom(*cpuC); - - auto subSize = [](int& start, int& end, int dim) { - if (dim == 1) { - start = 0; - end = dim; - } else { - int subDim = rand() % (dim - 1) + 1; // NOLINT - start = rand() % (dim - subDim); // NOLINT - end = start + subDim; - } - }; - - auto subMatrix = [](MatrixPtr& sub, - MatrixPtr matrix, - size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol) { - if (!matrix->isTransposed()) { - sub = matrix->subMatrix(startRow, endRow, startCol, endCol); - } else { - sub = matrix->subMatrix(startCol, endCol, startRow, endRow); - } - }; - - int startM, endM; - int startN, endN; - int startK, endK; - subSize(startM, endM, dimM); - subSize(startN, endN, dimN); - subSize(startK, endK, dimK); - - MatrixPtr subCpuA; - MatrixPtr subCpuB; - MatrixPtr subGpuA; - MatrixPtr subGpuB; - subMatrix(subCpuA, cpuA, startM, endM, startK, endK); - subMatrix(subGpuA, gpuA, startM, endM, startK, endK); - subMatrix(subCpuB, cpuB, startK, endK, startN, endN); - subMatrix(subGpuB, gpuB, startK, endK, startN, endN); - MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN); - MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN); - - subCpuC->mul(*subCpuA, *subCpuB, alpha, beta); - subGpuC->mul(*subGpuA, *subGpuB, alpha, beta); - - TensorCheckErr(*cpuC, *gpuC); -} - -TEST(Matrix, mul) { - for (auto transa : {false, true}) { - for (auto transb : {false, true}) { - for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) { - for (auto dimN : {1, 5, 37, 256, 1024}) { - for (auto dimK : {8, 45, 346, 784, 1025}) { - if (true == transa && true == transb) { - continue; - } - VLOG(3) << setiosflags(ios::left) << setfill(' ') - << " transa=" << transa << " transb=" << transb - << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) - << dimN << " dimK=" << setw(5) << dimK; - - testMatrixMul(transa, transb, dimM, dimN, dimK); - testSubMatrixMul(transa, transb, dimM, dimN, dimK); - } - } - } - } - } -} - -void testVectorRowFunc(int size) { - CpuVectorPtr cpu = std::make_shared>(size); - GpuVectorPtr gpu = std::make_shared>(size); - - cpu->rand(); - gpu->copyFrom(*cpu); - - EXPECT_EQ(cpu->getMax(), gpu->getMax()); - EXPECT_EQ(cpu->getMin(), gpu->getMin()); - EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax()); -} - -TEST(Vector, rowFunc) { - for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - VLOG(3) << " size=" << size; - testVectorRowFunc(size); - } -} - -template -void testVectorReset(int size) { - std::shared_ptr> cpu = std::make_shared>(size); - std::shared_ptr> gpu = std::make_shared>(size); - - T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100)); - cpu->reset(value); - gpu->reset(value); - - TensorCheckEqual(*cpu, *gpu); -} - -template -void testVecortSelectFrom(int size) { - std::shared_ptr> cpuDst = std::make_shared>(size); - std::shared_ptr> gpuDst = std::make_shared>(size); - std::shared_ptr> cpuSrc = - std::make_shared>(size * 2); - std::shared_ptr> gpuSrc = - 
std::make_shared>(size * 2); - CpuIVectorPtr cpuIds = std::make_shared>(size); - GpuIVectorPtr gpuIds = std::make_shared>(size); - - if (std::is_same::value) { - cpuSrc->rand(); - } else { - cpuSrc->rand(100000); - } - gpuSrc->copyFrom(*cpuSrc); - cpuIds->rand(size); - gpuIds->copyFrom(*cpuIds); - - cpuDst->selectFrom(*cpuSrc, *cpuIds); - gpuDst->selectFrom(*gpuSrc, *gpuIds); - - TensorCheckEqual(*cpuDst, *gpuDst); -} - -template -void testVecotrZeroMem(int size) { - std::shared_ptr> cpu = std::make_shared>(size); - std::shared_ptr> gpu = std::make_shared>(size); - - cpu->zeroMem(); - gpu->zeroMem(); - - TensorCheckEqual(*cpu, *gpu); -} - -template -void testVectorIsEqual(int size) { - std::shared_ptr> cpuA = std::make_shared>(size); - std::shared_ptr> cpuB = std::make_shared>(size); - std::shared_ptr> gpuA = std::make_shared>(size); - std::shared_ptr> gpuB = std::make_shared>(size); - - if (std::is_same::value) { - cpuB->rand(); - } else { - cpuB->rand(100000); - } - gpuB->copyFrom(*cpuB); - - T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100)); - cpuA->isEqualTo(*cpuB, value); - gpuA->isEqualTo(*gpuB, value); - - TensorCheckEqual(*cpuA, *gpuA); -} - -TEST(Vector, Equal) { - for (auto size : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - VLOG(3) << " size=" << size; - testVectorReset(size); - testVectorReset(size); - testVecortSelectFrom(size); - testVecortSelectFrom(size); - testVecotrZeroMem(size); - testVecotrZeroMem(size); - testVectorIsEqual(size); - testVectorIsEqual(size); - } -} - -void testMatrixTopK(int samples, int dim, int beamSize) { - MatrixPtr cpuSrc = std::make_shared(samples, dim); - MatrixPtr gpuSrc = std::make_shared(samples, dim); - MatrixPtr cpuVal = std::make_shared(samples, beamSize); - MatrixPtr gpuVal = std::make_shared(samples, beamSize); - IVectorPtr cpuIds = std::make_shared(samples * beamSize); - IVectorPtr gpuIds = std::make_shared(samples * beamSize); - - cpuSrc->randomizeUniform(); - gpuSrc->copyFrom(*cpuSrc); - - cpuSrc->rowMax(*cpuIds, *cpuVal); - gpuSrc->rowMax(*gpuIds, *gpuVal); - - TensorCheckEqual(*cpuVal, *gpuVal); -} - -TEST(Matrix, topK) { - for (auto samples : {1, 17, 131}) { // prime numbers close to 1, 16, 127 - for (auto dim : {1, 3, 997}) { // prime numbers close to 1, 4, 1024 - for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { - if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples << " beamSize=" << beamSize - << " dim=" << dim; - testMatrixTopK(samples, dim, beamSize); - } - } - } -} - -void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) { - int nnz = samples * dim * ratio; - if (nnz < 1) nnz = 1; // Because sparseRand in MathUtil.cpp requires this. 
- MatrixPtr cpuSrc = std::make_shared(samples, dim, nnz); - MatrixPtr gpuSrc = std::make_shared(samples, dim, nnz); - MatrixPtr cpuVal = std::make_shared(samples, beamSize); - MatrixPtr gpuVal = std::make_shared(samples, beamSize); - IVectorPtr cpuIds = std::make_shared(samples * beamSize); - IVectorPtr gpuIds = std::make_shared(samples * beamSize); - - cpuSrc->randomizeUniform(); - gpuSrc->copyFrom(*cpuSrc); - cpuVal->zero(); - cpuIds->zero(); - gpuVal->zero(); - gpuIds->zero(); - - cpuSrc->rowMax(*cpuIds, *cpuVal); - gpuSrc->rowMax(*gpuIds, *gpuVal); - - TensorCheckEqual(*cpuVal, *gpuVal); - - IVectorPtr outCheckIds = std::make_shared(samples * beamSize); - outCheckIds->copyFrom(*gpuIds); - - const int* data1 = cpuIds->getData(); - const int* data2 = outCheckIds->getData(); - size_t size = cpuIds->getSize(); - for (size_t i = 0; i < size; i++) { - if (data1[i] == -1 && data1[i] != data2[i]) { - EXPECT_EQ(data1[i], data2[i]); - } - } -} - -TEST(SMatrix, topK) { - for (auto samples : {1, 3, 61}) { - for (auto dim : {1, 3, 61}) { - for (auto beamSize : {1, 3, 61}) { - for (auto ratio : {0.01, 0.001}) { - if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples << " beamSize=" << beamSize - << " dim=" << dim << " ratio=" << ratio; - testSMatrixTopK(samples, dim, beamSize, ratio); - } - } - } - } -} - -void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) { - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - - IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - int newBatchSize = cpuSequence->getSize() - 1; - MatrixPtr cpuOutput = std::make_shared(newBatchSize, inputDim); - MatrixPtr gpuOutput = std::make_shared(newBatchSize, inputDim); - cpuOutput->zero(); - gpuOutput->zero(); - - cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode); - gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode); - - TensorCheckErr(*cpuOutput, *gpuOutput); - - MatrixPtr cpuInGrad = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInGrad = std::make_shared(batchSize, inputDim); - cpuInGrad->randomizeUniform(); - gpuInGrad->copyFrom(*cpuInGrad); - - cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode); - gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode); - - TensorCheckErr(*cpuInGrad, *gpuInGrad); -} - -TEST(Matrix, sequenceAvg) { - for (auto batchSize : {10, 128, 6000}) { - for (auto inputDim : {32, 100, 512}) { - for (auto mode : {0, 1, 2}) { - VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim - << " mode=" << mode; - testMatrixSequenceAvg(batchSize, inputDim, mode); - } - } - } -} - -void testParamReluBackwardDiff(int height, - int width, - int w_height, - int w_width) { - MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); - MatrixPtr input = CpuMatrix::create(height, width, false, false); - MatrixPtr diff = CpuMatrix::create(height, width, false, false); - MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); - - oGrad->randomizeUniform(); - input->randomizeUniform(); - w->randomizeUniform(); - diff->randomizeUniform(); - input->add(-0.5); - - MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true); - MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true); - MatrixPtr diffGpu = CpuMatrix::create(height, 
width, false, true); - MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true); - - oGradGpu->copyFrom(*oGrad); - inputGpu->copyFrom(*input); - wGpu->copyFrom(*w); - diffGpu->copyFrom(*diff); - - diff->paramReluBackwardDiff(*oGrad, *input, *w); - diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu); - - TensorCheckErr(*diff, *diffGpu); -} - -TEST(Matrix, paramReluBackwardDiff) { - for (auto height : {10, 40, 100}) { - for (auto width : {10, 40, 100}) { - for (auto w_height : {1, 2}) { - for (auto w_width : {1, 2}) { - if (width % (w_height * w_width)) continue; - testParamReluBackwardDiff(height, width, w_height, w_width); - } - } - } - } -} - -void testClassificationError(int numSamples, int dim, int topkSize) { - MatrixPtr cpuError = std::make_shared(numSamples, 1); - MatrixPtr gpuError = std::make_shared(numSamples, 1); - MatrixPtr cpuOutput = std::make_shared(numSamples, dim); - MatrixPtr gpuOutput = std::make_shared(numSamples, dim); - IVectorPtr cpuLabel = std::make_shared(numSamples); - IVectorPtr gpuLabel = std::make_shared(numSamples); - - cpuOutput->randomizeUniform(); - cpuLabel->rand(dim); - gpuOutput->copyFrom(*cpuOutput); - gpuLabel->copyFrom(*cpuLabel); - - cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize); - gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize); - - TensorCheckEqual(*cpuError, *gpuError); -} - -TEST(Matrix, classificationError) { - for (auto numSamples : {1, 3, 31}) { - for (auto dim : {1, 3, 31}) { - for (auto topkSize : {1, 3, (int)rand() % dim + 1}) { - if (topkSize > dim) continue; - VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize - << " dim= " << dim; - testClassificationError(numSamples, dim, topkSize); - } - } - } -} - -void testMaxPoolFwdBwd(int numSamples, - int channels, - int imgSizeH, - int imgSizeW, - int ksizeH, - int ksizeW, - int strideH, - int strideW, - int padH, - int padW) { - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = imgSizeH * imgSizeW * channels; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->maxPoolForward(*input, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - targetGpu->maxPoolForward(*inputGpu, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); - targetCheck->copyFrom(*targetGpu); - checkMatrixEqual(target, targetCheck); - - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->maxPoolBackward(*input, - imgSizeH, - imgSizeW, 
- *targetGrad, - *target, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - inputGpuGrad->maxPoolBackward(*inputGpu, - imgSizeH, - imgSizeW, - *targetGpuGrad, - *targetGpu, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - MatrixPtr targetBwdCheck = - CpuMatrix::create(numSamples, inWidth, false, false); - targetBwdCheck->copyFrom(*inputGpuGrad); - checkMatrixEqual(inputGrad, targetBwdCheck); -} - -void testAvgPoolFwdBwd(int numSamples, - int channels, - int imgSizeH, - int imgSizeW, - int ksizeH, - int ksizeW, - int strideH, - int strideW, - int padH, - int padW) { - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = imgSizeH * imgSizeW * channels; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->avgPoolForward(*input, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - targetGpu->avgPoolForward(*inputGpu, - imgSizeH, - imgSizeW, - channels, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - padH, - padW); - - TensorCheckErr(*target, *targetGpu); - - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->avgPoolBackward(*targetGrad, - imgSizeH, - imgSizeW, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - inputGpuGrad->avgPoolBackward(*targetGpuGrad, - imgSizeH, - imgSizeW, - ksizeW, - ksizeH, - strideH, - strideW, - outH, - outW, - 1.0, - 1.0, - padH, - padW); - - TensorCheckErr(*inputGrad, *inputGpuGrad); -} - -// TODO(yi): I noticed many such blindly combinatorial tests in this -// file. They are no help to locate defects at all. 
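The pooling helpers above size their outputs through the outputSize() call. As a rough, self-contained sketch of that arithmetic (the rounding convention attached to the caffeMode flag is an assumption here, not something this patch states):

    // Hypothetical standalone version of the output-size computation the pooling
    // tests rely on: with caffeMode the last partial window is truncated (floor
    // rounding); otherwise the extra (stride - 1) keeps it (ceil rounding).
    static int poolOutputSize(
        int imageSize, int kernelSize, int padding, int stride, bool caffeMode) {
      int span = imageSize - kernelSize + 2 * padding;
      return caffeMode ? span / stride + 1 : (span + stride - 1) / stride + 1;
    }

For example, imageSize = 13, kernelSize = 3, padding = 0, stride = 3 gives 5 under ceil rounding and 4 under floor rounding.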
-TEST(Matrix, PoolFwdBwd) { - for (auto numSamples : {1, 3}) { - for (auto channels : {1, 3}) { - for (auto imgSizeH : {13, 17}) { - for (auto imgSizeW : {17, 19}) { - for (auto sizeX : {2, 3}) { - for (auto sizeY : {2, 3}) { - for (auto sH : {1, 2}) { - for (auto sW : {1, 2}) { - for (auto pH : {0, (sizeY - 1) / 2}) { - for (auto pW : {0, (sizeX - 1) / 2}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX - << " sizeY=" << sizeY << " strideH=" << sH - << " strideW=" << sW << " padingH=" << pH - << " padingW=" << pW; - testMaxPoolFwdBwd(numSamples, - channels, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sH, - sW, - pH, - pW); - testAvgPoolFwdBwd(numSamples, - channels, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sH, - sW, - pH, - pW); - } - } - } - } - } - } - } - } - } - } -} - -void testMaxOutFwdBwd( - int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) { - int inWidth = imgSizeH * imgSizeW * channels; - int outChannels = channels / groups; - int outWidth = imgSizeH * imgSizeW * outChannels; - - // forward - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - IVectorPtr id = CpuIVector::create(numSamples * outWidth, false); - IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true); - - input->randomizeUniform(); - inputGpu->copyFrom(*input); - - target->maxoutForward(*input, *id, outChannels, groups); - targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups); - - TensorCheckErr(*target, *targetGpu); - TensorCheckEqual(*id, *idGpu); - - // backward - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups); - inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups); - - TensorCheckErr(*inputGrad, *inputGpuGrad); -} - -TEST(Matrix, MaxOutFwdBwd) { - for (auto numSamples : {5, 10}) { - for (auto channels : {8, 16}) { - for (auto imgSizeH : {14, 28}) { - for (auto imgSizeW : {16, 30}) { - for (auto groups : {2, 4}) { - VLOG(3) << " numSamples=" << numSamples << " channels=" << channels - << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW - << " groups=" << groups; - testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); - } - } - } - } - } -} - -TEST(CpuMatrix, copyFrom) { - const size_t height = 31; - const size_t width = 53; - CpuMatrix cpu(height, width); - GpuMatrix gpu(height, width); - CpuMatrix copy(height, width); - - cpu.randomizeUniform(); - gpu.copyFrom(cpu); - copy.copyFrom(gpu, HPPL_STREAM_DEFAULT); - - TensorCheckEqual(cpu, copy); -} - -void testBatch2seqPadding(int batchSize, int inputDim) { - MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); - MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); - cpuInput->randomizeUniform(); - gpuInput->copyFrom(*cpuInput); - 
- IVectorPtr cpuSequence; - generateSequenceStartPositions(batchSize, cpuSequence); - for (int i = 0; i < int(cpuSequence->getSize()); ++i) { - (cpuSequence->getData())[i] += 1; // so no way that maxSeqLen is 0; - } - - IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true); - gpuSequence->copyFrom(*cpuSequence); - - size_t numSeq = cpuSequence->getSize() - 1; - size_t maxSeqLen = *std::max_element(cpuSequence->getData(), - cpuSequence->getData() + numSeq); - - printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen); - MatrixPtr cBatch = std::make_shared(numSeq * maxSeqLen, inputDim); - MatrixPtr gBatch = std::make_shared(numSeq * maxSeqLen, inputDim); - MatrixPtr cCheck = std::make_shared(numSeq * maxSeqLen, inputDim); - - // hl_sequence2batch_copy_padding(gBatch->getData(), - // gpuInput->getData(), - // cpuSequence->getData(), - // inputDim, - // maxSeqLen, - // numSeq, - // false, - // true); - // cCheck->copyFrom(*gBatch); - - // int* seqStart = cpuSequence->getData(); - // float* batchData = cBatch->getData(); - // float* seqData = cpuInput->getData(); - // for (size_t i = 0; i < maxSeqLen; i++) { - // for (size_t j = 0; j < numSeq; j++) { - // size_t sequenceStart = seqStart[j]; - // size_t sequenceLength = seqStart[j + 1] - seqStart[j]; - // if (i < sequenceLength) { - // memcpy(batchData + (i * numSeq + j) * inputDim, - // seqData + (sequenceStart + i) * inputDim, - // inputDim * sizeof(real)); - // } else { - // memset(batchData + (i * numSeq + j) * inputDim, - // 0, - // inputDim * sizeof(real)); - // } - // } - // } - - // TensorCheckErr(*cBatch, *cCheck); -} - -TEST(Matrix, warpCTC) { - for (auto batchSize : {1, 3, 17}) { - for (auto inputDim : {1, 3, 31}) { - VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim; - testBatch2seqPadding(batchSize, inputDim); - } - } -} - -void testMaxPool3DFwdBwd(int numSamples, - int channels, - int imgSizeD, - int imgSizeH, - int imgSizeW, - int ksizeD, - int ksizeH, - int ksizeW, - int strideD, - int strideH, - int strideW, - int padD, - int padH, - int padW) { - int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = channels * imgSizeD * imgSizeH * imgSizeW; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outD * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->maxPool3DForward(*input, - *maxIdx, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - targetGpu->maxPool3DForward(*inputGpu, - *maxIdxGpu, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); - targetCheck->copyFrom(*targetGpu); - checkMatrixEqual(target, targetCheck); - - 
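  // Backward cross-check: both the CPU and GPU input gradients are seeded with
  // identical random values, the 3-D max-pooling gradient is accumulated on each
  // device with both scale factors set to 1.0, and the results are compared
  // element-wise through checkMatrixEqual().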
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->maxPool3DBackward(*targetGrad, - *maxIdx, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - inputGpuGrad->maxPool3DBackward(*targetGpuGrad, - *maxIdxGpu, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - MatrixPtr targetBwdCheck = - CpuMatrix::create(numSamples, inWidth, false, false); - targetBwdCheck->copyFrom(*inputGpuGrad); - checkMatrixEqual(inputGrad, targetBwdCheck); -} - -void testAvgPool3DFwdBwd(int numSamples, - int channels, - int imgSizeD, - int imgSizeH, - int imgSizeW, - int ksizeD, - int ksizeH, - int ksizeW, - int strideD, - int strideH, - int strideW, - int padD, - int padH, - int padW) { - int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); - int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); - int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); - - int inWidth = imgSizeD * imgSizeH * imgSizeW * channels; - MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - - int outWidth = channels * outD * outH * outW; - MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); - - input->randomizeUniform(); - target->randomizeUniform(); - inputGpu->copyFrom(*input); - targetGpu->copyFrom(*target); - - target->avgPool3DForward(*input, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - - targetGpu->avgPool3DForward(*inputGpu, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW); - - TensorCheckErr(*target, *targetGpu); - - MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); - MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); - MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = - GpuMatrix::create(numSamples, outWidth, false, true); - - inputGrad->randomizeUniform(); - targetGrad->randomizeUniform(); - inputGpuGrad->copyFrom(*inputGrad); - targetGpuGrad->copyFrom(*targetGrad); - - inputGrad->avgPool3DBackward(*targetGrad, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - - inputGpuGrad->avgPool3DBackward(*targetGpuGrad, - imgSizeD, - imgSizeH, - imgSizeW, - outD, - outH, - outW, - ksizeD, - ksizeH, - ksizeW, - strideD, - strideH, - strideW, - padD, - padH, - padW, - 1.0, - 1.0); - TensorCheckErr(*inputGrad, *inputGpuGrad); -} - -// TODO(yi): I noticed many such blindly combinatorial tests in this -// file. 
They are no help to locate defects at all. -TEST(Matrix, Pool3DFwdBwd) { - for (auto numSamples : {1, 3}) { - for (auto channels : {3}) { - for (auto imgSizeD : {9, 16}) { - for (auto imgSizeH : {9, 32}) { - for (auto imgSizeW : {9, 32}) { - for (auto sizeX : {3}) { - for (auto sizeY : {3}) { - for (auto sizeZ : {3}) { - for (auto sD : {2}) { - for (auto sH : {2}) { - for (auto sW : {2}) { - for (auto pD : {0, (sizeZ - 1) / 2}) { - for (auto pH : {0, (sizeY - 1) / 2}) { - for (auto pW : {0, (sizeX - 1) / 2}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeD=" << imgSizeD - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW - << " sizeX=" << sizeX - << " sizeY=" << sizeY - << " sizeZ=" << sizeZ << " strideD=" << sD - << " strideH=" << sH << " strideW=" << sW - << " padingD=" << pD << " padingH=" << pH - << " padingW=" << pW; - - testMaxPool3DFwdBwd(numSamples, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sizeZ, - sD, - sH, - sW, - pD, - pH, - pW); - testAvgPool3DFwdBwd(numSamples, - channels, - imgSizeD, - imgSizeH, - imgSizeW, - sizeX, - sizeY, - sizeZ, - sD, - sH, - sW, - pD, - pH, - pW); - } - } - } - } - } - } - } - } - } - } - } - } - } - } - - // for (auto numSamples : {1, 3}) { - // for (auto channels : {1, 3}) { - // for (auto imgSizeD : {9,16}) { - // for (auto imgSizeH : {9, 32}) { - // for (auto imgSizeW : {9, 32}) { - // for (auto sizeX : {2, 3}) { - // for (auto sizeY : {2, 3}) { - // for (auto sizeZ : {2,3}){ - // for (auto sD : {1, 2}) { - // for (auto sH : {1, 2}) { - // for (auto sW : {1, 2}) { - // for (auto pD : {0, (sizeZ - 1) / 2}){ - // for (auto pH : {0, (sizeY - 1) / 2}) { - // for (auto pW : {0, (sizeX - 1) / 2}) { - // VLOG(3) << " numSamples=" << numSamples - // << " channels=" << channels - // << " imgSizeD=" << imgSizeD - // << " imgSizeH=" << imgSizeH - // << " imgSizeW=" << imgSizeW - // << " sizeX=" << sizeX - // << " sizeY=" << sizeY - // << " sizeZ=" << sizeZ - // << " strideD=" << sD - // << " strideH=" << sH - // << " strideW=" << sW - // << " padingD=" << pD - // << " padingH=" << pH - // << " padingW=" << pW; - // - // testMaxPool3DFwdBwd(numSamples, - // channels, - // imgSizeD, - // imgSizeH, - // imgSizeW, - // sizeX, - // sizeY, - // sizeZ, - // sD, - // sH, - // sW, - // pD, - // pH, - // pW); - // testAvgPool3DFwdBwd(numSamples, - // channels, - // imgSizeD, - // imgSizeH, - // imgSizeW, - // sizeX, - // sizeY, - // sizeZ, - // sD, - // sH, - // sW, - // pD, - // pH, - // pW); - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } -} - -void testMatrixCol2Vol(int depth, int height, int width) { - int channel = 3; - int filterX = 3, filterY = 4, filterZ = 5; - int strideX = 2, strideY = 2, strideZ = 2; - int padX = 1, padY = 1, padZ = 1; - - MatrixPtr cpuImage = - std::make_shared(channel, depth * height * width); - MatrixPtr gpuImage = - std::make_shared(channel, depth * height * width); - cpuImage->randomizeUniform(); - gpuImage->copyFrom(*cpuImage); - - int outD = outputSize(depth, filterZ, padZ, strideZ, true); - int outH = outputSize(height, filterY, padY, strideY, true); - int outW = outputSize(width, filterX, padX, strideX, true); - - int colBufHeight = channel * filterZ * filterY * filterX; - int colBufWidth = outD * outH * outW; - MatrixPtr cpuColBuf = std::make_shared(colBufHeight, colBufWidth); - MatrixPtr gpuColBuf = std::make_shared(colBufHeight, colBufWidth); - cpuColBuf->vol2Col(cpuImage->getData(), - channel, - depth, - 
height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - gpuColBuf->vol2Col(gpuImage->getData(), - channel, - depth, - height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - TensorCheckEqual(*cpuColBuf, *gpuColBuf); - - cpuColBuf->randomizeUniform(); - gpuColBuf->copyFrom(*cpuColBuf); - cpuColBuf->col2Vol(cpuImage->getData(), - channel, - depth, - height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - gpuColBuf->col2Vol(gpuImage->getData(), - channel, - depth, - height, - width, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - TensorCheckErr(*cpuImage, *gpuImage); -} - -TEST(Matrix, col2Vol) { - for (auto depth : {9, 16, 64}) { - for (auto height : {9, 11, 128}) { - for (auto width : {9, 32, 128}) { - VLOG(3) << "depth=" << depth << " height=" << height - << " width=" << width; - testMatrixCol2Vol(depth, height, width); - } - } - } -} - -#endif diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h deleted file mode 100644 index 86297547dcd83ca87d1c87a8489f7af2f3e9f492..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_matrixUtil.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/math/SparseMatrix.h" - -namespace paddle { - -void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - for (size_t r = 0; r < a->getHeight(); ++r) { - for (size_t c = 0; c < a->getWidth(); ++c) { - ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c)); - } - } -} - -void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) { - ASSERT_EQ(a.getWidth(), b.getWidth()); - ASSERT_EQ(a.getHeight(), b.getHeight()); - ASSERT_EQ(a.isTransposed(), b.isTransposed()); - ASSERT_EQ(a.getFormat(), b.getFormat()); - ASSERT_EQ(a.getElementCnt(), b.getElementCnt()); - for (size_t r = 0; r < a.getElementCnt(); ++r) { - ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]); - } -} - -void checkSMatrixEqual(const CpuSparseMatrixPtr& a, - const CpuSparseMatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - ASSERT_EQ(a->getFormat(), b->getFormat()); - ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); - } -} - -void checkSMatrixEqual2(const CpuSparseMatrixPtr& a, - const CpuSparseMatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - ASSERT_EQ(a->getFormat(), b->getFormat()); - ASSERT_EQ(a->getValueType(), b->getValueType()); - ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); - if (a->getFormat() == SPARSE_CSR) { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); - } - } - for (size_t r = 0; r <= a->getHeight(); r++) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - } - } else { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]); - } - } - for (size_t r = 0; r <= a->getWidth(); r++) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - } - } -} - -void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) { - ASSERT_EQ(a.getWidth(), b.getWidth()); - ASSERT_EQ(a.getHeight(), b.getHeight()); - ASSERT_EQ(a.isTransposed(), b.isTransposed()); - - if (a.getFormat() == SPARSE_CSC) { - int* rows = a.getRows(); - for (size_t i = 0; i < a.getWidth(); i++) { - for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) { - if (a.getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i)); - } else { - ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i)); - } - } - } - } else { - int* cols = a.getCols(); - for (size_t i = 0; i < a.getHeight(); i++) { - for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) { - if (a.getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j])); - } else { - ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j])); - } - } - } - } -} - -void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, - const CpuMatrixPtr& b) { - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - - if (a->getFormat() == SPARSE_CSC) 
{ - int* rows = a->getRows(); - for (size_t i = 0; i < a->getWidth(); i++) { - for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) { - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i)); - } else { - ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i)); - } - } - } - } else { - int* cols = a->getCols(); - for (size_t i = 0; i < a->getHeight(); i++) { - for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) { - if (a->getValueType() == FLOAT_VALUE) { - ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j])); - } else { - ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j])); - } - } - } - } -} - -void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - ASSERT_EQ(a->getWidth(), b->getWidth()); - ASSERT_EQ(a->getHeight(), b->getHeight()); - ASSERT_EQ(a->isTransposed(), b->isTransposed()); - ASSERT_EQ(a->getFormat(), b->getFormat()); - ASSERT_EQ(a->getValueType(), b->getValueType()); - ASSERT_EQ(a->getElementCnt(), b->getElementCnt()); - int count = 0; - if (a->getFormat() == SPARSE_CSR) { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - if (a->getValueType() == FLOAT_VALUE) { - real aVal = a->getValue()[r]; - real bVal = b->getValue()[r]; - if (std::abs(aVal - bVal) > err) { - if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { - LOG(INFO) << "a=" << aVal << "\t" - << "b=" << bVal; - count++; - } - } - } - } - for (size_t r = 0; r <= a->getHeight(); r++) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - } - } else { - for (size_t r = 0; r < a->getElementCnt(); ++r) { - ASSERT_EQ(a->getRows()[r], b->getRows()[r]); - if (a->getValueType() == FLOAT_VALUE) { - real aVal = a->getValue()[r]; - real bVal = b->getValue()[r]; - if (std::abs(aVal - bVal) > err) { - if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { - count++; - } - } - } - } - for (size_t r = 0; r <= a->getWidth(); r++) { - ASSERT_EQ(a->getCols()[r], b->getCols()[r]); - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) { - CHECK(matrix1.getHeight() == matrix2.getHeight()); - CHECK(matrix1.getWidth() == matrix2.getWidth()); -#ifndef PADDLE_TYPE_DOUBLE - real err = 1e-3; -#else - real err = 1e-10; -#endif - - int height = matrix1.getHeight(); - int width = matrix1.getWidth(); - const real* data1 = matrix1.getData(); - const real* data2 = matrix2.getData(); - int count = 0; - for (int i = 0; i < height; i++) { - for (int j = 0; j < width; j++) { - real a = data1[i * width + j]; - real b = data2[i * width + j]; - if (std::abs(a - b) > err) { - if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) { - count++; - } - } - } - } - EXPECT_EQ(count, 0) << "There are " << count << " different element."; -} - -void checkDataEqual(const real* a, const real* b, size_t size) { - for (size_t i = 0; i < size; ++i) { - ASSERT_FLOAT_EQ(a[i], b[i]); - } -} - -} // namespace paddle diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp deleted file mode 100644 index 12647d21a29936e169b893ec8119b64fec9af580..0000000000000000000000000000000000000000 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_CUDA -/// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result, -// so disable when -/// only cpu version. - -#include -#include "paddle/math/Matrix.h" -#include "paddle/utils/Util.h" -#include "test_matrixUtil.h" - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; } - -void testSpMatrixAddBias(int M, int N, real rate, real scale) { - int nnz = M * N * rate; - - MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz)); - MatrixPtr cpuB = std::make_shared(1, N); - - MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); - MatrixPtr gpuB = std::make_shared(1, N); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_1); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - hl_stream_synchronize(stream); - - cpuA->addBias(*cpuB, scale); - gpuA->addBias(*gpuB, scale); - - MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); - outputCheck->copyFrom(*gpuA, stream); - hl_stream_synchronize(stream); - checkSMatrixEqual2(std::dynamic_pointer_cast(cpuA), - std::dynamic_pointer_cast(outputCheck)); -} - -void testSpMatrixAddDense(int M, int N, real rate) { // add3 - int nnz = M * N * rate; - - MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz)); - MatrixPtr cpuB = std::make_shared(M, N); - - MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); - MatrixPtr gpuB = std::make_shared(M, N); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - hl_stream_synchronize(stream); - - cpuA->add3(cpuB); - gpuA->add3(gpuB); - - MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); - outputCheck->copyFrom(*gpuA, stream); - hl_stream_synchronize(stream); - checkSMatrixEqual2(std::dynamic_pointer_cast(cpuA), - std::dynamic_pointer_cast(outputCheck)); -} - -void testSpMatrixMul(int M, int N, int K, real rate) { - int nnz = M * N * rate; - - MatrixPtr cpuA = std::make_shared(M, K); - MatrixPtr cpuB = std::make_shared(N, K); - MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz)); - - MatrixPtr gpuA = std::make_shared(M, K); - MatrixPtr gpuB = std::make_shared(N, K); - MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz)); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - cpuC->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - gpuC->copyFrom(*cpuC, stream); - hl_stream_synchronize(stream); - - cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1); - gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1); - - MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz)); - outputCheck->copyFrom(*gpuC, stream); - hl_stream_synchronize(stream); - checkSMatrixErr(std::dynamic_pointer_cast(cpuC), - std::dynamic_pointer_cast(outputCheck)); -} - -void testSpMatrixCollectBias(int M, int N, real rate) { - int nnz = M * N * rate; - LOG(INFO) << "nnz=" << nnz; - - MatrixPtr cpuA(new 
CpuSparseMatrix(M, N, nnz)); - MatrixPtr cpuB = std::make_shared(1, N); - - MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz)); - MatrixPtr gpuB = std::make_shared(1, N); - - cpuA->randomizeUniform(); - cpuB->randomizeUniform(); - - hl_stream_t stream(HPPL_STREAM_3); - gpuA->copyFrom(*cpuA, stream); - gpuB->copyFrom(*cpuB, stream); - hl_stream_synchronize(stream); - - cpuB->collectBias(*cpuA, 1); - gpuB->collectBias(*gpuA, 1); - - MatrixPtr outputCheck = std::make_shared(1, N); - outputCheck->copyFrom(*gpuB, stream); - hl_stream_synchronize(stream); - checkMatrixErr(*cpuB, *outputCheck); -} - -TEST(SMatrix, sMatrixOp) { - for (auto height : {1, 11, 200}) { - for (auto width : {200, 2048, 20480}) { - VLOG(3) << " height=" << height << " width=" << width; - for (auto rate : {0.02, 0.1}) { - testSpMatrixAddDense(height, width, rate); - testSpMatrixAddBias(height, width, rate, 1.0); - } - } - } -} - -TEST(SMatrix, sMatrixMul) { - for (auto M : {1, 40, 128, 200}) { - for (auto N : {100, 2000, 20480}) { - for (auto K : {100, 512, 1024}) { - VLOG(3) << " M=" << M << " N=" << N << " K=" << K; - testSpMatrixMul(M, N, K, 0.05); - } - } - } -} - -TEST(SMatrix, sMatrixCollectBias) { - for (auto height : {1, 128, 200}) { - for (auto width : {100, 2048, 20480}) { - VLOG(3) << " height=" << height << " width=" << width; - testSpMatrixCollectBias(height, width, 0.1); - } - } -} - -#endif diff --git a/paddle/optimizer/serialization.h b/paddle/optimizer/serialization.h deleted file mode 100644 index bf12eed15f0190b8e856163c68690f3f6eef9a12..0000000000000000000000000000000000000000 --- a/paddle/optimizer/serialization.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "OptimizerConfig.pb.h" -#include "paddle/utils/Logging.h" -#include "tensor.h" - -namespace paddle { -namespace optimizer { - -static void TensorToProto(const Tensor& tensor, TensorProto* proto) { - proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32); - std::stringstream os; - for (size_t i = 0; i < tensor.size(); ++i) { - os << tensor[i]; - proto->add_content(os.str()); - os.str(std::string()); - } -} - -static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) { - std::stringstream sin; - for (auto i = 0; i < proto.content_size(); ++i) { - sin << proto.content(i); - sin >> (*tensor)[i]; - sin.str(std::string()); - sin.clear(); - } -} - -} // namespace optimizer -} // namespace paddle diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h deleted file mode 100644 index d2cef99074335be6f9852d60daa103b9b45a550d..0000000000000000000000000000000000000000 --- a/paddle/optimizer/tensor.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -/** - * @brief tensor used by optimizer - */ - -#include -#include -#include "paddle/utils/Common.h" -#include "paddle/utils/Logging.h" - -namespace paddle { -namespace optimizer { - -template -class TensorT { - public: - TensorT(size_t size) : height_(1), width_(size) { - // new T[size]() initializes all element to zero value. - data_ptr_ = std::shared_ptr(new T[size](), std::default_delete()); - data_ = data_ptr_.get(); - } - - TensorT(T* data, size_t size) - : height_(1), width_(size), data_ptr_(nullptr), data_(data) {} - - TensorT(T* data, size_t h, size_t w) - : height_(h), width_(w), data_ptr_(nullptr), data_(data) {} - - virtual ~TensorT() {} - - T* get_buffer() { return this->data_; } - - T& operator[](const size_t idx) { - CHECK(idx >= 0 && idx < this->width_) << "out of index range"; - return data_[idx]; - } - T& operator[](const size_t idx) const { - CHECK(idx >= 0 && idx < this->width_) << "out of index range"; - return data_[idx]; - } - // TODO: replace with tensorshape - size_t size() const { return this->width_ * this->height_; } - - protected: - size_t height_; - size_t width_; - std::shared_ptr data_ptr_; - T* data_; -}; - -// TODO(zhihong): design problem of dynamic datatype, need to fix it -typedef TensorT Tensor; - -} // namespace optimizer -} // namespace paddle diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp deleted file mode 100644 index 94522f718a0c19bfc704ca92eddef5c5a9cb6919..0000000000000000000000000000000000000000 --- a/paddle/parameter/Argument.cpp +++ /dev/null @@ -1,707 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Argument.h" -#include "paddle/math/SparseMatrix.h" - -#include - -namespace paddle { -static void resizeAndCopy(MatrixPtr& dest, - const MatrixPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - if (!dest) { - dest = src->clone(0, 0, useGpu); - } else { - CHECK_EQ(dest->useGpu(), useGpu); - dest->resize(src->getHeight(), src->getWidth()); - } - dest->copyFrom(*src, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(IVectorPtr& dest, - const IVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - IVector::resizeOrCreate(dest, src->getSize(), useGpu); - dest->copyFrom(*src, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(ICpuGpuVectorPtr& dest, - const ICpuGpuVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu); - dest->copyFrom(*src, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(MatrixPtr& dest, - const MatrixPtr& src, - int32_t startRow, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startRow + copySize, src->getHeight()); - int height = copySize; - int width = src->getWidth(); - if (!dest) { - dest = src->clone(height, width, useGpu); - } else { - CHECK_EQ(dest->useGpu(), useGpu); - dest->resize(height, width); - } - MatrixPtr submat = src->subMatrix(startRow, copySize); - if (dynamic_cast(dest.get())) { - // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix. - // First copy it to CPU, and then copy it to the GPU. - MatrixPtr tmp = src->clone(height, width, false); - tmp->copyFrom(*submat, stream); - dest->copyFrom(*tmp, stream); - } else { - dest->copyFrom(*submat, stream); - } - } else { - dest.reset(); - } -} - -static void resizeAndCopy(IVectorPtr& dest, - const IVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startPos + copySize, src->getSize()); - - int height = copySize; - IVector::resizeOrCreate(dest, height, useGpu); - dest->copyFrom(src->getData() + startPos, height, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(ICpuGpuVectorPtr& dest, - const ICpuGpuVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startPos + copySize, src->getSize()); - - ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu); - dest->copyFrom(*src, startPos, copySize, useGpu, stream); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(SVectorPtr& dest, - const SVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - size_t height = src->size(); - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin(), height, dest->begin()); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(SVectorPtr& dest, - const SVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK_LE((size_t)startPos + copySize, src->size()); - size_t height = copySize; - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin() + startPos, height, dest->begin()); - } else { - dest.reset(); - } -} - -void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { - resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); - 
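  // The stream-less overload delegates to the explicit-stream variant on
  // HPPL_STREAM_DEFAULT and then blocks on the synchronize below, so callers get
  // synchronous copy semantics without managing a stream themselves.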
hl_stream_synchronize(HPPL_STREAM_DEFAULT); -} - -void Argument::resizeAndCopyFrom(const Argument& src, - bool useGpu, - hl_stream_t stream) { - dataId = src.dataId; - resizeAndCopy(value, src.value, useGpu, stream); - resizeAndCopy(grad, src.grad, useGpu, stream); - resizeAndCopy(in, src.in, useGpu, stream); - resizeAndCopy(ids, src.ids, useGpu, stream); - resizeAndCopy(sequenceStartPositions, - src.sequenceStartPositions, - false /* useGpu */, - stream); - if (src.hasSubseq()) { - resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, - false /* useGpu */, - stream); - } - resizeAndCopy(strs, src.strs, useGpu, stream); - frameWidth = src.frameWidth; - frameHeight = src.frameHeight; - frameDepth = src.frameDepth; -} - -int32_t Argument::resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu) { - int32_t size = - resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - return size; -} - -int32_t Argument::resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu, - hl_stream_t stream) { - dataId = src.dataId; - frameWidth = src.frameWidth; - frameHeight = src.frameHeight; - frameDepth = src.frameDepth; - - if (!src.sequenceStartPositions) { - // non-sequence input, copy samples directly - int32_t startRow = startSeq; - resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream); - resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream); - resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream); - resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream); - resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); - return copySize; - } else { - // sequence input - const int* sequence = src.sequenceStartPositions->getData(false); - int32_t startRow = sequence[startSeq]; // sample start from here - int32_t endRow = sequence[startSeq + copySize]; // sample end - int32_t copyFeatureSize = endRow - startRow; // num of samples - resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(sequenceStartPositions, - src.sequenceStartPositions, - startSeq, - copySize + 1, - false, - stream); - // modify new sequenceStartPositions - int* destSequences = sequenceStartPositions->getMutableData(false); - for (int i = 0; i < copySize + 1; i++) { - destSequences[i] -= startRow; - } - CHECK_EQ(destSequences[0], 0); - CHECK_EQ(destSequences[copySize], copyFeatureSize); - if (src.hasSubseq()) { - // sequence has sub-sequence - int* subSequence = src.subSequenceStartPositions->getMutableData(false); - int32_t subStartSeq = 0; - int32_t subEndSeq = 0; - int numSubSequences = src.getNumSubSequences(); - for (int i = 0; i < numSubSequences + 1; i++) { - if (subSequence[i] == startRow) { - subStartSeq = i; - } else if (subSequence[i] == endRow) { - subEndSeq = i; - break; - } - } - int32_t copySubSize = subEndSeq - subStartSeq; - resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, - subStartSeq, - copySubSize + 1, - false, - stream); - // modify new subSequenceStartPositions - int* destSubSequences = subSequenceStartPositions->getMutableData(false); - for (int i = 0; i < copySubSize + 1; i++) { - destSubSequences[i] -= startRow; - } - 
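  // After rebasing against startRow, the copied sub-sequence offsets must begin
  // at 0 and end exactly at copyFeatureSize; the CHECKs below enforce that
  // invariant.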
CHECK_EQ(destSubSequences[0], 0); - CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize); - } - resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); - return copyFeatureSize; - } -} - -void Argument::concat(const std::vector& args, - const std::vector& selectRows, - const std::vector& seqStartPos, - const std::vector& copySize, - bool useGpu, - hl_stream_t stream, - PassType passType) { - CHECK(!subSequenceStartPositions) - << "undefined behavior for subsequence positions"; - - size_t batchSize = 0; - for (size_t i = 0; i < copySize.size(); ++i) - batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]); - - auto copyArg = [batchSize, stream](MatrixPtr& dst, - MatrixPtr src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { - if (!src) { - dst.reset(); - return; - } - size_t width = src->getWidth(); - if (!dst) { - dst = src->clone(batchSize, width, useGpu); - } else { - dst->resize(batchSize, width); - } - - MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size); - tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream); - }; - - auto copyIds = [batchSize, stream](IVectorPtr& dst, - const IVectorPtr& src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { - if (!src) { - dst.reset(); - return; - } - IVector::resizeOrCreate(dst, batchSize, useGpu); - dst->subVec(desStartRow, size) - ->copyFrom(*src->subVec(srcStartRow, size), stream); - }; - - auto copyStrs = [batchSize](SVectorPtr& dst, - const SVectorPtr& src, - int desStartRow, - int srcStartRow, - int size, - bool useGpu) { - if (!src) { - dst.reset(); - return; - } - if (!dst) { - dst = std::make_shared>(batchSize); - } else { - dst->resize(batchSize); - } - std::copy(src->begin() + srcStartRow, - src->begin() + srcStartRow + size, - dst->begin() + desStartRow); - }; - - dataId = args[0].dataId; - CHECK_NE(seqStartPos.size(), 0UL); - int desStartRow = 0; - for (size_t i = 0; i < copySize.size(); ++i) { - int startPos = seqStartPos[i]; - int endPos = seqStartPos[i + 1]; - CHECK_GE(args.size(), static_cast(endPos - startPos)); - for (int j = startPos; j < endPos; ++j) { - const Argument& arg = args[j - startPos]; - CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have " - << "the same dataId."; - const int srcStartRow = selectRows[j]; - copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu); - copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu); - if (passType != PASS_TEST) { - copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu); - } - copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu); - copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu); - desStartRow += copySize[i]; - } - } - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, seqStartPos.size(), useGpu); - sequenceStartPositions->copyFrom( - seqStartPos.data(), seqStartPos.size(), useGpu); -} - -void Argument::concat(const std::vector& args, - bool useGpu, - hl_stream_t stream, - PassType passType) { - int32_t batchSize = 0; - int64_t numSequences = 0; - int64_t numSubSequences = 0; - for (auto& arg : args) { - batchSize += arg.getBatchSize(); - numSequences += arg.getNumSequences(); - numSubSequences += arg.getNumSubSequences(); - } - - auto copyArg = [batchSize, stream]( - MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) { - if (!src) { - dst.reset(); - return; - } - size_t width = src->getWidth(); - if (!dst) { - dst = src->clone(batchSize, width, useGpu); - } else { - dst->resize(batchSize, 
width); - } - - MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight()); - tmpMatrix->copyFrom(*src, stream); - }; - - auto copyIds = [batchSize, stream]( - IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) { - if (!src) { - dst.reset(); - return; - } - IVector::resizeOrCreate(dst, batchSize, useGpu); - dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); - }; - - auto copyStrs = [batchSize]( - SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { - if (!src) { - dst.reset(); - return; - } - if (!dst) { - dst = std::make_shared>(batchSize); - } else { - dst->resize(batchSize); - } - std::copy(src->begin(), src->end(), dst->begin() + startRow); - }; - - auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq, - const ICpuGpuVectorPtr& srcSeq, - int dstNumSequences, - int srcNumSequences, - int& startSequences, - int startRow) { - if (srcSeq) { - ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); - const int* src = srcSeq->getData(false); - int* dest = dstSeq->getMutableData(false); - for (int i = 0; i < srcNumSequences + 1; ++i) { - dest[i + startSequences] = src[i] + startRow; - } - startSequences += srcNumSequences; - } else { - dstSeq.reset(); - } - }; - - int startRow = 0; - int startSequences = 0; - int startSubSequences = 0; - dataId = args[0].dataId; - for (auto& arg : args) { - CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" - << " same dataId"; - copyArg(in, arg.in, startRow, useGpu); - copyArg(value, arg.value, startRow, useGpu); - if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu); - copyIds(ids, arg.ids, startRow, useGpu); - copySequencePos(sequenceStartPositions, - arg.sequenceStartPositions, - numSequences, - arg.getNumSequences(), - startSequences, - startRow); - copySequencePos(subSequenceStartPositions, - arg.subSequenceStartPositions, - numSubSequences, - arg.getNumSubSequences(), - startSubSequences, - startRow); - copyStrs(strs, arg.strs, startRow, useGpu); - startRow += arg.getBatchSize(); - } -} - -void Argument::splitByDataId(const std::vector& argus, - std::vector>* arguGroups) { - arguGroups->clear(); - int lastDataId = -1; - for (const auto& argu : argus) { - if (argu.dataId == -1) { - // is -1, then create a new group - arguGroups->emplace_back(); - lastDataId = -1; - } else if (argu.dataId != lastDataId) { - // not -1, also not equal to last Argument, then create a new group - arguGroups->emplace_back(); - lastDataId = argu.dataId; - } else { - // not -1, and equal to last Argument, do nothing - } - arguGroups->back().push_back(argu); - } -} - -void Argument::getSeqInfo(std::vector* seqInfo) const { - const int* starts = sequenceStartPositions->getData(false); - const int* subStarts = - hasSubseq() ? 
subSequenceStartPositions->getData(false) : nullptr; - size_t numSequences = getNumSequences(); - seqInfo->reserve(numSequences); - int subSeqEnd = 0; - for (size_t i = 0; i < numSequences; ++i) { - SeqInfo info; - info.seqStart = starts[i]; - info.subLevelLength = starts[i + 1] - starts[i]; - info.seqId = i; - if (hasSubseq()) { - info.subSeqStart = subSeqEnd; - while (subStarts[subSeqEnd] < starts[i + 1]) { - ++subSeqEnd; - } - info.topLevelLength = subSeqEnd - info.subSeqStart; - } else { - info.topLevelLength = info.subLevelLength; - info.subSeqStart = 0; // not used - } - seqInfo->push_back(info); - } - std::sort( - seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) { - return a.topLevelLength > b.topLevelLength; - }); -} - -void Argument::checkSubset() const { - if (getNumSequences() > getNumSubSequences()) { - LOG(FATAL) << "numSubSequences is less than numSequences (" - << getNumSubSequences() << " vs. " << getNumSequences() << ")"; - } - const int* start = sequenceStartPositions->getData(false); - const int* subStart = subSequenceStartPositions->getData(false); - int seqId = 0; - int subSeqId = 0; - while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) { - if (start[seqId] > subStart[subSeqId]) { - ++subSeqId; - } else if (start[seqId] == subStart[subSeqId]) { - ++subSeqId; - ++seqId; - } else { - LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; - } - } - if (seqId < getNumSequences()) { - LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions"; - } -} - -void Argument::degradeSequence(const Argument& input) { - CHECK_EQ(input.hasSubseq(), 1UL); - size_t numSequences = input.getNumSequences(); - size_t numSubSequences = input.getNumSubSequences(); - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, numSequences + 1, false); - int* tgtBuf = sequenceStartPositions->getMutableData(false); - const int* starts = input.sequenceStartPositions->getData(false); - const int* subStarts = input.subSequenceStartPositions->getData(false); - int seqId = 0; - for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) { - if (subStarts[subSeqId] == starts[seqId]) { - tgtBuf[seqId] = subSeqId; - seqId++; - } - } - tgtBuf[numSequences] = numSubSequences; -} - -void Argument::poolSequenceWithStride(const Argument& input, - size_t stride, - ICpuGpuVectorPtr* stridePostions, - bool reversed) { - // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5, - // then sequenceStartPositions = [0, 2, 3, 4, 7]. 
- // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30]; - // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30] - - CHECK(input.sequenceStartPositions); - CHECK_EQ(input.hasSubseq(), 0UL); - CHECK_GT(stride, 0UL) << "stride must larger than 0"; - size_t numSequences = input.getNumSequences(); - ICpuGpuVector::resizeOrCreate( - sequenceStartPositions, numSequences + 1, false); - const int* starts = input.sequenceStartPositions->getData(false); - int* tgtBuf = sequenceStartPositions->getMutableData(false); - // first index of target sequence and stride positions are both 0 - tgtBuf[0] = 0; - std::vector stridePos; - for (size_t seqId = 0; seqId < numSequences; ++seqId) { - size_t seqLength = starts[seqId + 1] - starts[seqId]; - stridePos.emplace_back(starts[seqId]); - if (seqLength == 0) { - // empty sequence - tgtBuf[seqId + 1] = tgtBuf[seqId]; - } else { - int size = ceil((float)seqLength / stride); - tgtBuf[seqId + 1] = tgtBuf[seqId] + size; - for (int i = 0; i < size - 1; ++i) { - int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride - : stridePos.back() + stride; - stridePos.emplace_back(cur); - } - } - } - stridePos.emplace_back(starts[numSequences]); - int size = stridePos.size(); - CHECK_EQ(size - 1, tgtBuf[numSequences]); - ICpuGpuVector::resizeOrCreate(*stridePostions, size, false); - (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size); -} - -void Argument::getValueString( - std::unordered_map* out) const { - if (value) { - std::ostringstream os; - value->print(os); - out->insert({"value", os.str()}); - } - if (ids) { - std::ostringstream os; - ids->print(os, ids->getSize()); - out->insert({"ids", os.str()}); - } - if (sequenceStartPositions) { - std::ostringstream os; - sequenceStartPositions->getVector(false)->print( - os, sequenceStartPositions->getSize()); - out->insert({"sequence pos", os.str()}); - } - if (subSequenceStartPositions) { - std::ostringstream os; - subSequenceStartPositions->getVector(false)->print( - os, subSequenceStartPositions->getSize()); - out->insert({"sub-sequence pos", os.str()}); - } -} - -void Argument::printValueString(std::ostream& stream, - const std::string& prefix) const { - std::unordered_map out; - getValueString(&out); - for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) { - auto it = out.find(field); - if (it != out.end()) { - stream << prefix << field << ":\n" << it->second; - } - } -} - -void Argument::subArgFrom(const Argument& input, - size_t offset, - size_t height, - size_t width, - bool useGpu, - bool trans, - bool seqFlag, - size_t seqStart, - size_t seqSize) { - if (input.value) { - value = Matrix::create( - input.value->getData() + offset * width, height, width, trans, useGpu); - } - if (input.ids) { - ids = IVector::create(input.ids->getData() + offset, height, useGpu); - } - if (input.grad) { - grad = Matrix::create( - input.grad->getData() + offset * width, height, width, trans, useGpu); - } - if (seqFlag) { - sequenceStartPositions = std::make_shared( - *(input.sequenceStartPositions), seqStart, seqSize); - } -} - -void Argument::reorganizeSeqInfo( - const ICpuGpuVectorPtr seqStartPos, - const ICpuGpuVectorPtr subSeqStartPos, - std::vector>& reorganizedSeqInfo) { - CHECK(seqStartPos); - reorganizedSeqInfo.clear(); - - int seqNum = seqStartPos->getSize() - 1; - int* seqStarts = seqStartPos->getMutableData(false); - - if (subSeqStartPos) { - int* subSeqStarts = subSeqStartPos->getMutableData(false); - reorganizedSeqInfo.resize(seqNum, 
std::vector()); - int seqIdx = 0; - for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { - reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); - if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { - seqIdx++; - if (seqIdx == seqNum) return; - reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); - } - } - } else { - reorganizedSeqInfo.resize(1, std::vector(seqNum + 1, 0)); - memcpy(reorganizedSeqInfo[0].data(), - seqStarts, - sizeof(int) * seqStartPos->getSize()); - } -} - -} // namespace paddle diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h deleted file mode 100644 index e580d38216b699360fb30f135be8052ab56abf66..0000000000000000000000000000000000000000 --- a/paddle/parameter/Argument.h +++ /dev/null @@ -1,349 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "hl_gpu.h" - -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -typedef std::shared_ptr> SVectorPtr; - -struct Argument { - Argument() - : in(nullptr), - value(nullptr), - ids(nullptr), - grad(nullptr), - strs(nullptr), - frameHeight(0), - frameWidth(0), - frameDepth(0), - sequenceStartPositions(nullptr), - subSequenceStartPositions(nullptr), - cpuSequenceDims(nullptr), - deviceId(-1), - allCount(0), - valueCount(0), - gradCount(0), - dataId(0) {} - Argument(const Argument& argument) { - *this = argument; - valueCount = 0; - gradCount = 0; - dataId = argument.dataId; - } - ~Argument() {} - - void operator=(const Argument& argument) { - in = argument.in; - value = argument.value; - ids = argument.ids; - grad = argument.grad; - strs = argument.strs; - sequenceStartPositions = argument.sequenceStartPositions; - subSequenceStartPositions = argument.subSequenceStartPositions; - cpuSequenceDims = argument.cpuSequenceDims; - deviceId = argument.deviceId; - allCount = argument.allCount; - frameHeight = argument.frameHeight; - frameWidth = argument.frameWidth; - frameDepth = argument.frameDepth; - dataId = argument.dataId; - } - - MatrixPtr in; // used if needed - MatrixPtr value; - IVectorPtr ids; // a sequence of ids. Can be use for class id for costLayer - MatrixPtr grad; // If empty, gradient is not needed. - SVectorPtr strs; - - // A dataBatch includes batchSize frames, one frame maybe not only vector - size_t frameHeight; - size_t frameWidth; - size_t frameDepth; - - // If NULL, each position is treated independently. - // Otherwise, its size should be #NumberOfSequences + 1. - // The first position is always 0 and - // the last position should be equal to batchSize. - ICpuGpuVectorPtr sequenceStartPositions; - - // If NULL, each sequence has no subsequence. - // Otherwise, its size should be #NumberOfSubSequences + 1. - // The first position is always 0 and - // the last position should be equal to batchSize. 
- ICpuGpuVectorPtr subSequenceStartPositions; - - // dimension of sequence, stored only in CPU - IVectorPtr cpuSequenceDims; - - int deviceId; // the GPU device id which the argument in - int allCount; // the number of output layers using this argument - mutable int valueCount; // waiting this member when layer do forward - mutable int gradCount; // waiting this member when layer do backward - mutable LockedCondition valueReadyCond; - mutable LockedCondition gradReadyCond; - - int dataId; // dataProvider id - - /* Increase the reference count of the argument. */ - void countIncrement() { allCount++; } - - int getAllCount() const { return allCount; } - - void waitValueReady() const { - valueReadyCond.wait([this] { return (valueCount != 0); }); - - std::lock_guard guard(*valueReadyCond.mutex()); - valueCount--; - } - - void notifyValueReady() const { - valueReadyCond.notify_all([this] { valueCount = allCount; }); - } - - void waitGradReady() const { - gradReadyCond.wait([this] { return (gradCount == allCount); }); - gradCount = 0; - } - - void notifyGradReady() const { - gradReadyCond.notify_all([this] { gradCount++; }); - } - - int64_t getBatchSize() const { - if (value) return value->getHeight(); - if (ids) return ids->getSize(); - if (grad) return grad->getHeight(); - if (in) return in->getHeight(); - if (strs) return strs->size(); - return 0; - } - size_t getFrameHeight() const { return frameHeight; } - size_t getFrameWidth() const { return frameWidth; } - size_t getFrameDepth() const { return frameDepth; } - void setFrameHeight(size_t h) { frameHeight = h; } - void setFrameWidth(size_t w) { frameWidth = w; } - void setFrameDepth(size_t d) { frameDepth = d; } - - int64_t getNumSequences() const { - return sequenceStartPositions ? sequenceStartPositions->getSize() - 1 - : getBatchSize(); - } - - int64_t getNumSubSequences() const { - return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1 - : getBatchSize(); - } - - bool hasSeq() const { return sequenceStartPositions != nullptr; } - bool hasSubseq() const { return subSequenceStartPositions != nullptr; } - - const int* getCpuStartPositions() const { - return hasSubseq() ? subSequenceStartPositions->getData(false) - : sequenceStartPositions->getData(false); - } - - static inline real sum(const std::vector& arguments) { - real cost = 0; - for (auto& arg : arguments) { - if (arg.value) { - SetDevice device(arg.deviceId); - cost += arg.value->getSum(); - } - } - return cost; - } - - /** - * @brief (value, ids, grad, sequenceStartPositions) of output are subset of - * input. Note that, output share the same memory of input. 
- * - * @param input[in] input - * @param offset[in] offset in terms of rows - * @param height[in] height of output.value - * @param width[in] width of output.value - * @param useGpu[in] - * @param trans[in] whether input.value is transform - * @param seqFlag[in] whether input has sequenceStartPositions - * @param seqStart[in] offset of input.sequenceStartPositions - * @param seqSize[in] lenght of output.sequenceStartPositions - */ - void subArgFrom(const Argument& input, - size_t offset, - size_t height, - size_t width, - bool useGpu, - bool trans = false, - bool seqFlag = false, - size_t seqStart = 0, - size_t seqSize = 0); - /* - * for sequence input: - * startSeq: the sequence id of start - * copySize: how many sequences need to copy - * return value: how many samples are copied - * for non-sequence input: - * startSeq: the sample id of start - * copySize: how many samples need to copy - * return value: how many samples are copied - * Note that when specifying the stream explicitly in this case, - * synchronize should also be called somewhere after this function - */ - int32_t resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu, - hl_stream_t stream); - - /* - * same with the above function, except that the stream is - * HPPL_STREAM_DEFAULT and synchronize is automatically called - * inside it - */ - int32_t resizeAndCopyFrom(const Argument& src, - int32_t startSeq, - int32_t copySize, - bool useGpu = FLAGS_use_gpu); - - void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); - - /* - * same with the above function, except that the stream is - * HPPL_STREAM_DEFAULT and synchronize is automatically called - * inside it - */ - void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu); - - /* - @brief Concatenate several arguments into one and put the result into it. - @param args : a vector of argument, each element of which is a frame in a - batch of sequences. - @param selectRows : select several row of args to concatenate - @param seqStartPos : sequence start positions in the final Argument - @param hl_stream_t : cuda stream - @param passTyoe : type of task, training or testing - */ - void concat(const std::vector& args, - const std::vector& selectRows, - const std::vector& seqStartPos, - const std::vector& copySize, - bool useGpu, - hl_stream_t stream, - PassType passType); - - /* - Concatenate several args into one and put the result into this. - */ - void concat(const std::vector& src, - bool useGpu = FLAGS_use_gpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT, - PassType passType = PASS_TEST); - - /* - * split vector to several vectors according to dataId - */ - static void splitByDataId(const std::vector& argus, - std::vector>* arguGroups); - - struct SeqInfo { - // Equal to sequence length for sequence data - // Equal to number of subsequences for subsequence data - int topLevelLength; - - int seqStart; - int seqId; - - // Equal to topLevelLength for sequence data - // Equal to sum of the length of subsequences for subsequence data - int subLevelLength; - - // Only used for subsequence data, start position of this sequence - // is subSequenceStartPositions, i.e. - // subSequenceStartPositions[subSeqStart] == seqStart - int subSeqStart; - }; - /* - Get SeqInfo for each sequence of this argument - Elements in *seqInfo are sorted by topLevelLength in descending order - */ - void getSeqInfo(std::vector* segInfo) const; - - /* - Check Whether sequenceStartPositions is subset of - subSequenceStartPositions. 
- */ - void checkSubset() const; - - /* - sequence has sub-sequence degrades to a sequence. - */ - void degradeSequence(const Argument& input); - - /* - After pooling with stride n (n is smaller than sequence length), - a long sequence will be shorten. - This function is invalid for sequence having sub-sequence. - */ - void poolSequenceWithStride(const Argument& input, - size_t stride, - ICpuGpuVectorPtr* stridePositions, - bool reversed = false); - /** - * @brief getValueString will return the argument's output in string. There - * are several kinds of output. The keys of output dictionary are 'value', - * 'id', 'sequence pos', 'sub-sequence pos'. - * @param out [out]: the return values. - */ - void getValueString(std::unordered_map* out) const; - - /** - * @brief printValueString will print the argument's output in order of - * 'value', 'id', 'sequence pos', 'sub-sequence pos'. - * @param stream: Output stream - * @param prefix: line prefix for printing. - */ - void printValueString(std::ostream& stream, - const std::string& prefix = "") const; - - /** - * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and - * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo. - * - * @param seqStartPos: sequenceStartPositions of an Argument. - * @param subSeqStartPos: subSequenceStartPositions of an Argument. - * @param the reorganized sequence start position information. - * - * Examples: - * seqStartPos: [0, 4, 15, 20, 28] - * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28] - * reorganizedSeqInfo: - * [ - * [0,3,4], - * [4,5,7,10,15], - * [15,20], - * [20,22,23,25,28] - * ] - */ - static void reorganizeSeqInfo( - const ICpuGpuVectorPtr seqStartPos, - const ICpuGpuVectorPtr subSeqStartPos, - std::vector>& reorganizedSeqInfo); -}; - -} // namespace paddle diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp deleted file mode 100644 index 182e833405e8f8bc3a4c9ffddbf628040f9cceaa..0000000000000000000000000000000000000000 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "FirstOrderOptimizer.h" -#include "paddle/math/TrainingAlgorithmOp.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Util.h" - -#include - -DEFINE_bool(log_clipping, false, "enable log clipping or not"); - -namespace paddle { - -SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer( - const OptimizationConfig& optConfig) - : ParameterOptimizer(optConfig) { - addParameterType(PARAMETER_MOMENTUM); - addParameterType(PARAMETER_MOMENTUM_UT); - addParameterType(PARAMETER_MOMENTUM_VT); - alpha_ = 1; - beta_ = 1; - tau_ = -1; - threshold_ = 1e+06; -} - -void SparseMomentumParameterOptimizer::init(size_t numRows, - const ParameterConfig* config) { - isParameterSparse_ = numRows != 0; - t0Vec_.resize(numRows); - t0Vec_.assign(t0Vec_.size(), 0); - timer_ = 0; - momentum_ = config->momentum(); - decayRate_ = config->decay_rate(); - gamma_ = config->learning_rate(); -} - -void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) { - learningRate_ = calcLearningRate(numSamplesProcessed, pass_); - if (isParameterSparse_) { - tau_ = tau_ + beta_ / alpha_; - alpha_ = alpha_ / momentum_; - beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_); - } -} - -void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& paraConfig, - size_t sparseId) const { - if (sparseId != -1LU) { - CHECK_LT(sparseId, t0Vec_.size()); - if (t0Vec_[sparseId] == 0) { - vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); - t0Vec_[sparseId] = 1; - } - vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], - -alpha_ * gamma_ * learningRate_); - vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], - tau_ * alpha_ * gamma_ * learningRate_); - vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], - tau_ / beta_ + 1.0 / alpha_, - *vecs[PARAMETER_MOMENTUM_VT], - 1.0 / beta_); - - } else { - vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], - *vecs[PARAMETER_MOMENTUM], - learningRate_ * paraConfig.learning_rate(), - paraConfig.momentum(), - applyDecay_ ? paraConfig.decay_rate() : 0); - } -} - -ParameterOptimizer::TraverseCallback -SparseMomentumParameterOptimizer::needSpecialTraversal( - const ParameterConfig& config) const { - if (alpha_ > threshold_ && isParameterSparse_) { - // Restart to avoid large value multiplication - // 1. \alpha = 1, \beta = 1, \tau = 0 - // 2. 
Note that \tau * u_t + v_t = \beta \theta_t, therefore: - // u_t should be rescaled to u_t/alpha_ - // v_t should be reset to \theta_t - return [this](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { - vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_); - vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); - }; - } else { - return nullptr; - } -} - -void SparseMomentumParameterOptimizer::finishBatch() { - timer_++; - if (!isParameterSparse_) return; - if (alpha_ > threshold_) { - alpha_ = 1; - beta_ = 1; - tau_ = -1; - } -} - -void AdagradParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real epsilon = optConfig_.ada_epsilon(); - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? config.decay_rate() : 0; - - adagradApply(value, - grad, - mom, - accum_buffer, - accum, - lr, - epsilon, - learningRate, - momentum, - decayRate); -} - -ParameterOptimizer::TraverseCallback -AdagradParameterOptimizer::needSpecialTraversal( - const ParameterConfig& config) const { - if (numUpdates_ % kMaxNumAccumulates == 0) { - // Move the sum to a different buffer to avoid loss of precision - // due to too many sums. - return [](const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) { - vecs[PARAMETER_GRADIENT_SQURESUM]->add( - *vecs[PARAMETER_GRADIENT_SQURESUM1]); - vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem(); - }; - } else { - return nullptr; - } -} - -void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - CHECK(sparseId == -1LU) << "Sparse update is not supported"; - - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? config.decay_rate() : 0; - - adadeltaApply(value, - grad, - mom, - accum, - accum_update, - lr, - rou_, - epsilon_, - learningRate, - momentum, - decayRate); -} - -void RMSPropParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real accumulatedRou = rou_; - bool firstTime = timer_ == 0; - if (sparseId != -1LU) { - CHECK_LT(sparseId, t0Vec_.size()); - accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); - firstTime = t0Vec_[sparseId] == 0; - t0Vec_[sparseId] = timer_ + 1; - } - - real epsilon = optConfig_.ada_epsilon(); - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? 
config.decay_rate() : 0; - - rmspropApply(value, - grad, - mom, - sum, - sum1, - lr, - accumulatedRou, - rou_, - epsilon, - learningRate, - momentum, - decayRate, - firstTime); -} - -void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; - BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; - - real accumulatedRou = rou_; - bool firstTime = timer_ == 0; - if (sparseId != -1LU) { - CHECK_LT(sparseId, t0Vec_.size()); - accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]); - firstTime = t0Vec_[sparseId] == 0; - t0Vec_[sparseId] = timer_ + 1; - } - - real epsilon = optConfig_.ada_epsilon(); - real learningRate = learningRate_ * config.learning_rate(); - real momentum = config.momentum(); - real decayRate = applyDecay_ ? config.decay_rate() : 0; - - decayedAdagradApply(value, - grad, - mom, - sum, - lr, - accumulatedRou, - rou_, - epsilon, - learningRate, - momentum, - decayRate, - firstTime); -} - -void AdamParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - CHECK(sparseId == -1UL) << "Sparse update is not supported"; - - real beta1_power = std::pow(beta1_, step_); - real beta2_power = std::pow(beta2_, step_); - real learningRate = config.learning_rate() * learningRate_; - - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM]; - - adamApply(value, - grad, - mom, - v, - beta1_, - beta2_, - beta1_power, - beta2_power, - epsilon_, - learningRate); -} - -void AdamaxParameterOptimizer::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - CHECK(sparseId == -1UL) << "Sparse update is not supported"; - real learningRate = config.learning_rate() * learningRate_; - - BaseMatrix& value = *vecs[PARAMETER_VALUE]; - BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; - BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; - BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM]; - - adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate); -} - -void OptimizerWithGradientClipping::update(const VectorPtr vecs[], - const ParameterConfig& config, - size_t sparseId) const { - real globalThreshold = optConfig_.gradient_clipping_threshold(); - real localThreshold = config.gradient_clipping_threshold(); - - // Use local gradient clipping threshold if it's enabled, - // otherwise using the global one. - real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold; - std::string field = localThreshold > 0.0f ? 
"local" : "global"; - - real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax(); - if (maxAbsGrad > threshold) { - if (FLAGS_log_clipping) { - real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() / - vecs[PARAMETER_GRADIENT]->getSize(); - LOG(INFO) << "parameter=" << config.name() << " need clipping by " - << field << " threshold=" << threshold - << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad; - } - vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold); - } - optimizer_->update(vecs, config, sparseId); -} - -} // namespace paddle diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp deleted file mode 100644 index d57d2189a45dc8cbcea7a8a5f25c5ec7ac71cca3..0000000000000000000000000000000000000000 --- a/paddle/parameter/LearningRateScheduler.cpp +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "LearningRateScheduler.h" -#include "paddle/utils/StringUtil.h" - -namespace paddle { - -ClassRegistrar - LearningRateScheduler::registrar_; - -LearningRateScheduler* LearningRateScheduler::create( - const OptimizationConfig& config) { - return registrar_.createByType(config.learning_rate_schedule(), config); -} - -// LRS stands for LearningRateScheduler - -class BaseLRS : public LearningRateScheduler { - public: - explicit BaseLRS(const OptimizationConfig& config) - : learningRate_(config.learning_rate()), - a_(config.learning_rate_decay_a()), - b_(config.learning_rate_decay_b()) {} - - protected: - real learningRate_; - real a_; - real b_; -}; - -class ConstLRS : public BaseLRS { - public: - explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - return learningRate_; - } -}; -REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS); - -class PolyLRS : public BaseLRS { - public: - explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_); - } -}; -REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS); - -class CaffePolyLRS : public BaseLRS { - public: - explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - if (numSamplesProcessed > a_) { - LOG_FIRST_N(WARNING, 1) - << "Using caffe_poly learning rate schedule, " - << "learning rate hits ZERO when " - << "numSamplesProcessed > config.learning_rate_decay_b(), " - << "training is over and you can stop it. 
" - << "See common/LearningRateScheduler.cpp for more info."; - return 0; - } else { - return learningRate_ * pow(1.0 - numSamplesProcessed / a_, b_); - } - } -}; -REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS); - -class ExpLRS : public BaseLRS { - public: - explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - double decayRatio = (double)numSamplesProcessed / b_; - return learningRate_ * pow(a_, decayRatio); - } -}; -REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS); - -class DiscreteExpLRS : public BaseLRS { - public: - explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - int numDecays = floor(numSamplesProcessed / b_); - return learningRate_ * pow(a_, numDecays); - } -}; -REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS); - -class LinearLRS : public BaseLRS { - public: - explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - return std::max(learningRate_ - a_ * numSamplesProcessed, b_); - } -}; -REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS); - -/* - specify learning rate through - learning_rate_args = 'seg0:rate0,seg1:rate1,...,segK:rateK' - if seg_{i-1} <= numSamples <= seg_i, - then learning_rate = learning_rate_base * rate_i -*/ -class ManualLRS : public BaseLRS { - public: - explicit ManualLRS(const OptimizationConfig& config) - : BaseLRS(config), currentSegment_(0), lastNum_(0) { - std::vector pieces; - str::split(config.learning_rate_args(), ',', &pieces); - rates_.reserve(pieces.size()); - std::string s1, s2; - - for (auto& piece : pieces) { - auto pos = piece.find(':'); - CHECK(pos != std::string::npos) << "Wrong format for learning_rate_args: " - << config.learning_rate_args(); - segments_.push_back(str::to(piece.substr(0, pos))); - rates_.push_back(str::to(piece.substr(pos + 1))); - } - } - - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - return calc(numSamplesProcessed); - } - - real calc(int64_t num) { - // We assume that num never decreases. - CHECK_LE(lastNum_, num); - lastNum_ = num; - while (currentSegment_ < rates_.size()) { - if (num <= segments_[currentSegment_]) { - return learningRate_ * rates_[currentSegment_]; - } - ++currentSegment_; - if (currentSegment_ < rates_.size()) { - LOG(INFO) << " learning_rate changes to " - << learningRate_ * rates_[currentSegment_]; - } - } - return learningRate_ * rates_.back(); - } - - protected: - std::vector rates_; - std::vector segments_; - size_t currentSegment_; - int64_t lastNum_; -}; - -REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS); - -class PassManualLRS : public ManualLRS { - public: - explicit PassManualLRS(const OptimizationConfig& config) - : ManualLRS(config) {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) { - return calc(pass); - } -}; - -REGISTER_LEARNING_RATE_SCHEDULER(pass_manual, PassManualLRS); -} // namespace paddle diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h deleted file mode 100644 index 3fad97040248dcf8a22988c38153df31f267ed37..0000000000000000000000000000000000000000 --- a/paddle/parameter/LearningRateScheduler.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "TrainerConfig.pb.h" -#include "paddle/utils/ClassRegistrar.h" - -namespace paddle { -// NOLINTNEXTLINES_4 -#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - LearningRateScheduler::registrar_.registerClass<__class_name>( \ - #__type_name); \ - }) - -class LearningRateScheduler { - public: - static LearningRateScheduler* create(const OptimizationConfig& config); - virtual ~LearningRateScheduler() {} - virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0; - - static ClassRegistrar registrar_; -}; - -} // namespace paddle diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp deleted file mode 100644 index 0e6ea90f3d582e843c62bda000313eb71289d5b4..0000000000000000000000000000000000000000 --- a/paddle/parameter/Parameter.cpp +++ /dev/null @@ -1,425 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "Parameter.h" -#include -#include -#include "AverageOptimizer.h" -#include "FirstOrderOptimizer.h" -#include "OptimizerFunctions.h" -#include "OptimizerWithRegularizer.h" -#include "ParameterUpdateFunctions.h" -#include "ThreadLocalBuffer.h" -#include "hl_gpu.h" -#include "paddle/math/CpuSparseMatrix.h" -#include "paddle/math/MathUtils.h" -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/utils/Logging.h" - -DEFINE_int32(enable_grad_share, - (100 * 1024 * 1024), - "threshold for enable gradient parameter share for batch " - "multi-cpu training"); -DEFINE_int32( - grad_share_block_num, - 64, - "block number of gradient parameter share for batch multi-cpu training"); - -namespace paddle { - -const std::string Parameter::kMissParameterFail = "fail"; -const std::string Parameter::kMissParameterRand = "rand"; -const std::string Parameter::kMissParameterZero = "zero"; - -Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit) - : config_(config), - useGpu_(useGpu), - deviceId_(-1), - sharedCount_(0), - updateCounter_(0), - updated_(false), - headerFormat_(PARAM_FORMAT_ORIGINAL) { - setID(-1); /* capture uninitialized id */ - if (useGpu_ && FLAGS_parallel_nn) { - /* gpu environment is specified by device property */ - deviceId_ = config_.device(); - if (deviceId_ < 0) { - useGpu_ = false; - } - } - - if (doInit) { - initialize(); - } - - for (int i = 0; i < config.update_hooks_size(); ++i) { - this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i)); - } -} - -void Parameter::initialize() { - SetDevice device(deviceId_); - - bufs_[PARAMETER_VALUE] = - Vector::createParallelVector(config_.size(), useGpu_); - bufs_[PARAMETER_VALUE]->zeroMem(); - - if (config_.is_sparse()) { - enableSparseParameter(); - } - - if (!isStatic()) { - bufs_[PARAMETER_GRADIENT] = - Vector::createParallelVector(config_.size(), useGpu_); - bufs_[PARAMETER_MOMENTUM] = - Vector::createParallelVector(config_.size(), useGpu_); - - bufs_[PARAMETER_GRADIENT]->zeroMem(); - bufs_[PARAMETER_MOMENTUM]->zeroMem(); - } -} - -void Parameter::randomize(const VectorPtr& value, - const ParameterConfig& config) { - if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) { - // initialize the parameter as uniform distribution - real initial_min = config.initial_mean() - config.initial_std(); - real initial_max = config.initial_mean() + config.initial_std(); - value->uniform(initial_min, initial_max); - VLOG(1) << config.name() << ": initial_min=" << initial_min - << ", initial_max=" << initial_max; - } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) { - /* Initialize the parameters randomly */ - value->randnorm(config.initial_mean(), config.initial_std()); - VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean() - << ", initial_std=" << config.initial_std(); - } else { - LOG(FATAL) << "not supported initial_strategy: " - << config.initial_strategy(); - } -} - -void Parameter::randomize() { - if (!bufs_[PARAMETER_VALUE]) return; - SetDevice device(deviceId_); - Parameter::randomize(bufs_[PARAMETER_VALUE], config_); - - if (config_.is_sparse()) { - if (format_ == SPARSE_CSC) { - sparseRand(intBufs_[PARAMETER_COLS]->getData(), - intBufs_[PARAMETER_ROWS]->getData(), - config_.size(), - config_.dims(1) + 1, - config_.dims(0), - useGpu_); - } else { - sparseRand(intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), - config_.size(), - config_.dims(0) + 1, - config_.dims(1), - useGpu_); - } - } - setValueUpdated(); -} - -void 
Parameter::zeroMem() { - if (!bufs_[PARAMETER_VALUE]) return; - bufs_[PARAMETER_VALUE]->zeroMem(); - setValueUpdated(); - LOG(INFO) << getName() << " set to 0"; -} - -bool Parameter::isGradShared(size_t* blockNum) { - if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 && - !isGradSparseUpdate() && - this->getSize() > (size_t)FLAGS_enable_grad_share) { - if (blockNum) { - *blockNum = (size_t)FLAGS_grad_share_block_num; - } - return true; - } - return false; -} - -bool Parameter::isValueShared() { - return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1; -} - -bool Parameter::isGradSparseUpdate() const { - return !useGpu_ && !isStatic() && - (config_.sparse_update() || config_.sparse_remote_update()); -} - -void Parameter::setMat(ParameterType pType, int matType) { - CHECK(!mats_[pType]); - - if (config_.dims_size() == 0 && matType == MAT_NORMAL) { - return; - } - - CHECK_EQ((size_t)config_.dims_size(), 2LU); - size_t height = config_.dims(0); - size_t width = config_.dims(1); - if (matType == MAT_NORMAL) { - if (!config_.is_sparse()) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - mats_[pType] = - Matrix::create(bufs_[pType]->getMemoryHandle(), height, width); - } else { - size_t size = bufs_[pType]->getSize(); - CHECK_GE(height * width, size); - if (format_ == SPARSE_CSR) { - CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize()); - CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize()); - } else { - CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize()); - CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize()); - } - mats_[pType] = - Matrix::createSparseMatrix(bufs_[pType]->getData(), - intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), - height, - width, - bufs_[pType]->getSize(), - FLOAT_VALUE, - format_, - false, - useGpu_); - } - } -#ifndef PADDLE_MOBILE_INFERENCE - // NOLINTNEXTLINE - else if (matType == MAT_NORMAL_SHARED) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - size_t blockNum = 0; - CHECK(isGradShared(&blockNum)); - mats_[pType] = std::make_shared( - blockNum, - std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, - width); - } else if (matType == MAT_VALUE_SHARED) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - mats_[pType] = std::make_shared( - std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, - width); - } else if (matType == MAT_SPARSE_ROW_IDS) { - CHECK_EQ(height * width, bufs_[pType]->getSize()); - mats_[pType] = std::make_shared( - std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, - width); - } else if (matType == MAT_SPARSE_ROW) { - auto valueMat = - std::dynamic_pointer_cast(mats_[PARAMETER_VALUE]); - SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr); - if (pType != PARAMETER_VALUE) { - CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set " - << " and its type must be MAT_SPARSE_ROW," - << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW"; - indexDict = valueMat->getIndexDictHandle(); - } - auto mat = - std::make_shared(nullptr, - height, - width, - // grad share index with value - indexDict); - mats_[pType] = mat; - } else if (matType == MAT_CACHE_ROW) { - CHECK(isGradSparseUpdate()); - auto mat = std::make_shared(height, width); - mats_[pType] = mat; - } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || - matType == MAT_SPARSE_ROW_PREFETCH) { - auto mat = std::make_shared( - bufs_[pType] ? 
std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()) - : nullptr, - height, - width, - nullptr, // indexDictHandle - getGlobalSyncThreadPool()); - mats_[pType] = mat; - } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { - CHECK(isGradSparseUpdate()); - mats_[pType] = std::make_shared(height, width); - } -#endif - // NOLINTNEXTLINE - else { - LOG(FATAL) << "Unsupported mat type" << matType; - } -} - -void Parameter::incUpdate(const UpdateCallback& callback) { - // Static parameter is fixed, and does not need to be updated - if (isStatic()) { - return; - } - - ++updateCounter_; - if (isUpdatable()) { - if (callback) callback(this); - clearUpdate(); - } -} - -bool Parameter::save(const std::string& filename) const { - std::ofstream fs(filename, std::ios_base::binary); - CHECK(fs) << "Fail to open " << filename; - return save(fs); -} - -bool Parameter::save(std::ostream& s) const { - CpuVector vec(*bufs_[PARAMETER_VALUE].get()); - Header header; - header.format = headerFormat_; - header.valueSize = sizeof(real); - header.size = getSize(); - - CHECK_EQ(header.size, vec.getSize()); - - CHECK(s.write(reinterpret_cast(&header), sizeof(header))) - << "Fail to write parameter " << getName(); - - CHECK(s.write(reinterpret_cast(vec.getData()), - header.size * sizeof(real))) - << "Fail to write parameter " << getName(); - if (config_.is_sparse()) { - CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); - CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); - CHECK(s.write(reinterpret_cast(rows.getData()), - rows.getSize() * sizeof(int))) - << "Fail to write parameter " << getName(); - CHECK(s.write(reinterpret_cast(cols.getData()), - cols.getSize() * sizeof(int))) - << "Fail to write parameter " << getName(); - } - - return true; -} - -/** - * Load parameter value from a file - */ -bool Parameter::load(const std::string& filename) { - std::ifstream fs(filename, std::ios_base::binary); - if (!fs) { - LOG(INFO) << "missing parameters [" << filename << "] while loading model."; - if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) { - LOG(FATAL) << getName() << " missing, not allowed."; - return false; - } - if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) { - LOG(INFO) << getName() << " missing, set to random."; - randomize(); - return true; - } - if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) { - LOG(INFO) << getName() << " missing, set to zero."; - zeroMem(); - return true; - } - LOG(FATAL) << "unsupported load_missing_parameter_strategy: " - << FLAGS_load_missing_parameter_strategy; - return false; - } - return load(fs); -} - -bool Parameter::load(std::istream& s) { - CpuVector vec(*bufs_[PARAMETER_VALUE].get()); - Header header; - CHECK(s.read(reinterpret_cast(&header), sizeof(header))) - << "Fail to read parameter " << getName(); - CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: " - << header.format; - headerFormat_ = header.format; - CHECK_EQ(header.size, getSize()) - << "The size (" << header.size << ") in the file does not match the size " - << "(" << getSize() << ") of the parameter: " << getName(); - CHECK_EQ(header.valueSize, sizeof(real)) - << "Unsupported valueSize " << header.valueSize << " at: " << getName(); - CHECK(s.read(reinterpret_cast(vec.getData()), - header.size * sizeof(real))); - - auto& tmp = *bufs_[PARAMETER_VALUE].get(); - if (typeid(tmp) == typeid(GpuVector)) { - bufs_[PARAMETER_VALUE]->copyFrom(vec); - } - - if (config_.is_sparse() && config_.need_compact()) { - // load from dense parameter with 
many zero - CHECK_EQ(config_.dims_size(), 2); - auto height = config_.dims(0); - auto width = config_.dims(1); - auto mat = Matrix::create(vec.getData(), height, width); - CpuSparseMatrix sparseMat(height, - width, - 0, - FLOAT_VALUE, - format_, - /*trans*/ false); - sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT); - auto nnz = sparseMat.getElementCnt(); - size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz; - size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1; - - intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize); - intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize); - bufs_[PARAMETER_VALUE]->resize(nnz); // for setMat check - bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz); - config_.set_size(nnz); - LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width)) - << " name=" << config_.name(); - } else if (config_.is_sparse()) { - CpuIVector rows(*intBufs_[PARAMETER_ROWS].get()); - CpuIVector cols(*intBufs_[PARAMETER_COLS].get()); - size_t rowSize, colSize; - CHECK_EQ(config_.dims_size(), 2); - if (format_ == SPARSE_CSR) { - rowSize = config_.dims(0) + 1; - colSize = config_.size(); - } else { - rowSize = config_.size(); - colSize = config_.dims(1) + 1; - } - CHECK( - s.read(reinterpret_cast(rows.getData()), rowSize * sizeof(int))); - CHECK( - s.read(reinterpret_cast(cols.getData()), colSize * sizeof(int))); - auto& paramRows = *intBufs_[PARAMETER_ROWS].get(); - if (typeid(paramRows) == typeid(GpuIVector)) { - intBufs_[PARAMETER_ROWS]->copyFrom(rows); - } - auto& paramCols = *intBufs_[PARAMETER_COLS].get(); - if (typeid(paramCols) == typeid(GpuIVector)) { - intBufs_[PARAMETER_COLS]->copyFrom(cols); - } - } - - setValueUpdated(); - - return true; -} - -} // namespace paddle diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h deleted file mode 100644 index ef519bf35a4f051b4477eb04b5eb2c5f0b5e29e8..0000000000000000000000000000000000000000 --- a/paddle/parameter/Parameter.h +++ /dev/null @@ -1,380 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include -#include -#include - -#include "ParameterConfig.pb.h" -#include "TrainerConfig.pb.h" - -#include "ParameterUpdaterHook.h" -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -typedef enum { - /// The paddle original basic format - PARAM_FORMAT_ORIGINAL = 0, - - /// See mkldnn_memory_format_t in - /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h - /// for a detailed description. - /// 2D weights tensor in the format (output channels, input channels). 
- PARAM_FORMAT_MKLDNN_OI, - - /// The total format items numbers - PARAM_FORMAT_ITEMS, -} PARAM_FORMAT; - -class SparsePrefetchRowCpuMatrix; - -class Parameter; -typedef std::function UpdateCallback; -typedef std::function ParamInitCallback; - -class Parameter; -typedef std::shared_ptr ParameterPtr; - -class Parameter { - public: - Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true); - const std::string& getName() const { return config_.name(); } - - size_t getSize() const { return config_.size(); } - - bool isFullSize() const { - if (bufs_[PARAMETER_VALUE]) { - return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); - } - return false; - } - - inline bool useGpu() const { return useGpu_; } - - int getDeviceId() const { return deviceId_; } - - void setDevice(int deviceId) { deviceId_ = deviceId; } - - /// The id ranges from 0 to the_total_number_of_parameters - 1 - size_t getID() const { return config_.para_id(); } - - /// ID is a implict value created until neural network is built. - void setID(size_t id) { config_.set_para_id(id); } - - bool isStatic() const { return config_.is_static(); } - - enum MatType { - MAT_NORMAL, - /// both value and grad are shared - MAT_NORMAL_SHARED, - - /// Now used in BatchNorm in CPU mode - MAT_VALUE_SHARED, - - /// sparse matrix, which has full size parameter - MAT_SPARSE_ROW_IDS, - /// sparse matrix, parameter size scale by sparse rates. - MAT_SPARSE_ROW_AUTO_GROW, - MAT_CACHE_ROW, - MAT_SPARSE_ROW, - - /// sparse matrix for prefetching parameter from pserver - MAT_SPARSE_ROW_PREFETCH, - /// same as above, but parameter has full size for saving parameter in local - MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, - }; - - void enableSparseParameter() { - if (config_.is_sparse()) { - if (config_.format() == "csr") { - size_t height = config_.dims(0); - size_t nnz = config_.size(); - enableIntType(PARAMETER_ROWS, height + 1); - enableIntType(PARAMETER_COLS, nnz); - format_ = SPARSE_CSR; - } else { - size_t width = config_.dims(1); - size_t nnz = config_.size(); - enableIntType(PARAMETER_COLS, width + 1); - enableIntType(PARAMETER_ROWS, nnz); - format_ = SPARSE_CSC; - } - } - } - - /// allocate buffer for the give type - void enableType(ParameterType type, MatType matType = MAT_NORMAL) { - if (bufs_[type] || mats_[type]) { - return; - } - SetDevice device(deviceId_); - if (config_.dims_size() == 2) { - if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED || - matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || - matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) { - bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); - bufs_[type]->zeroMem(); - } else { - CHECK(isGradSparseUpdate()); - } - if (config_.is_sparse() && type == PARAMETER_VALUE) { - enableSparseParameter(); - } - setMat(type, matType); - } else { - bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); - bufs_[type]->zeroMem(); - } - } - - void enableBufType(ParameterType type) { - if (bufs_[type]) return; - bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); - bufs_[type]->zeroMem(); - } - - void enableIntType(ParameterType type, size_t intStoreSize = 0) { - if (!intBufs_[type]) { - SetDevice device(deviceId_); - size_t size = intStoreSize ? 
intStoreSize : config_.size(); - intBufs_[type] = IVector::create(size, useGpu_); - intBufs_[type]->zeroMem(); - } - } - - void enableSharedType(ParameterType type, - VectorPtr vec, - MatrixPtr mat = nullptr) { - if (!bufs_[type] && !mats_[type]) { - bufs_[type] = vec; - mats_[type] = mat; - } - } - - /// for batchGradientMachine: blockNum is number of partitions of the matrix. - bool isGradShared(size_t* blockNum = NULL); - - bool isValueShared(); - - // for AsgdSparseGradientMachine & SgdSparseGradientMachine: - // and MultiGradientMachine - bool isGradSparseUpdate() const; - - bool isSparseRemoteUpdate() const { - return config_.sparse_remote_update() && !useGpu(); - } - - const ParameterConfig& getConfig() const { return config_; } - - ParameterConfig& getConfig() { return config_; } - - bool hasType(ParameterType pType) const { - return bufs_[pType] || mats_[pType]; - } - - const VectorPtr& getBuf(ParameterType pType) const { - return this->bufs_[pType]; - } - - const VectorPtr* getBufs() const { return bufs_; } - - const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; } - - void setValueUpdated() { updated_ = true; } - - void clearValueUpdated() { updated_ = false; } - - bool isValueUpdated() const { return updated_; } - - /** - * Save parameter value to a file - */ - bool save(const std::string& filename) const; - - /** - * Save parameter to ostream - */ - bool save(std::ostream& s) const; - - /** - * Load parameter value from a file - */ - bool load(const std::string& filename); - - /** - * Load parameter from istream - */ - bool load(std::istream& is); - - void incShared() { sharedCount_++; } - - /** - * After one of the parameter's gradient is merged - * You should call this function to do some additional processing, - */ - void incUpdate(const UpdateCallback& callbacks = NULL); - - void clearGradient() { - auto& mat = getMat(PARAMETER_GRADIENT); - if (mat) { - // zeroMem will also clear rows for SparseRowCpuMatrix - mat->zeroMem(); - } else { - auto& gradBuf = getBuf(PARAMETER_GRADIENT); - if (gradBuf) gradBuf->zeroMem(); - } - } - - void initialize(); - - /** - * Initialize the value according to config_: initial_mean, - * initial_std and initial_strategy. - */ - void randomize(); - static void randomize(const VectorPtr& value, const ParameterConfig& config); - - /// Initialize the value to 0 - void zeroMem(); - - /// file header structure - struct Header { - int32_t format; // = PARAM_FORMAT - uint32_t valueSize; // = sizeof(real) - uint64_t size; // = getSize() - }; - - /** - * @brief Is the header format supported. - */ - static bool isHeaderFormatSupported(int32_t fmt) { - return fmt < PARAM_FORMAT_ITEMS; - } - - /** - * @brief Get the format in header. - */ - int getHeaderFormat() { return headerFormat_; } - - /** - * @brief Set the format in header. - */ - void setHeaderFormat(int32_t fmt) { - CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: " - << fmt; - headerFormat_ = fmt; - } - - /** - * @brief Parameter Update Hook. - * - * The parameter's update hook before ParameterUpdater::updateImpl - * It could modify gradient/momentum/etc here. Such as drop some gradient, - * etc. - */ - void updateHook() { - for (auto& hook : updaterHooks_) { - hook->update(this); - } - } - - /** - * @brief Initialize all updater hook. - * - * This method should be invoked in ParameterUpdater::init() only. - */ - void initHook() { - for (auto& hook : updaterHooks_) { - hook->init(this); - } - } - - protected: - /** - * @brief create matrix to matType. 
- * - * used by gradient machine which needs specify matrix type, - * instead of creating in weights.cpp. - * - * @note pType should be enabled already. - */ - void setMat(ParameterType pType, int matType); - - bool isUpdatable() { return (updateCounter_ == sharedCount_); } - - void clearUpdate() { updateCounter_ = 0; } - - protected: - ParameterConfig config_; - - bool useGpu_; - - int deviceId_; - - /** - * @brief bufs_ stores parameter value and gradient. - * - * Layer should use bufs_[PARAMETER_VALUE] to form weight matrix for - * calculation and stores gradient to bufs_[PARAMETER_GRADIENT]. - */ - VectorPtr bufs_[NUM_PARAMETER_TYPES]; - - /** - * @brief Weight matrix for bufs_. - * - * It's helpfull when parameter shared by multi-layers. - * Caller should check, if mats exist, do not create it again. - */ - MatrixPtr mats_[NUM_PARAMETER_TYPES]; - - /// Int vectors, used in some User defined parameter types - IVectorPtr intBufs_[NUM_PARAMETER_TYPES]; - - int sharedCount_; - int updateCounter_; - - bool updated_; - SparseFormat format_; - - /// The header format for saving or loading param - int32_t headerFormat_; - - std::vector> updaterHooks_; - - public: - void setSharedCount(int cnt) { sharedCount_ = cnt; } - int getSharedCount() { return sharedCount_; } - - bool isSparse() { return config_.is_sparse(); } - SparseFormat getFormat() { return format_; } - - static const std::string kMissParameterFail; - static const std::string kMissParameterRand; - static const std::string kMissParameterZero; -}; - -typedef std::map ParameterMap; - -} // namespace paddle diff --git a/paddle/parameter/ParameterOptimizer.cpp b/paddle/parameter/ParameterOptimizer.cpp deleted file mode 100644 index 638daa58f1e5f3f416d7f90ad2662a523eaf6741..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterOptimizer.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/utils/Logging.h" - -#include - -#include "AverageOptimizer.h" -#include "FirstOrderOptimizer.h" -#include "OptimizerFunctions.h" -#include "OptimizerWithRegularizer.h" -#include "ParameterOptimizer.h" -#include "hl_gpu.h" - -namespace paddle { - -ParameterOptimizer* ParameterOptimizer::create( - const OptimizationConfig& optConfig, bool inPserver) { - if (inPserver && optConfig.num_batches_per_send_parameter() > 1) { - return new AddOptimizer(optConfig); - } - if (optConfig.learning_method() == "momentum") { - return new SgdOptimizer(optConfig); - } - if (optConfig.learning_method() == "torch_momentum") { - return new SgdOptimizer(optConfig); - } - if (optConfig.learning_method() == "adagrad") { - return new AdagradParameterOptimizer(optConfig); - } - if (optConfig.learning_method() == "adadelta") { - return new AdaDeltaParameterOptimizer(optConfig); - } - if (optConfig.learning_method() == "rmsprop") { - return new RMSPropParameterOptimizer(optConfig); - } - if (optConfig.learning_method() == "decayed_adagrad") { - return new DecayedAdagradParameterOptimizer(optConfig); - } - if (optConfig.learning_method() == "adam") { - return new AdamParameterOptimizer(optConfig); - } - if (optConfig.learning_method() == "adamax") { - return new AdamaxParameterOptimizer(optConfig); - } - if (optConfig.learning_method() == "sparse_momentum") { - return new SparseMomentumParameterOptimizer(optConfig); - } - return nullptr; -} - -} // namespace paddle diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp deleted file mode 100644 index db1153c2d6430e453d776b92b63152c311771668..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ /dev/null @@ -1,300 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/utils/Logging.h" -#ifdef __AVX__ -#include -#include -#endif - -#include "ParameterUpdateFunctions.h" - -namespace paddle { - -void sgdUpdateCpu(real learningRate, - real momentum, - real decayRate, - size_t size, - real* value, - const real* grad, - real* momentumVec) { - decayRate *= learningRate; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (size_t i = 0; i < size; ++i) { - momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] - - decayRate * value[i]; - value[i] += momentumVec[i]; - } -} - -void sgdUpdate(real learningRate, - real momentum, - real decayRate, - Vector* value, - Vector* grad, - Vector* momentumVec) { - size_t size = value->getSize(); - real* val = value->getData(); - real* grd = grad->getData(); - real* mom = momentumVec->getData(); - if (typeid(*value) == typeid(CpuVector)) { - sgdUpdateCpu(learningRate, momentum, decayRate, size, val, grd, mom); - } else if (typeid(*value) == typeid(GpuVector)) { - value->sgdUpdate(*grad, *momentumVec, learningRate, momentum, decayRate); - } else { - LOG(FATAL) << "Wrong"; - } -} - -void sgdUpdateAvx(float learningRate, - float momentum, - float decayRate, - size_t size, - float* value, - const float* _grad, - float* momentumVec) { -#ifdef __AVX__ - float* grad = const_cast(_grad); // the gradient is not modified - // but when invoke simd functions - // need non-const pointer. - size_t gradientAlign = 0; - size_t gradientAlignHeader = (size_t)grad % sizeof(__m256); - CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256)) - << "Gradent buffer didn't align with momentum buffer"; - CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256)) - << "Gradent buffer didn't align with value buffer"; - if (0 != gradientAlignHeader) { - gradientAlignHeader = sizeof(__m256) - gradientAlignHeader; - gradientAlign = gradientAlignHeader / sizeof(real); - - // handle the unalign buffer - for (size_t i = 0; i < gradientAlign; i++) { - momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) - - (decayRate * learningRate * value[i]); - value[i] += momentumVec[i]; - } - grad += gradientAlign; - momentumVec += gradientAlign; - value += gradientAlign; - } - - constexpr size_t kParallelNum = 8; - constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum; - size_t cntLoop = (size - gradientAlign) / nStepSize; - size_t cntRem = (size - gradientAlign) % nStepSize; - __m256 gradientTmp[kParallelNum]; - __m256 valueTmp[kParallelNum]; - __m256 lr, mom, dr; - std::function loopFun; - - learningRate *= -1; - lr = _mm256_set_ps(learningRate, - learningRate, - learningRate, - learningRate, - learningRate, - learningRate, - learningRate, - learningRate); - - if (0 != momentum) { - mom = _mm256_set_ps(momentum, - momentum, - momentum, - momentum, - momentum, - momentum, - momentum, - momentum); - } - - decayRate *= learningRate; - if (0 != decayRate) { - dr = _mm256_set_ps(decayRate, - decayRate, - decayRate, - decayRate, - decayRate, - decayRate, - decayRate, - decayRate); - } - - auto gradMulFun = [&](void) { - gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr); - gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr); - gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr); - gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr); - gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr); - gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr); - 
gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr); - gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr); - }; - - auto valueMulFun = [&](void) { - valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr); - valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr); - valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr); - valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr); - valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr); - valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr); - valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr); - valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr); - }; - - auto momentumMulFun = [&](void) { - *reinterpret_cast<__m256*>(momentumVec) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom); - *reinterpret_cast<__m256*>(momentumVec + 8) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom); - *reinterpret_cast<__m256*>(momentumVec + 16) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom); - *reinterpret_cast<__m256*>(momentumVec + 24) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom); - *reinterpret_cast<__m256*>(momentumVec + 32) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom); - *reinterpret_cast<__m256*>(momentumVec + 40) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom); - *reinterpret_cast<__m256*>(momentumVec + 48) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom); - *reinterpret_cast<__m256*>(momentumVec + 56) = - _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom); - }; - - auto momentumAddGradFun = [&](void) { - *reinterpret_cast<__m256*>(momentumVec) = - _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]); - *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]); - *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]); - *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]); - *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]); - *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]); - *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]); - *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]); - }; - - auto momentumZeroFun = [&](void) { - *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0]; - *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1]; - *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2]; - *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3]; - *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4]; - *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5]; - *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6]; - *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7]; - }; - - auto momentumAddValueFun = [&](void) { - *reinterpret_cast<__m256*>(momentumVec) = - _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]); - 
*reinterpret_cast<__m256*>(momentumVec + 8) = - _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]); - *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]); - *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]); - *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]); - *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]); - *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]); - *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps( - *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]); - }; - - auto valueAddMomentumFun = [&](void) { - *reinterpret_cast<__m256*>(value) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value), - *reinterpret_cast<__m256*>(momentumVec)); - *reinterpret_cast<__m256*>(value + 8) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8), - *reinterpret_cast<__m256*>(momentumVec + 8)); - *reinterpret_cast<__m256*>(value + 16) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16), - *reinterpret_cast<__m256*>(momentumVec + 16)); - *reinterpret_cast<__m256*>(value + 24) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24), - *reinterpret_cast<__m256*>(momentumVec + 24)); - *reinterpret_cast<__m256*>(value + 32) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32), - *reinterpret_cast<__m256*>(momentumVec + 32)); - *reinterpret_cast<__m256*>(value + 40) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40), - *reinterpret_cast<__m256*>(momentumVec + 40)); - *reinterpret_cast<__m256*>(value + 48) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48), - *reinterpret_cast<__m256*>(momentumVec + 48)); - *reinterpret_cast<__m256*>(value + 56) = - _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56), - *reinterpret_cast<__m256*>(momentumVec + 56)); - }; - - if (0 == decayRate && 0 == momentum) { - loopFun = [&](void) { - gradMulFun(); - momentumZeroFun(); - valueAddMomentumFun(); - }; - } else if (0 == decayRate && 0 != momentum) { - loopFun = [&](void) { - gradMulFun(); - momentumMulFun(); - momentumAddGradFun(); - valueAddMomentumFun(); - }; - } else if (0 != decayRate && 0 == momentum) { - loopFun = [&](void) { - gradMulFun(); - valueMulFun(); - momentumZeroFun(); - momentumAddValueFun(); - valueAddMomentumFun(); - }; - } else if (0 != decayRate && 0 != momentum) { - loopFun = [&](void) { - gradMulFun(); - valueMulFun(); - momentumMulFun(); - momentumAddGradFun(); - momentumAddValueFun(); - valueAddMomentumFun(); - }; - } - - for (size_t i = 0; i < cntLoop; i++) { - loopFun(); - grad += nStepSize; - momentumVec += nStepSize; - value += nStepSize; - } - - for (size_t i = 0; i < cntRem; i++) { - momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) + - (decayRate * value[i]); - value[i] += momentumVec[i]; - } -#endif -} - -} // namespace paddle diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h deleted file mode 100644 index 7434baa2d3d6297cc6d8d99b46cff516d6ac49f9..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterUpdateFunctions.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/math/Vector.h" -#include "paddle/utils/Common.h" - -namespace paddle { - -/** - * Performs the following operations. - * - * momentumVec = momentum * momentumVec - * - learningRate * grad - * - learningRate * decayRate * value - * - * value = value + momentumVec - * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary - * computation. - */ -void sgdUpdate(real learningRate, - real momentum, - real decayRate, - Vector* value, - Vector* grad, - Vector* momentumVec); - -void sgdUpdateCpu(real learningRate, - real momentum, - real decayRate, - size_t size, - real* value, - const real* grad, - real* momentumVec); - -void sgdUpdateAvx(float learningRate, - float momentum, - float decayRate, - size_t size, - float* value, - const float* grad, - float* momentumVec); - -} // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp deleted file mode 100644 index 7815856b45d93406597b332469de1c57a7781da5..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterUpdaterBase.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ParameterUpdaterBase.h" -#include -#include "hl_gpu.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -void ParameterUpdater::init(const std::vector& parameters) { - parameters_ = parameters; - for (ParameterType type : getParameterTypes()) { - for (auto& para : parameters) { - para->enableType(type); - } - } - for (size_t pid = 0; pid < parameters_.size(); ++pid) { - nonStaticParaIDMap_.insert( - std::pair(parameters_[pid]->getID(), pid)); - } - - for (auto& para : parameters) { - if (!para->isStatic()) { - para->initHook(); - } - } -} - -} // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp deleted file mode 100644 index 989185b66a5b7785bb0572fba59a72adeef9797b..0000000000000000000000000000000000000000 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ParameterUpdaterHook.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -/** - * The static pruning hook - * Static means user specify a sparsity_ratio before training started, and the - * network will prune the parameters based on the sparsity_ratio. More details - * can be found https://arxiv.org/pdf/1506.02626.pdf. - */ - -class StaticPruningHook : public IParameterUpdaterHook { - public: - explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig) - : initCount_(0) { - sparsityRatio_ = hookConfig.sparsity_ratio(); - } - - static bool sortPairAscend(const std::pair &pair1, - const std::pair &pair2) { - return pair1.first > pair2.first; - } - - void update(Parameter *para) { - updateThreadChecker_.check(); - auto &vec = para->getBuf(PARAMETER_GRADIENT); - if (vec) { - vec->dotMul(*maskVec_); - } - } - - void generateMask(Parameter *para) { - VectorPtr maskTemp = Vector::create(para->getSize(), false); - maskTemp->zeroMem(); - real *maskTempData = maskTemp->getData(); - size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_); - - VectorPtr paraVec = para->getBuf(PARAMETER_VALUE); - VectorPtr paraCpuCopy = Vector::create(para->getSize(), false); - - paraCpuCopy->copyFrom(*paraVec); - std::vector> param; - - for (size_t i = 0; i < para->getSize(); i++) - param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i)); - - std::partial_sort( - param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); - for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0; - - // Currently just use a mask vector for hack. - if (para->useGpu()) { - maskVec_ = Vector::create(para->getSize(), para->useGpu()); - maskVec_->copyFrom(*maskTemp); - } else { - maskVec_ = maskTemp; - } - } - - void init(Parameter *para) { - generateMask(para); - size_t initCount = this->initCount_.fetch_add(1); - CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke " - "in same ParamterUpdater"; - VLOG(3) << "Initialize Parameter " << para; - SetDevice device(para->getDeviceId()); - - auto ¶Vec = para->getBuf(PARAMETER_VALUE); - paraVec->dotMul(*maskVec_); - } - - private: - SameThreadChecker updateThreadChecker_; - std::atomic initCount_; - VectorPtr maskVec_; - real sparsityRatio_; -}; - -IParameterUpdaterHook::IParameterUpdaterHook() {} - -IParameterUpdaterHook::~IParameterUpdaterHook() {} - -/** - * A Hasher used by g_hooks. - * - * Use the independent hasher intendedly. There is a hasher in PServer for hash - * ParameterBlock. But not to use same hasher to reduce dependency. - * - * May be extracted to Util.h to unify the hasher. 
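Two things in StaticPruningHook above are easy to misread: despite its name, sortPairAscend orders pairs by descending absolute value, so the partial_sort places the largest-magnitude weights first and generateMask keeps exactly the top (1 - sparsity_ratio) fraction of them. The masking idea in isolation, as a small sketch on plain std::vector rather than paddle::Vector (illustrative only):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

// Keep the (1 - sparsityRatio) fraction of weights with largest |w|, zero the rest.
std::vector<float> makePruningMask(const std::vector<float>& w, float sparsityRatio) {
  const std::size_t keep =
      static_cast<std::size_t>(w.size() * (1.0f - sparsityRatio));
  std::vector<std::size_t> idx(w.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + keep, idx.end(),
                    [&](std::size_t a, std::size_t b) {
                      return std::fabs(w[a]) > std::fabs(w[b]);  // biggest magnitude first
                    });
  std::vector<float> mask(w.size(), 0.0f);
  for (std::size_t i = 0; i < keep; ++i) mask[idx[i]] = 1.0f;
  return mask;
}

The hook then multiplies both the parameter values (in init) and every gradient (in update) by this mask, so pruned positions stay at zero throughout training.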
- */ -class StringIntPairHasher { - public: - size_t operator()(const std::pair &k) const { - return intHasher_(strHasher_(k.first) + k.second); - } - - private: - std::hash strHasher_; - std::hash intHasher_; -}; - -static WeakKVCache, - IParameterUpdaterHook, - StringIntPairHasher> - g_hookCache_; - -/** - * ParameterUpdaterHook actually factory method. - */ -static IParameterUpdaterHook *createImpl( - const ParameterUpdaterHookConfig &config) { - auto &type = config.type(); - if (type == "pruning") { - return new StaticPruningHook(config); - } - - LOG(FATAL) << "Unknown Hook type: " << type; - return nullptr; -} - -std::shared_ptr IParameterUpdaterHook::create( - const ParameterConfig ¶mConfig, int idx) { - std::pair key = {paramConfig.name(), idx}; - return g_hookCache_.get( - key, [&] { return createImpl(paramConfig.update_hooks(idx)); }); -} - -} // namespace paddle diff --git a/paddle/parameter/Regularizer.cpp b/paddle/parameter/Regularizer.cpp deleted file mode 100644 index d223fd2df679af1e983e84f48a4d3b0715ce1569..0000000000000000000000000000000000000000 --- a/paddle/parameter/Regularizer.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Regularizer.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -Regularizer* Regularizer::get(const std::vector& types, - const ParameterConfig& paraConfig) { - bool useLearningRateVec = - std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) != - types.end(); - if (paraConfig.decay_rate_l1() > 0.0f && - paraConfig.decay_rate() > 0.0f) { // use L1 and L2 - if (useLearningRateVec) { - static L1L2LrRegularizer regularizer_; - return ®ularizer_; - } - static L1L2Regularizer regularizer_; - return ®ularizer_; - } - if (paraConfig.decay_rate_l1() > 0.0f) { // use L1 only - if (useLearningRateVec) { - static L1LrRegularizer regularizer_; - return ®ularizer_; - } - static L1Regularizer regularizer_; - return ®ularizer_; - } - if (paraConfig.decay_rate() > 0.0f) { // use L2 only - if (useLearningRateVec) { - static L2LrRegularizer regularizer_; - return ®ularizer_; - } - static L2Regularizer regularizer_; - return ®ularizer_; - } - return nullptr; -} - -} // namespace paddle diff --git a/paddle/parameter/ThreadLocalBuffer.h b/paddle/parameter/ThreadLocalBuffer.h deleted file mode 100644 index 07c96e59d0bc0a58ce9956a54e7de359896e5618..0000000000000000000000000000000000000000 --- a/paddle/parameter/ThreadLocalBuffer.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
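Regularizer::get above is a small factory over two booleans: whether a learning-rate vector is among the requested parameter types, and which of decay_rate_l1 / decay_rate are positive. For example (values are illustrative): decay_rate_l1 = 0.01 together with decay_rate = 0.0005 selects L1L2Regularizer, or L1L2LrRegularizer when PARAMETER_LEARNING_RATE is requested; decay_rate_l1 = 0 with decay_rate = 0.0005 selects L2Regularizer / L2LrRegularizer; and when both decay rates are zero the function returns nullptr, meaning the parameter gets no regularization at all. The returned objects are function-local statics, so they are shared, stateless singletons rather than per-parameter instances.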
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/math/Vector.h" - -namespace paddle { -namespace parameter { -extern VectorPtr* getThreadLocalBuffer(); -} // namespace parameter -} // namespace paddle diff --git a/paddle/parameter/Weight.cpp b/paddle/parameter/Weight.cpp deleted file mode 100644 index ba4ddce69fb9c2ad0fa937efca5ba470247978e4..0000000000000000000000000000000000000000 --- a/paddle/parameter/Weight.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Weight.h" -#include "paddle/utils/Logging.h" - -namespace paddle { - -Weight::Weight(size_t height, size_t width, ParameterPtr param) { - VectorPtr vPtr = param->getBuf(PARAMETER_VALUE); - VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT); - - // create a new weight - if (param->isSparse()) { - CHECK_LE(param->getSize(), width * height); - } else { - CHECK_EQ(param->getSize(), width * height); - } - - // weight_ - weight_ = param->getMat(PARAMETER_VALUE); - if (!weight_ && vPtr) { - weight_ = Matrix::create(vPtr->getMemoryHandle(), height, width); - } - if (weight_) { - CHECK_EQ(height, weight_->getHeight()); - CHECK_EQ(width, weight_->getWidth()); - } - - // weightGrad - weightGrad_ = param->getMat(PARAMETER_GRADIENT); - if (!weightGrad_ && gPtr) { - weightGrad_ = Matrix::create(gPtr->getMemoryHandle(), height, width); - } - if (weightGrad_) { - CHECK_EQ(height, weightGrad_->getHeight()); - CHECK_EQ(width, weightGrad_->getWidth()); - } - - parameter_ = param; -} - -Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) { - VectorPtr vPtr = param->getBuf(PARAMETER_VALUE); - VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT); - - // create a new weight - CHECK_LE(offset + width * height, param->getSize()); - - // weight_ - if (vPtr) { - weight_ = Matrix::create(vPtr->getData() + offset, - height, - width, - /* trans */ false, - param->useGpu()); - } - - // weightGrad - if (gPtr) { - weightGrad_ = Matrix::create(gPtr->getData() + offset, - height, - width, - /* trans */ false, - param->useGpu()); - } - - parameter_ = param; -} - -const ParameterPtr& Weight::getParameterPtr() { return parameter_; } -void Weight::setParameterPtr(ParameterPtr param) { parameter_ = param; } -} // namespace paddle diff --git a/paddle/parameter/Weight.h b/paddle/parameter/Weight.h deleted file mode 100644 index 113dd6530c82fe1e831ad4a35e9cbcb9880b9243..0000000000000000000000000000000000000000 --- a/paddle/parameter/Weight.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
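Note that neither Weight constructor above allocates storage: both wrap the parameter's existing value and gradient buffers as height x width matrix views, and the second one additionally starts the view at a given element offset (with CHECK_LE guarding against running past the parameter). A hypothetical use of the offset form, where param is assumed to be an existing ParameterPtr holding two projections of sizes h1 x w and h2 x w back to back:

// Illustrative only; h1, h2, w and param are assumptions, not code from this diff.
size_t h1 = 256, h2 = 128, w = 512;
Weight proj1(h1, w, param, /*offset=*/0);
Weight proj2(h2, w, param, /*offset=*/h1 * w);
// proj1.getW() and proj2.getW() are views into the same PARAMETER_VALUE
// buffer, so updating the parameter updates both weights without copies.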
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/math/Matrix.h" -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/parameter/Parameter.h" - -namespace paddle { - -class Weight { - private: - MatrixPtr weight_; - MatrixPtr weightGrad_; - ParameterPtr parameter_; - - public: - Weight(size_t height, size_t width, ParameterPtr parameter); - Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset); - - const MatrixPtr& getW() { return weight_; } - const MatrixPtr& getWGrad() { return weightGrad_; } - const ParameterPtr& getParameterPtr(); - - void incUpdate(const UpdateCallback& callback) { - getParameterPtr()->incUpdate(callback); - } - - void setParameterPtr(ParameterPtr param); -}; - -typedef std::vector> WeightList; - -} // namespace paddle diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp deleted file mode 100644 index 54ceb3e08714e37abb5d491c8973bee631b993be..0000000000000000000000000000000000000000 --- a/paddle/parameter/tests/test_argument.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -using namespace paddle; // NOLINT - -TEST(Argument, poolSequenceWithStride) { - Argument input, output; - ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false); - int* inStart = input.sequenceStartPositions->getMutableData(false); - inStart[0] = 0; - inStart[1] = 9; - inStart[2] = 14; - inStart[3] = 17; - inStart[4] = 30; - - int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30}; - int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30}; - - for (auto reversed : {false, true}) { - ICpuGpuVectorPtr stridePositions; - output.poolSequenceWithStride( - input, 5 /* stride */, &stridePositions, reversed); - - const int* outStart = output.sequenceStartPositions->getData(false); - CHECK_EQ(outStart[0], 0); - CHECK_EQ(outStart[1], 2); - CHECK_EQ(outStart[2], 3); - CHECK_EQ(outStart[3], 4); - CHECK_EQ(outStart[4], 7); - - CHECK_EQ(stridePositions->getSize(), 8UL); - auto result = reversed ? 
strideResultReversed : strideResult; - for (int i = 0; i < 8; i++) { - CHECK_EQ(stridePositions->getData(false)[i], result[i]); - } - } -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp deleted file mode 100644 index 89dcc6c751eb2ec07bfe8297c93d56c824086211..0000000000000000000000000000000000000000 --- a/paddle/parameter/tests/test_common.cpp +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include -#include -#include -#include -#include - -using namespace paddle; // NOLINT - -class CommonTest : public ::testing::Test { - protected: - CommonTest() : testStat_("test") {} - virtual ~CommonTest() {} - virtual void SetUp() { - const size_t buffSize[] = { - 100, 128, 500, 1024, 4096, 10240, 102400, 1000000}; - sizeVec_.resize(8); - memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t)); - valueUint_.resize(4); - valueUint_[0].first = 0.0; - valueUint_[0].second = 0.0; - valueUint_[1].first = 0.0; - valueUint_[1].second = 1.0; - valueUint_[2].first = 1.0; - valueUint_[2].second = 0.0; - valueUint_[3].first = 1.0; - valueUint_[3].second = 1.0; - learningRate_ = 1.0; - } - - void test_sgdUpadate(real* gradientBuffer, - real* valueBuffer, - real* momentumBuffer, - size_t size); - - virtual void TreaDown() { LOG(INFO) << "All Test Finished."; } - - protected: - std::vector> valueUint_; - std::vector sizeVec_; - real learningRate_; - StatSet testStat_; -}; - -void CommonTest::test_sgdUpadate(real* gradientBuffer, - real* valueBuffer, - real* momentumBuffer, - size_t size) { -// sgdUpdateAvx has no double version yet -#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE) - real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0; - real* gradTmp = new real[size]; - real* valueTmp = new real[size]; - real* momentumTmp = new real[size]; - memcpy(gradTmp, gradientBuffer, size * sizeof(real)); - memcpy(valueTmp, valueBuffer, size * sizeof(real)); - memcpy(momentumTmp, momentumBuffer, size * sizeof(real)); - for (auto& arg : valueUint_) { - { - { - struct timeval t; - REGISTER_TIMER("gettimeofday", 0, testStat_); - gettimeofday(&t, NULL); - } - REGISTER_TIMER("avxTimer", 0); - sgdUpdateAvx(learningRate_, - arg.first, - arg.second, - size, - valueBuffer, - gradientBuffer, - momentumBuffer); - } - for (size_t i = 0; i < size; i++) { - valueSum1 += valueBuffer[i]; - momSum1 += momentumBuffer[i]; - // std::cout << "[" - // << valueBuffer[i] - // << "," << momentumBuffer[i] - // << "," << gradientBuffer[i] << "],"; - } - { - REGISTER_TIMER("cpuTimer", 0); - sgdUpdateCpu(learningRate_, - arg.first, - arg.second, - size, - valueTmp, - gradTmp, - momentumTmp); - } - for (size_t i = 0; i < size; i++) { - valueSum2 += valueTmp[i]; - momSum2 += momentumTmp[i]; - // std::cout << "[" - // << valueTmp[i] - // << "," << momentumTmp[i] - // << "," 
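The expected numbers in the poolSequenceWithStride test above can be reproduced by hand: the input describes four sequences [0,9), [9,14), [14,17) and [17,30). With stride 5 each sequence is split into ceil(len / 5) windows, i.e. 2, 1, 1 and 3 windows, which gives the output sequence start positions {0, 2, 3, 4, 7} and the eight window boundaries {0, 5, 9, 14, 17, 22, 27, 30}. In reversed mode the windows are counted from the end of each sequence, so any partial window sits at the front and the boundaries become {0, 4, 9, 14, 17, 20, 25, 30}.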
<< gradTmp[i] << "],"; - } - - VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2; - VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2; - ASSERT_EQ(valueSum1, valueSum2); - ASSERT_EQ(momSum1, momSum2); - } - delete[] gradTmp; - delete[] valueTmp; - delete[] momentumTmp; -#endif -} - -TEST_F(CommonTest, sgdUpdate) { - const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; - for (auto& size : sizeVec_) { - real *gradientBuffer, *valueBuffer, *momentumBuffer; - CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), - 0); - CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); - CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), - 0); - - for (size_t i = 0; i < size; i++) { - gradientBuffer[i] = 1.0; - valueBuffer[i] = 2.0; - momentumBuffer[i] = 3.0; - } - for (int i = 0; i < 6; i++) { - LOG(INFO) << "----------------------" << size << ":" << alignHeader[i] - << "-------------------------"; - test_sgdUpadate(&gradientBuffer[alignHeader[i]], - &valueBuffer[alignHeader[i]], - &momentumBuffer[alignHeader[i]], - size - alignHeader[i]); - } - free(gradientBuffer); - free(valueBuffer); - free(momentumBuffer); - } - globalStat.printAllStatus(); - testStat_.printAllStatus(); -} - -TEST_F(CommonTest, syncThreadPool) { - SyncThreadPool pool(10); - - std::vector nums; - nums.resize(10); - - pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; }); - for (size_t i = 0; i < nums.size(); ++i) { - EXPECT_EQ((int)i, nums[i]); - } - - pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; }); - for (size_t i = 0; i < nums.size(); ++i) { - EXPECT_EQ((int)0, nums[i]); - } -} diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp deleted file mode 100644 index a6204ef47ea553246ddadbb2eae6cc714cafe594..0000000000000000000000000000000000000000 --- a/paddle/pserver/BaseClient.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "BaseClient.h" -#include -#include -#include -#include "paddle/utils/Stat.h" - -DECLARE_string(pservers); - -namespace paddle { - -BaseClient::BaseClient(bool separate, int numPorts) - : stopping_(false), numPorts_(numPorts), separateSendAndRecv_(separate) { - CHECK_GT(numPorts, 0); -} - -BaseClient::~BaseClient() {} - -void BaseClient::recvData() { recvSyncBarrier_->wait(); } - -void BaseClient::synchronize(SyncObject syncObjectId) { - SynchronizeRequest request; - request.set_sync_object_id(syncObjectId); - std::vector responses; - multiCall(__func__, request, &responses); -} - -void BaseClient::startThreads() { - if (!separateSendAndRecv_) { - return; - } - recvSyncBarrier_.reset(new ThreadBarrier(threadNum_ + 1)); - - sendThreads_.resize(threadNum_); - recvThreads_.resize(threadNum_); - sendJobQueue_.resize(threadNum_); - recvJobQueue_.resize(threadNum_); - - for (int i = 0; i < threadNum_; ++i) { - sendJobQueue_[i].reset(new SendQueue()); - recvJobQueue_[i].reset(new SendQueue()); - - sendThreads_[i].reset( - new std::thread([this](int id) { this->send(id); }, i)); - - recvThreads_[i].reset( - new std::thread([this](int id) { this->recv(id); }, i)); - } -} - -void BaseClient::finishThreads() { - if (!separateSendAndRecv_) { - return; - } - stopping_ = true; - for (int i = 0; i < threadNum_; i++) { - sendJobQueue_[i]->enqueue(nullptr); - } - for (auto& thread : sendThreads_) { - thread->join(); - } - for (auto& thread : recvThreads_) { - thread->join(); - } - stopping_ = false; -} -} // namespace paddle diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h deleted file mode 100644 index d50230e73a3a7d128cbfd1d70517fddd228fb1bb..0000000000000000000000000000000000000000 --- a/paddle/pserver/BaseClient.h +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ParameterService.pb.h" -#include "paddle/math/Matrix.h" -#include "paddle/pserver/ProtoServer.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Queue.h" - -namespace paddle { - -/** - * it manages all connections to pservers. - * it exists two modes to manage connections to all pservers. Firstly, one - * connection owns two threads that separately manage to send and receive - * data. Secondly, each thread uses one connection for all activation in it. - * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/ - * recvJobQueue_. the second solution use some shared thread pool to manage - * connections. 
- */ -class BaseClient { - protected: - typedef std::unique_ptr ThreadPtr; - typedef std::vector> InputIovs; - typedef std::vector SendRequest; - typedef std::vector SendDataRequestVec; - - // TODO(yanfei): - // refine data structure to unify parameter and features communication - struct SendJob { - /// store parameters related blocks data - InputIovs parallelInputIovs; - /// store protobuf request - SendRequest parallelRequests; - /// store data, such as features for metric learning - SendDataRequestVec parallelDataRequests; - }; - - public: - explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num); - - virtual ~BaseClient(); - - typedef std::shared_ptr SendJobPtr; - typedef Queue SendQueue; - - /// send data to server, support only synchronize - template - void putData(int clientId, - SendDataType type, - DataType* datas, - size_t size, - DataUpdateMode mode) { - synchronize(SYNC_DATA); - sendData(clientId, type, mode, datas, size); - recvData(); - synchronize(SYNC_DATA); - } - - template - void putOwnData(int clientId, - SendDataType type, - DataType* datas, - size_t size) { - putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN); - } - - template - void getAllData(int clientId, - SendDataType type, - DataType* datas, - size_t size) { - sendData(clientId, - type, - DATA_UPDATE_MODE_GET_ALL, - reinterpret_cast(NULL), - 0); - recvData(); - size_t dataOffset = 0; - for (auto& recvMem : recvDataMems_) { - CHECK_LE(dataOffset, size); - size_t memSize = std::min(recvMem.get()->getSize(), - sizeof(DataType) * (size - dataOffset)); - CHECK_EQ(memSize % sizeof(DataType), size_t(0)); - memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize); - dataOffset += memSize / sizeof(DataType); - } - CHECK_EQ(dataOffset, size); - } - - /** - * Reduces values on all clients. - * This reduce just support SUM. - * The results are saved in recvBuf of rootId client - */ - template - void reduce(DataType* sendBuf, - DataType* recvBuf, - size_t size, - int clientId, - int rootId) { - putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size); - if (rootId == clientId) { - getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size); - } - } - - /** - * return trans data type according to the input type - */ - virtual TransDataType getTransDtype(const std::type_info& info) { - TransDataType dataType; - if (typeid(int*) == info) { // NOLINT - dataType = TRANS_INT32; - } else if (typeid(uint32_t*) == info) { // NOLINT - dataType = TRANS_UINT32_T; - } else if (typeid(int64_t*) == info) { // NOLINT - dataType = TRANS_INT64_T; - } else if (typeid(uint64_t*) == info) { // NOLINT - dataType = TRANS_UINT64_T; - } else if (typeid(float*) == info) { // NOLINT - dataType = TRANS_FLOAT; - } else if (typeid(double*) == info) { // NOLINT - dataType = TRANS_DOUBLE; - } else { - LOG(FATAL) << "not supported"; - } - return dataType; - } - - protected: - /// for a > 0, b > 0: - /// return the smallest x s.t. 
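reduce() above is a sum-only, all-to-one reduction composed from putOwnData and getAllData: every client pushes its buffer to the data servers, and only the client whose id equals rootId pulls the summed result back. A hypothetical call site (client is assumed to be a pointer to some BaseClient subclass; the ids and sizes are illustrative):

std::vector<float> localStats(1024, 1.0f);
std::vector<float> summed(1024, 0.0f);
client->reduce(localStats.data(), summed.data(), localStats.size(),
               clientId, /*rootId=*/0);
// Only the client with clientId == 0 receives the sum; on the others
// 'summed' is left untouched because getAllData() is never called.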
b*x >= a - static int divup(int a, int b) { return (a + b - 1) / b; } - - int calcClientId(int i, int serviceNum) { - return (i + FLAGS_trainer_id * numPorts_) % serviceNum; - } - - /// start threads in sendThreads_ and recvThreads_ - void startThreads(); - - /// finish threads in sendThreads_ and recvThreads_ - void finishThreads(); - - template - void prepareData(int clientId, - SendDataType type, - DataUpdateMode updateMode, - DataType* datas, - size_t size, - SendJob* sendJob) { - sendJob->parallelDataRequests.resize(serviceNum_); - sendJob->parallelInputIovs.resize(serviceNum_); - for (int i = 0; i < serviceNum_; ++i) { - auto& request = sendJob->parallelDataRequests[i]; - request.set_update_mode(updateMode); - request.set_type(type); - request.set_client_id(clientId); - request.set_server_id(i); - } - - /// split datas which need send to Server into serviceNum_ pieces - if (!datas) { - CHECK(!size) << "ownSize should be zero since datas is nullptr"; - } - size_t baseSize = size / serviceNum_; - size_t dataOffset = 0; - for (int i = 0; i < serviceNum_; ++i) { - auto& request = sendJob->parallelDataRequests[i]; - DataBlock* block = request.add_blocks(); - size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize; - size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0; - block->set_total_size(realSize * sizeof(DataType)); - block->set_data_size(sizeof(DataType)); - // TODO(yuyang18): The getTransDtype can be rewritten as template method - // to reduce runtime overhead. - block->set_data_type(getTransDtype(typeid(DataType*))); // NOLINT - if (datas) { - sendJob->parallelInputIovs[i].push_back( - {datas + dataOffset, realSize * sizeof(DataType)}); - } - dataOffset += ownSize; - } - CHECK_EQ(dataOffset, size); - } - - /** - * @brief send data to all data servers - * - * @note each trainer sends all its data to all data servers - * it's for broadcast data synchronization, such as features - * synchronization in metric learning. - */ - template - void sendData(int clientId, - SendDataType type, - DataUpdateMode updateMode, - DataType* datas, - size_t size) { - SendJobPtr sendJob = std::make_shared(); - prepareData(clientId, type, updateMode, datas, size, sendJob.get()); - for (int i = 0; i < threadNum_; ++i) { - sendJobQueue_[i]->enqueue(sendJob); - } - } - - /** - * @brief recv data from all data servers - * - * @note synchronize all recv threads - */ - void recvData(); - - /// send request, and recv responses - template - void multiCall(const char* funcName, - const ProtoIn& request, - std::vector* responses) { - responses->resize(clients_.size()); - size_t numClients = clients_.size(); - for (size_t i = 0; i < numClients; ++i) { - clients_[i].send(funcName, request); - } - for (size_t i = 0; i < numClients; ++i) { - clients_[i].recv(&(*responses)[i]); - } - } - - /** - * @brief synchronize all trainers and pservers - * - * @note used to ensure that data of all trainers have been received - */ - void synchronize(SyncObject syncObjectId = SYNC_DEFAULT); - - /** - * @brief use multithread to separately send data - * - * @note each thread should read its own JobQueue to handle requests - * each thread should calcClientId() to retrieve connections - * managed by himself. - * send and recv are implemented in child class. 
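Two small pieces of arithmetic above deserve a worked example. divup(a, b) = (a + b - 1) / b is ceiling division, e.g. divup(10, 3) = 4. prepareData() splits a buffer of size elements across serviceNum_ servers by giving everyone baseSize = size / serviceNum_ elements and one extra element to the first size % serviceNum_ servers; for size = 10 and serviceNum_ = 4 the shares are 3, 3, 2 and 2. dataOffset walks through the buffer as the iovecs are built, which is why CHECK_EQ(dataOffset, size) must hold at the end.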
- */ - virtual void send(int threadId) = 0; - - /** - * @brief use multithread to separately receive data - * - * @note almost same as send() - */ - virtual void recv(int threadId) = 0; - - protected: - bool stopping_; - /// nodes * ports that means the number of real pservers - int serviceNum_; - /** - * threads num for managing all services. Normally the - * number of pservers are relatively less than several - * hundreds so that using thread-based parallelization - * can benifit traffic performance and pserver's sgd - * optimization performance. - */ - int threadNum_; - /// the connection manager at client end - std::vector clients_; - /// send threads for parallelization - std::vector sendThreads_; - /// recv threads for parallelization - std::vector recvThreads_; - std::unique_ptr recvSyncBarrier_; - - // TODO(yanfei): - // current pserver's will return value until all parameters' - // optimization are finished so that recv are not overlapped - // in reality. More robust implimentation should be to pipeline - // all send/recv action based on parameter unit level, and - // it will benifits deep and larger model training in future, - // especially local node compution power surpasses inter-connection - // such as GPU cluster, even with BOX GPU cluster. - // queue for buffering send request - /** - * send/recv queue cooperates with each other to accomplish - * overlapping communication with forwardBackward action. - */ - std::vector> sendJobQueue_; - /// queue for buffering recv request - std::vector> recvJobQueue_; - /// specific for dserver - SendJob sendJob_; - /// port num for each node - int numPorts_; - /// if set, overlapped optimization is disabled - bool separateSendAndRecv_; - std::vector recvDataMems_; -}; -} // namespace paddle diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp deleted file mode 100644 index 4c0da2217e880b7509ea5f42da5ac7ffe93a53ec..0000000000000000000000000000000000000000 --- a/paddle/pserver/LightNetwork.cpp +++ /dev/null @@ -1,459 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "LightNetwork.h" -#include "RDMANetwork.h" -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -/// quick ack can reduce the latency of small message -DEFINE_bool(small_messages, - false, - "if message size is small, recommend set it True to enable quick " - "ack and no delay"); - -/// reasonable sock_send_buf_size can control the traffic injected into switch -/// network. Injecting too many data into traffic could cause packets loss which -/// cause long latency and degrade the efficiency of communication. 
-DEFINE_int32(sock_send_buf_size, - 1024 * 1024 * 40, - "restrict sock send buff size, can reduce network congestion if " - "set carefully"); - -/// reasonable size can hold bursted packets and reduce packets loss -DEFINE_int32(sock_recv_buf_size, - 1024 * 1024 * 40, - "restrict sock recv buff size"); - -/// reasonable sock_listen_queue_size can control maximum pending connections. -DEFINE_int32(sock_listen_queue_size, - 1024, - "listen queue size when pserver listen a TCP port"); - -namespace paddle { - -/** - * @brief get ip address from interface name - * - * @param[in] device device interface name - */ -std::string getIpAddr(std::string &device) { - int sock; - struct sockaddr_in sin; - struct ifreq ifr; - - sock = socket(AF_INET, SOCK_DGRAM, 0); - CHECK(sock >= 0) << "Create socket error."; - - strncpy(ifr.ifr_name, device.c_str(), IFNAMSIZ); - ifr.ifr_name[IFNAMSIZ - 1] = 0; - - CHECK_GE(ioctl(sock, SIOCGIFADDR, &ifr), 0); - memcpy(&sin, &ifr.ifr_addr, sizeof(sin)); - close(sock); - return std::string(inet_ntoa(sin.sin_addr)); -} - -/** - * @brief set sock option - * - * @param[in] sockfd sock file descriptor - * - * @note adjust some default sock option for better performance - */ -void setOption(int sockfd) { -#if !defined(__APPLE__) && !defined(__OSX__) - int sendSize = FLAGS_sock_send_buf_size; - int recvSize = FLAGS_sock_recv_buf_size; - CHECK_GE( - setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &recvSize, sizeof(recvSize)), - 0); - CHECK_GE( - setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)), - 0); -#endif - - if (FLAGS_small_messages) { - int optval = 1; - CHECK_GE( - setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)), - 0); -#ifdef TCP_QUICKACK - optval = 1; - CHECK_GE( - setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)), - 0); -#endif - } - int reuse = 1; - CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)), - 0); -} - -/** - * @brief class constructor for SocketServer - * @param[in] addr sock bind address - * @param[in] port sock bind port - * @param[in] rdmaCpu rdma sock bind cpu core - * - * @note start one socket server which hosts parameter server process. - * rdmaCpu is passed to rdma deamon for better performance, and - * start tcp socket instead of rdma socket if rdmaCpu is equal - * to -1. Each trainer process starts one connection to one socket - * server, and use --ports_num to build more connections to harness - * fat communication channel if necessary. - * each connection is controlled by single thread with blocking - * read and write. - */ -SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu) - : port_(port), addr_(addr), stopping_(false) { - if (rdmaCpu == -1) { - tcpRdma_ = F_TCP; - socket_ = 0; - maxPendingConnections_ = FLAGS_sock_listen_queue_size; - } else { - tcpRdma_ = F_RDMA; - rdmaCpu_ = rdmaCpu; - rdmaSocket_ = 0; - - std::stringstream ss; - ss << port; - rdmaUri_ = "rdma://" + addr + ":" + ss.str(); - } - - /// trigger to initialize RDMA lib - CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n"; -} - -SocketServer::~SocketServer() { - stopping_ = true; - /// trigger accept thread to stop - { - SocketClient trigger(addr_.empty() ? "127.0.0.1" : addr_, port_, tcpRdma_); - } - this->join(); -} - -/** - * @brief start one tcp server which hosts parameter server - * - * @note do tcp socket bind and listen. 
it will spawn one thread - * for each connection - */ -void SocketServer::tcpServer() { - int newsockfd; - socklen_t clilen; - struct sockaddr_in serv_addr, cli_addr; - struct hostent *server; - - /// First call to socket() function - socket_ = socket(AF_INET, SOCK_STREAM, 0); - CHECK(socket_ >= 0) << "ERROR opening socket"; - - /// Initialize socket structure - bzero((char *)&serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_port = htons(port_); - if (!addr_.empty()) { - server = gethostbyname(addr_.c_str()); - CHECK(server) << "ERROR, no such host: " << addr_; - bcopy((char *)server->h_addr, - (char *)&serv_addr.sin_addr.s_addr, - server->h_length); - } else { - serv_addr.sin_addr.s_addr = INADDR_ANY; - } - - setOption(socket_); - - /// Now bind the host address using bind() call. - CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) - << "ERROR on binding " << addr_; - - /// Now start listening for the clients, here process will - /// go in sleep mode and will wait for the incoming connection - listen(socket_, maxPendingConnections_); - clilen = sizeof(cli_addr); - - while (true) { - /// Accept actual connection from the client - newsockfd = accept(socket_, (struct sockaddr *)&cli_addr, &clilen); - if (stopping_) { - break; - } - CHECK(newsockfd >= 0) << "ERROR on accept"; - constexpr int kPeerNameLen = 128; - char peerName[kPeerNameLen]; - CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen)); - - SocketWorker *worker = - new SocketWorker(createChannel(newsockfd, std::string(peerName)), this); - worker->start(); - worker->detach(); - } - close(socket_); - LOG(INFO) << "pserver accept thread finish, addr=" << addr_ - << " port=" << port_; -} - -/** - * @brief start one rdma server which hosts parameter server - * - * @note do rdma bind and listen, which calling self-defined socket - * like rdma library. 
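Because the accept loop above blocks inside accept(), the destructor shown earlier in this file deliberately opens a throwaway SocketClient connection to the server's own address: that self-connection wakes accept(), the loop then sees stopping_ == true and breaks, and this->join() can return. Each real trainer connection is handed to its own detached SocketWorker thread (one thread per connection, with blocking reads and writes), and the worker cleans itself up when its run() method finishes, as shown in SocketWorker::run below.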
it will spawn one thread for each connection - */ -void SocketServer::rdmaServer() { - struct sxi_sock *newsock; - - /// First call to socket() function - rdmaSocket_ = rdma::ssocket(rdmaCpu_); - CHECK(rdmaSocket_) << "ERROR opening RDMA socket"; - - CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0) - << "ERROR bind RDMA socket"; - - /// Now start listening for the clients, here process will - /// go in sleep mode and will wait for the incoming connection - CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket"; - - while (true) { - /// Accept actual connection from the client - newsock = rdma::accept(rdmaSocket_); - if (stopping_) { - break; - } - CHECK(newsock) << "ERROR on accept"; - - constexpr int kPeerNameLen = 128; - char peerName[kPeerNameLen]; - - struct sockaddr_in *saddr = rdma::getSourceAddress(newsock); - CHECK(inet_ntop(AF_INET, &saddr->sin_addr, peerName, kPeerNameLen)); - - SocketWorker *worker = - new SocketWorker(createChannel(newsock, std::string(peerName)), this); - worker->start(); - worker->detach(); - } - rdma::close(rdmaSocket_); - LOG(INFO) << "pserver accept thread finish, rdma uri=" << rdmaUri_; -} - -/** - * @brief start a socket server - * - * @note framework for starting socket server - */ -void SocketServer::run() { - if (tcpRdma_ == F_TCP) { - LOG(INFO) << "tcp server start "; - tcpServer(); - } else if (tcpRdma_ == F_RDMA) { - LOG(INFO) << "rdma server start "; - rdmaServer(); - } -} - -/** - * @brief class constructor for rdma client deamons - * - * @note automatically start several client deamons for better performance - */ -std::unique_ptr RdmaClientDaemons::daemons_ = nullptr; -std::once_flag RdmaClientDaemons::initDataFlag_; - -RdmaClientDaemons::RdmaClientDaemons() { - if (FLAGS_rdma_tcp == "rdma") { - rdma::init(); - - struct sxi_socket *socket; - onlineCpus_ = rdma::numCpus(); - for (auto i = 0; i < onlineCpus_; i++) { - socket = rdma::csocket(i); - CHECK(socket) << "ERROR open client socket daemon"; - - rdmaClientSocket_.push_back(socket); - } - LOG(INFO) << "RDMA client daemons started, onlineCpus_:" << onlineCpus_; - /// round robin scheduler for new connection - curCpu_ = 0; - /// wait daemons to start completely. 
- sleep(2); - } -} - -RdmaClientDaemons::~RdmaClientDaemons() { - if (FLAGS_rdma_tcp == "rdma") { - for (auto i = 0; i < onlineCpus_; i++) { - rdma::close(rdmaClientSocket_[i]); - } - LOG(INFO) << "RDMA client daemons is destoryed, onlineCpus_ " - << onlineCpus_; - } -} - -/** - * @brief worker thread main context - * - * @note each connection from client(trainer) is controlled by single worker - * thread, which is for handling all parameter server requests - */ -void SocketWorker::run() { - LOG(INFO) << "worker started, peer = " << channel_->getPeerName(); - - std::vector inputIovs; - - while (true) { - std::unique_ptr msgReader = channel_->readMessage(); - if (!msgReader) { - break; - } - - auto callback = [this](const std::vector &outputIovs) { - channel_->writeMessage(outputIovs); - }; - - server_->handleRequest(std::move(msgReader), callback); - } - - LOG(INFO) << "worker begin to finish, peer = " << channel_->getPeerName(); - delete this; -} - -/** - * @brief start one tcp connection to tcp server - * @param[in] serverAddr tcp server ip - * @param[in] serverPort tcp server port - * - * @note each object contains one channel which accept byte stream - */ -void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { - struct sockaddr_in serv_addr; - struct hostent *server; - - int errRet; // temp for gethostbyname_r - - /// Create a socket point - int sockfd = socket(AF_INET, SOCK_STREAM, 0); - CHECK(sockfd >= 0) << "ERROR opening socket"; - -#if defined(__OSX__) || defined(__APPLE__) - server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); - CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr - << " ret = " << errRet; - CHECK(server) << "getipnodebyname error!"; -#else - struct hostent hostinfo; - char buf[1024]; // temp for gethostbyname_r - CHECK_EQ( - 0, - gethostbyname_r( - serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet)) - << "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "gethostbyname_r error!"; -#endif - - bzero((char *)&serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - bcopy((char *)server->h_addr, - (char *)&serv_addr.sin_addr.s_addr, - server->h_length); - serv_addr.sin_port = htons(serverPort); - - setOption(sockfd); - - /// Now connect to the server - int retry_count = 0; - do { - if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) { - break; - } - - if (errno == ECONNREFUSED) { - LOG(WARNING) << "connection refused by pserver, try again!"; - if (retry_count++ >= 7) { - LOG(FATAL) << "connection refused by pserver, maybe pserver failed!"; - } - std::this_thread::sleep_for(std::chrono::seconds(1)); - } else { - CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" - << serverPort << "errorno: " << errno; - } - } while (errno == ECONNREFUSED); - - channel_.reset(new SocketChannel(sockfd, serverAddr)); - tcpRdma_ = F_TCP; -} - -/** - * @brief start one RDMA connection to rdma server - * @param[in] serverAddr rdma server ip - * @param[in] serverPort rdma server port - * - * @note each object contains one channel which accept byte stream - * for rdma, low level sock also provide byte stream api. 
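The connect loop in SocketClient::TcpClient above tolerates a parameter server that has not finished starting: on ECONNREFUSED it logs a warning, sleeps one second and retries, giving up with LOG(FATAL) only once retry_count reaches 7, i.e. after roughly eight refused attempts; a successful connect() breaks out of the loop immediately.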
- */ -void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { - struct sxi_sock *sock; - - std::stringstream ss; - ss << serverPort; - - std::string rdmaUri = "rdma://" + serverAddr + ":" + ss.str(); - - RdmaClientDaemons *daemons = RdmaClientDaemons::daemons_->get(); - socketDaemon_ = daemons->selectDaemon(); - - /// connect to server with socket daemon - sock = rdma::connect(socketDaemon_, rdmaUri.c_str()); - CHECK(sock) << "ERROR connect to server" << rdmaUri; - - std::vector seg; - str::split(rdmaUri, '/', &seg); - std::string server = seg.at(seg.size() - 1); - channel_.reset(new SocketChannel(sock, server)); - tcpRdma_ = F_RDMA; -} - -/** - * @brief class constructor - * @param[in] serverAddr pserver ip address - * @param[in] serverPort pserver port - * @param[in] ChannelType F_TCP or F_RDMA - * - * @note responsible for building one connection to specified pserver port - */ -SocketClient::SocketClient(const std::string &serverAddr, - int serverPort, - enum ChannelType channelType) { - if (channelType == F_RDMA) - RdmaClient(serverAddr, serverPort); - else - TcpClient(serverAddr, serverPort); -} - -} // namespace paddle diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h deleted file mode 100644 index bcfc9655e989e80e08e9dce9b8734c0643cbf661..0000000000000000000000000000000000000000 --- a/paddle/pserver/LightNetwork.h +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "SocketChannel.h" - -#include -#include -#include -#include - -#include "paddle/utils/Thread.h" - -struct sxi_socket; - -namespace paddle { - -class SocketWorker; - -/** - * @brief class for holding all parameters processing for current port - * - * @note each parameter server inherits from one socket server, each - * server contains serveral woker threads which are to parallelize - * the processing of computation, but share some common datas stored - * in child class of socketserver. - */ -class SocketServer : public Thread { - // rdmaCpu controls the cpu affinity of RDMA server daemon, - // which could benifit performance. rdmaCpu = -1 means TCP - // is used instead of RDMA transport. - public: - SocketServer(const std::string& addr, int port, int rdmaCpu); - ~SocketServer(); - - virtual void run(); - - typedef std::function& outputIovs)> - ResponseCallback; - - protected: - // - // The derived class needs to implement this function - // to handle the request received by SocketWorker - // The request is encapsulated by MsgReader, which contains - // a set of blocks. 
- virtual void handleRequest(std::unique_ptr msgReader, - ResponseCallback callback) = 0; - - std::unique_ptr createChannel(int sock, - const std::string& peerName) { - return std::unique_ptr(new SocketChannel(sock, peerName)); - } - std::unique_ptr createChannel(struct sxi_sock* sock, - const std::string& peerName) { - return std::unique_ptr(new SocketChannel(sock, peerName)); - } - - friend class SocketWorker; - - private: - void rdmaServer(); - void tcpServer(); - - void detach() {} // detach accept thread is forbidden - - protected: - enum ChannelType tcpRdma_; - // for rdma - int rdmaCpu_; - std::string rdmaUri_; - sxi_socket* rdmaSocket_; - // for tcp - int port_; - std::string addr_; - int socket_; - int maxPendingConnections_; - bool stopping_; -}; - -/** - * @brief class for holding one connection from one trainer - * - * @note all parameter processing will run in the context of this worker - */ -class SocketWorker : public Thread { - public: - SocketWorker(std::unique_ptr&& channel, SocketServer* server) - : channel_(std::move(channel)), server_(server) {} - - virtual ~SocketWorker() {} - - virtual void run(); - - protected: - std::unique_ptr channel_; - SocketServer* server_; - enum ChannelType tcpRdma_; -}; - -/** - * @brief class for providing rdma client deamon thread - * - * @note the deamons are required by sock like rdam library. Here - * use singleton model for daemons. Each deamon hosts in - * single cpu core for better load balance performance - */ -class RdmaClientDaemons { - private: - RdmaClientDaemons(); - - static std::unique_ptr daemons_; - - public: - static RdmaClientDaemons* get() { - std::call_once(RdmaClientDaemons::initDataFlag_, - &RdmaClientDaemons::getInstance); - - return daemons_.get(); - } - - struct sxi_socket* selectDaemon() { - int cpu = curCpu_; - curCpu_ = (curCpu_ + 1) % onlineCpus_; - - LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_; - return rdmaClientSocket_[cpu]; - } - - ~RdmaClientDaemons(); - - public: - friend class SocketClient; - - private: - static std::once_flag initDataFlag_; - static void getInstance() { - if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons()); - } - - std::vector rdmaClientSocket_; - std::atomic curCpu_; - int onlineCpus_; -}; - -/** - * @brief management for client connection which are from trainers - * - * @note it contains one channel descriptor which used to write and - * read data - */ -class SocketClient { - public: - SocketClient(const std::string& serverAddr, - int serverPort, - enum ChannelType channelType); - - SocketChannel* getChannel() { return channel_.get(); } - - protected: - std::unique_ptr channel_; - struct sxi_socket* socketDaemon_; - enum ChannelType tcpRdma_; - - private: - void RdmaClient(const std::string& serverAddr, int serverPort); - void TcpClient(const std::string& serverAddr, int serverPort); -}; - -std::string getIpAddr(std::string& device); -void setOption(int sockfd); - -} // namespace paddle diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp deleted file mode 100644 index 43e4902b0f0f73840624041f19ba7f4eb9a45844..0000000000000000000000000000000000000000 --- a/paddle/pserver/ParameterClient2.cpp +++ /dev/null @@ -1,781 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "ParameterClient2.h" -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/StringUtil.h" - -DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers"); -DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); - -namespace paddle { - -template -void copyToRepeatedField(google::protobuf::RepeatedField* dest, - const T2* src, - size_t size) { - dest->Clear(); - dest->Reserve(size); - for (size_t i = 0; i < size; ++i) { - dest->AddAlreadyReserved(src[i]); - } -} - -ParameterClient2::ParameterClient2(bool separate, int port, int numPorts) - : BaseClient(separate, numPorts), port_(port) { -#ifndef PADDLE_DISABLE_TIMER - forwardbackwordTime_ = 0; -#endif -} - -int ParameterClient2::calcParameterBlockSize( - const std::vector& parameters, size_t serviceNum) { - size_t totalSize = 0; - for (auto& para : parameters) { - totalSize += para->getSize(); - } - size_t perServerSize = totalSize / serviceNum; - - int sizeBits = 64 - __builtin_clzl(perServerSize); - - /// 2^10 is min block size - /// 2^7 will be max number of blocks in one pserver - int blockSizeBits = std::max((sizeBits - 7), 10); - return 1 << blockSizeBits; -} - -void ParameterClient2::initThreads() { - threadNum_ = serviceNum_; - if (FLAGS_parallel_thread_num > 1) { - LOG(INFO) << "parallel_thread_num dosent need to set"; - } - syncThreadPool_.reset(new SyncThreadPool(threadNum_)); - startThreads(); -} - -bool ParameterClient2::init(const std::vector& parameters) { - destroy(); - - std::vector hosts; - str::split(FLAGS_pservers, ',', &hosts); - serviceNum_ = hosts.size() * numPorts_; - uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_); - - /// setup prefetch matrix if exists - for (auto& para : parameters) { - /// set block size for each parameter - para->getConfig().set_parameter_block_size( - para->getConfig().sparse_remote_update() ? 
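calcParameterBlockSize above picks a power-of-two block size from the overall model size. Worked example (sizes are illustrative): with 10,000,000 parameter values spread over serviceNum = 4 pservers, perServerSize = 2,500,000; its highest set bit is bit 21, so sizeBits = 64 - __builtin_clzl(perServerSize) = 22, blockSizeBits = max(22 - 7, 10) = 15, and the block size is 2^15 = 32,768 values. That puts each pserver at about 77 blocks, consistent with the comments that 2^10 is the minimum block size and roughly 2^7 blocks per pserver is the intended upper bound. Sparse-remote-update parameters override this and use the row width dims(1) as the block size instead.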
para->getConfig().dims(1) - : denseBlockSize); - } - - for (auto& para : parameters) { - CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized"; - parameterMap_[para->getID()] = para; - } - - allSegments_.reserve(parameters.size()); - - for (auto& para : parameters) { - ParameterSegments segments; - segments.name = para->getName(); - segments.id = para->getID(); - allSegments_.push_back(segments); - if (para->getConfig().sparse_remote_update()) { - CHECK_EQ(para->getConfig().parameter_block_size(), - para->getConfig().dims(1)) - << "For sparse remote update parameter," - << " block size is the width of each row."; - } - } - - /// init clients - clients_.reserve(serviceNum_); - recvDataMems_.resize(serviceNum_); - - for (size_t i = 0; i < hosts.size(); ++i) { - for (int j = 0; j < numPorts_; ++j) { - LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":" - << port_ + j; - if (FLAGS_rdma_tcp == "rdma") { - clients_.emplace_back(hosts[i], port_ + j, F_RDMA); - } else { - clients_.emplace_back(hosts[i], port_ + j, F_TCP); - } - } - } - - sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_)); - - sleep(2); - - initThreads(); - - return true; -} - -ParameterClient2::~ParameterClient2() { destroy(); } - -void ParameterClient2::destroy() { - if (clients_.empty()) { - /// this means not initialized. - return; - } - finishThreads(); - - parameterMap_.clear(); - allSegments_.clear(); - clients_.clear(); -} - -void ParameterClient2::sendParallel(int tid, - size_t numThreads, - ParameterType recvParameterType) { - int numMyClients = divup(serviceNum_ - tid, numThreads); - - for (int j = 0; j < numMyClients; ++j) { - REGISTER_TIMER("client_sendAndRecv_send"); - int i = numThreads * j + tid; - /// Try to make different clients to send data to different pservers - /// at the same time so that they will not flood data to the same - /// pserver. - i = calcClientId(i, serviceNum_); - clients_[i].send("sendParameter", - sendJob_.parallelRequests[i], - sendJob_.parallelInputIovs[i]); - - /// clear large structure - sendJob_.parallelRequests[i].Clear(); - sendJob_.parallelInputIovs[i].clear(); - } - - std::vector bufs; - SendParameterResponse response; - for (int j = 0; j < numMyClients; ++j) { - REGISTER_TIMER("client_sendAndRecv_recv"); - int i = numThreads * j + tid; - i = calcClientId(i, serviceNum_); - auto msgReader = clients_[i].recv(&response); - CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size()); - bufs.clear(); - bufs.reserve(response.blocks_size()); - for (auto& block : response.blocks()) { - auto it = parameterMap_.find(block.para_id()); - CHECK(it != parameterMap_.end()); - Parameter* parameter = it->second.get(); - real* buf = nullptr; - if (parameter->getBuf(recvParameterType)) { - buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos()); - } else { - auto recvMat = dynamic_cast( - parameter->getMat(recvParameterType).get()); - CHECK(recvMat); - size_t width = parameter->getConfig().dims(1); - // TODO(wuyi): need add lock here? may also cause resize. - buf = recvMat->getLocalRow(block.begin_pos() / width); - } - /// sparse_id is not useful while receiving data since sparse data - /// storage is continuous, do commit recieved data as that of dense. 
- bufs.push_back(buf); - } - msgReader->readBlocks(bufs); - } -} - -void ParameterClient2::prepareSendData( - ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& parameterSegments, - int64_t numSamples, - real cost, - bool sendBackParameter, - ParameterType sendBackParameterType, - BatchStatus batchStatus, - SendJob* sendJob) { - sendJob->parallelRequests.resize(serviceNum_); - sendJob->parallelInputIovs.resize(serviceNum_); - - for (auto& request : sendJob->parallelRequests) { -#ifndef PADDLE_DISABLE_TIMER - if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) { - request.set_forwardbackward_time(forwardbackwordTime_); - } -#endif - request.set_trainer_id(trainerId_); - request.set_update_mode(updateMode); - request.set_send_back_parameter(sendBackParameter); - request.set_send_back_parameter_type(sendBackParameterType); - request.set_num_samples(numSamples); - request.set_cost(cost); - request.set_batch_status(batchStatus); - CHECK_EQ(request.blocks_size(), 0); - VLOG(10) << "request: trainer_id: " << request.trainer_id() - << " update_mode" << request.update_mode() - << " send_back_parameter: " << request.send_back_parameter() - << " send_back_parameter_type: " - << request.send_back_parameter_type() - << " num_samples: " << request.num_samples() - << " cost: " << request.cost() - << " batch_status: " << request.batch_status(); - } - for (const auto& segments : parameterSegments) { - const auto it = parameterMap_.find(segments.id); - CHECK(it != parameterMap_.end()); - Parameter* parameter = it->second.get(); - CHECK(parameter != nullptr) << "parameter is nullptr"; - int64_t nameHash = std::hash()(segments.name); - bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM || - updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE || - updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO); - bool sparseUpdate = parameter->getConfig().sparse_remote_update() && - (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT || - updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD || - updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE); - - const auto blockSize = parameter->getConfig().parameter_block_size(); - CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize; - const auto paraSize = parameter->getSize(); - if (sparseUpdate) { - auto prefetchMat = std::dynamic_pointer_cast( - parameter->getMat(PARAMETER_VALUE)); - CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr"; - auto sendMat = dynamic_cast( - parameter->getMat(parameterType).get()); - CHECK(sendMat != nullptr) << "sendMat is nullptr"; - - syncThreadPool_->exec([&](int tid, size_t numThreads) { - std::lock_guard guard(sparseAutoGrowthMutex_); - const auto& localIndices = prefetchMat->getLocalIndices(); - /// num of sparse rows - size_t nLocalBlocks = localIndices.size(); - uint64_t beginDim = 0; - uint64_t endDim = 0; - - // HACK(typhoonzero): let it resize first - prefetchMat->getLocalRow(nLocalBlocks); - sendMat->getLocalRow(nLocalBlocks); - - for (size_t row = 0; row < nLocalBlocks; ++row) { - int64_t blockId = localIndices[row]; // local row -> sparse row - int serverId = std::abs((blockId + nameHash) % serviceNum_); - if (serverId % numThreads != (size_t)tid) { - continue; - } - - beginDim = blockId * blockSize; - endDim = std::min(beginDim + blockSize, paraSize); - - auto& request = sendJob->parallelRequests[serverId]; - ParameterBlock* block = request.add_blocks(); - block->set_para_id(segments.id); - /// global sparse row id - block->set_block_id(blockId); - /// local row offset - 
block->set_begin_pos(row * blockSize); - /// block len - block->set_block_size(endDim - beginDim); - if (sendingPara) { - sendJob->parallelInputIovs[serverId].push_back( - {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); - /// detect sparse parameter distribution - sparseDistribution_->probeDistribution(serverId, - sizeof(real) * blockSize); - } - } - }); - - } else { /// parameter set for dense and sparse - real* buf = - sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr; - uint64_t endDim = 0; - for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) { - endDim = std::min(beginDim + blockSize, paraSize); - int64_t blockId = beginDim / blockSize; - int serverId = std::abs((blockId + nameHash) % serviceNum_); - - auto& request = sendJob->parallelRequests[serverId]; - ParameterBlock* block = request.add_blocks(); - block->set_para_id(segments.id); - block->set_block_id(blockId); - block->set_begin_pos(beginDim); - block->set_block_size(endDim - beginDim); - if (buf) { - sendJob->parallelInputIovs[serverId].push_back( - {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))}); - } - } - } - } // parameterSegments - - sparseDistribution_->checkAndResetDistribution(); -} - -void ParameterClient2::sendAndReceiveParameter( - ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& parameterSegments, - int64_t numSamples, - real cost, - bool sendBackParameter, - ParameterType sendBackParameterType, - ParameterType recvParameterType) { - prepareSendData(updateMode, - parameterType, - parameterSegments, - numSamples, - cost, - sendBackParameter, - sendBackParameterType, - /*batchStatus = */ BATCH_START_AND_FINISH, - &sendJob_); - - syncThreadPool_->exec([&](int tid, size_t numThreads) { - this->sendParallel(tid, numThreads, recvParameterType); - }); -} - -void ParameterClient2::sendParameter( - ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& parameterSegments, - int64_t numSamples, - real cost, - bool sendBackParameter, - BatchStatus batchStatus) { - SendJobPtr sendJob = std::make_shared(); - prepareSendData(updateMode, - parameterType, - parameterSegments, - numSamples, - cost, - sendBackParameter, - PARAMETER_VALUE, - batchStatus, - sendJob.get()); - - for (int i = 0; i < threadNum_; i++) { - sendJobQueue_[i]->enqueue(sendJob); - } -} - -void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); } - -void ParameterClient2::send(int threadId) { - int index = threadId; - LOG(INFO) << "send thread " << threadId << " started"; - int numMyClients = divup(serviceNum_ - index, threadNum_); - while (true) { - SendJobPtr recvJob = sendJobQueue_[index]->dequeue(); - if (stopping_) { - recvJobQueue_[index]->enqueue(recvJob); - break; - } - for (int j = 0; j < numMyClients; ++j) { - REGISTER_TIMER("client_send"); - int i = threadNum_ * j + index; - /// Try to make different clients to send data to different pservers - /// at the same time so that they will not flood data to the same - /// pserver. 
- i = calcClientId(i, serviceNum_); - if (recvJob->parallelRequests.size()) { - clients_[i].send("sendParameter", - recvJob->parallelRequests[i], - recvJob->parallelInputIovs[i]); - } else { - clients_[i].send("sendData", - recvJob->parallelDataRequests[i], - recvJob->parallelInputIovs[i]); - } - } - recvJobQueue_[index]->enqueue(recvJob); - } -} - -void ParameterClient2::recv(int threadId) { - LOG(INFO) << "recv thread " << threadId << " started"; - int index = threadId; - int numMyClients = divup(serviceNum_ - index, threadNum_); - while (true) { - std::vector bufs; - SendParameterResponse response; - SendDataResponse dataResponse; - SendJobPtr recvJob = recvJobQueue_[index]->dequeue(); - if (stopping_) break; - for (int j = 0; j < numMyClients; ++j) { - REGISTER_TIMER("client_recv"); - int i = threadNum_ * j + index; - i = calcClientId(i, serviceNum_); - if (recvJob->parallelRequests.size()) { - auto msgReader = clients_[i].recv(&response); - CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size()); - bufs.clear(); - bufs.reserve(response.blocks_size()); - for (auto& block : response.blocks()) { - auto it = parameterMap_.find(block.para_id()); - CHECK(it != parameterMap_.end()); - Parameter* parameter = it->second.get(); - real* buf = - parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos()); - CHECK_EQ(msgReader->getBlockLength(bufs.size()), - sizeof(real) * (block.block_size())); - bufs.push_back(buf); - } - msgReader->readBlocks(bufs); - } else { - auto msgReader = clients_[i].recv(&dataResponse); - CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size()); - size_t totalLen = msgReader->getTotalLength(); - if (0 == totalLen) { - continue; - } - auto& recvMem = recvDataMems_[dataResponse.server_id()]; - CHECK_EQ(dataResponse.blocks_size(), 1) - << "Only one block currently support now!"; - auto& block = dataResponse.blocks(0); - CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); - recvMem = std::make_shared(totalLen); - msgReader->readNextBlock(recvMem.get()->getBuf()); - } - } - recvSyncBarrier_->wait(); - } -} - -void ParameterClient2::waitPassStart() { - WaitPassStartRequest request; - std::vector responses; - multiCall(__func__, request, &responses); -} - -void ParameterClient2::waitPassFinish() { - WaitPassFinishRequest request; - std::vector responses; - multiCall(__func__, request, &responses); -} - -void ParameterClient2::synchronize(SyncObject syncObjectId) { - SynchronizeRequest request; - request.set_sync_object_id(syncObjectId); - std::vector responses; - multiCall(__func__, request, &responses); -} - -void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) { - SynchronizeRequest request; - request.set_sync_object_id(syncObjectId); - request.set_trainer_id(trainerId_); - std::vector responses; - multiCall(__func__, request, &responses); -} - -void ParameterClient2::setConfig(const OptimizationConfig& optConfig, - const std::string& saveDir, - bool isSparseServer) { - SetConfigRequest request; - std::vector responses; - - for (auto& nameAndPara : parameterMap_) { - *request.add_param_configs() = nameAndPara.second->getConfig(); - } - - *request.mutable_opt_config() = optConfig; - request.set_save_dir(saveDir); - request.set_is_sparse_server(isSparseServer); - - std::vector requests; - requests.resize(clients_.size()); - for (size_t i = 0; i < requests.size(); ++i) { - requests[i].CopyFrom(request); - requests[i].set_server_id(i); - } - - responses.resize(clients_.size()); - size_t numClients = clients_.size(); - for (size_t i = 0; 
i < numClients; ++i) { - clients_[i].send(__func__, requests[i]); - } - for (size_t i = 0; i < numClients; ++i) { - clients_[i].recv(&responses[i]); - } -} - -bool ParameterClient2::inStatus(PServerStatus status) { - GetStatusRequest request; - std::vector responses; - - bool ok = true; - multiCall("getStatus", request, &responses); - for (auto& response : responses) { - if (response.status() != status) { - ok = false; - } - } - - return ok; -} - -void ParameterClient2::setStatus(PServerStatus status) { - SetStatusRequest request; - request.set_status(status); - std::vector responses; - multiCall(__func__, request, &responses); -} - -void ParameterClient2::waitForStatus(PServerStatus status) { - while (!inStatus(status)) { - sleep(1); - } -} - -template -static void validateResponses(const std::vector& responses) { - for (auto& response : responses) { - CHECK(response.return_message().empty()) - << "client" << &response - &responses[0] - << " error:" << response.return_message(); - } -} - -PServerVector ParameterClient2::createVector() { - CreateVectorRequest request; - std::vector responses; - int64_t handle = -1; - - multiCall(__func__, request, &responses); - validateResponses(responses); - - for (auto& response : responses) { - if (handle == -1) { - handle = response.handle(); - } else { - CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client" - << &response - &responses[0] << " " - << handle << " " << response.handle(); - } - } - return PServerVector{handle}; -} - -void ParameterClient2::releaseVector(PServerVector handle) { - ReleaseVectorRequest request; - std::vector responses; - - request.set_handle(handle.handle); - multiCall(__func__, request, &responses); - validateResponses(responses); -} - -PServerMatrix ParameterClient2::createMatrix(int32_t numCols) { - CreateMatrixRequest request; - std::vector responses; - int64_t handle = -1; - - request.set_num_cols(numCols); - multiCall(__func__, request, &responses); - validateResponses(responses); - - for (auto& response : responses) { - if (handle == -1) { - handle = response.handle(); - } else { - CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client" - << &response - &responses[0] << " " - << handle << " " << response.handle(); - } - } - return PServerMatrix{handle}; -} - -void ParameterClient2::releaseMatrix(PServerMatrix handle) { - ReleaseMatrixRequest request; - std::vector responses; - - request.set_handle(handle.handle); - multiCall(__func__, request, &responses); - validateResponses(responses); -} - -void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) { - ProtoVector& pvec = *op->add_vectors(); - size_t dim = vec->getSize(); - pvec.set_dim(dim); - copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize()); -} - -void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) { - ProtoMatrix& pmat = *op->add_matrices(); - pmat.set_num_cols(mat->getWidth()); - pmat.set_num_rows(mat->getHeight()); - copyToRepeatedField( - pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows()); -} - -static inline real addTwo(real a, double b) { return a + b; } - -void ParameterClient2::doOperation(PreparedOperations& ops, - bool waitForGradient, - bool sendBackGradient, - bool releasePass) { - std::vector responses; - ops.request_.set_wait_for_gradient(waitForGradient); - ops.request_.set_send_back_parameter(sendBackGradient); - ops.request_.set_release_pass(releasePass); - multiCall(__func__, ops.request_, &responses); - 
validateResponses(responses); - size_t numPassFinishServers = 0; - - size_t numOps = ops.request_.operations_size(); - for (auto& response : responses) { - numPassFinishServers += response.pass_finish(); - CHECK_EQ(numOps, (size_t)response.results_size()); - for (size_t opId = 0; opId < numOps; ++opId) { - const OperationResult& result = response.results(opId); - std::vector& resultScalars = ops.localResults_[opId].resultScalars; - std::vector& resultVectors = - ops.localResults_[opId].resultVectors; - std::vector& resultMatrices = - ops.localResults_[opId].resultMatrices; - - if (&response == &responses[0]) { - /// Initialize results to zero - - resultScalars.resize(result.scalars_size()); - for (auto p : resultScalars) { - if (!p) continue; - *p = 0; - } - size_t numVectors = result.vectors_size(); - resultVectors.resize(numVectors); - for (size_t i = 0; i < numVectors; ++i) { - if (!resultVectors[i]) continue; - resultVectors[i]->resize(result.vectors(i).dim()); - resultVectors[i]->zeroMem(); - } - size_t numMatrices = result.matrices_size(); - resultMatrices.resize(numMatrices); - for (size_t i = 0; i < numMatrices; ++i) { - if (!resultMatrices[i]) continue; - resultMatrices[i]->resize(result.matrices(i).num_rows(), - result.matrices(i).num_cols()); - resultMatrices[i]->zeroMem(); - } - } - - // aggregate results from each pserver to results - - CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size()); - for (ssize_t i = 0; i < result.scalars_size(); ++i) { - real* rscalar = resultScalars[i]; - if (!rscalar) continue; - *rscalar += result.scalars(i); - } - - CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size()); - for (auto& vec : result.vectors()) { - int i = &vec - &result.vectors(0); - CpuVectorPtr rvec = resultVectors[i]; - if (!rvec) continue; - CHECK_EQ(rvec->getSize(), (size_t)vec.dim()); - std::transform(rvec->getData(), - rvec->getData() + rvec->getSize(), - vec.values().data(), - rvec->getData(), - addTwo); - } - - CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size()); - for (auto& mat : result.matrices()) { - int i = &mat - &result.matrices(0); - CpuMatrixPtr rmat = resultMatrices[i]; - if (!rmat) continue; - CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows()); - CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols()); - - std::transform(rmat->getData(), - rmat->getData() + rmat->getElementCnt(), - mat.values().data(), - rmat->getData(), - addTwo); - } - } - } - passFinish_ = numPassFinishServers == clients_.size(); -} - -real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) { - real result = 0.0; - PreparedOperations ops; - ops.addOperation(PSERVER_OP_utv, u, v)(&result); - doOperation(ops, false, false); - return result; -} - -void ParameterClient2::vectorScale(PServerVector u, real a) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_au, u, a); - doOperation(ops, false, false); -} - -void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_COPY, src, dst); - doOperation(ops, false, false); -} - -void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1); - doOperation(ops, false, false); -} - -void ParameterClient2::vectorAddMultInto(PServerVector u, - PServerVector v, - PServerVector w, - real a) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0); - doOperation(ops, false, false); -} - -void 
ParameterClient2::vectorScaleInto(PServerVector u, - PServerVector v, - real a) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0); - doOperation(ops, false, false); -} - -void ParameterClient2::loadValueVector(const std::string& dirName) { - LoadValueRequest request; - request.set_dir_name(dirName); - std::vector responses; - - multiCall(__func__, request, &responses); - validateResponses(responses); -} - -void ParameterClient2::saveValueVector(const std::string& dirName) { - SaveValueRequest request; - request.set_dir_name(dirName); - std::vector responses; - - multiCall(__func__, request, &responses); - validateResponses(responses); -} - -} // namespace paddle diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h deleted file mode 100644 index c96bb787151a525556c8217629109de201762cff..0000000000000000000000000000000000000000 --- a/paddle/pserver/ParameterClient2.h +++ /dev/null @@ -1,602 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/pserver/BaseClient.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Queue.h" -#include "paddle/utils/Util.h" - -#include "ParameterService.pb.h" - -#include "ProtoServer.h" -#include "SparseParameterDistribution.h" - -DECLARE_int32(parallel_thread_num); - -namespace paddle { - -struct PServerMatrix { - int64_t handle; -}; - -struct PServerVector { - int64_t handle; -}; - -/** - * @brief A class to help to prepare server-side operations. - */ -class PreparedOperations { - protected: - class ResultsAdder; - struct LocalOperationResult; - - public: - /** - * Offers an easy way to prepare operations that will be performed on - * server-side. - * - * Usage: - * @code - * addOperation(optype, arguments...)(results...) - * @endcode - * - * Examples: - * 1. set pserver vector to 1: - * @code - * PServerVector u = parameterClient.createVector(); - * addOperation(PSERVER_OP_RESET, u, (real)1); - * @endcode - * - * 2. Compute inner product of to pserver vectors. - * @code - * PServerVector u = parameterClient.createVector(); - * PServerVector v = parameterClient.createVector(); - * real result; - * addOperation(PSERVER_OP_utv, u, v)(&result) - * @endcode - * - * @param[in] operation The operation that pserver will perform. - * @param[in] args Argument list of the operation - * @return A ResultsAdder object initialized with the last element of - * localResults_. - */ - template - ResultsAdder addOperation(MatrixVectorOperation operation, Args... 
args) { - Operation* op = request_.add_operations(); - op->set_operation(operation); - localResults_.emplace_back(); - addOperationHelper(op, args...); - return ResultsAdder(&localResults_.back()); - } - - protected: - void addOperationHelper(Operation* op) {} - - /** - * @brief Helper function to add an new operation that takes a PServerVector - * as an operand. - */ - void addOperationHelper(Operation* op, PServerVector arg) { - op->add_pvectors(arg.handle); - } - - /** - * @brief Helper function to add an new operation that takes a PServerMatrix - * as an operand. - */ - void addOperationHelper(Operation* op, PServerMatrix arg) { - op->add_pmatrices(arg.handle); - } - - /** - * @brief Helper function to add an new operation that takes a real valued - * scalar as an operand. - */ - void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); } - - /** - * @brief Helper function to add an new operation that takes a CpuVectorPtr - * as an operand. - * @note The array of CpuVectors that arg points to will be copied to - * op's vectors field. - */ - void addOperationHelper(Operation* op, CpuVectorPtr arg); - - /** - * @brief Helper function to add an new operation that takes a CpuMatrixPtr - * as an operand. - * @note The array of CpuMatrixs that arg points to will be copied to - * op's matrices field. - */ - void addOperationHelper(Operation* op, CpuMatrixPtr arg); - - /** - * @brief Helper function to add an new operation and prepare the operands. - * - * @tparam Arg An operand of the operation. - * @tparam Args A list of rest operands of the operation. - * @param op Pointer to an Operation object. - */ - template - void addOperationHelper(Operation* op, Arg arg, Args... args) { - addOperationHelper(op, arg); - addOperationHelper(op, args...); - } - - /** - * @brief ResultsAdder offers easy ways to quickly store operation results. - */ - class ResultsAdder { - public: - explicit ResultsAdder(LocalOperationResult* localResult) - : localResult_(localResult) {} - template - void operator()(Args... args) { - addResult(args...); - } - void addResult() {} - void addResult(real* arg) { localResult_->resultScalars.push_back(arg); } - void AddResult(CpuVectorPtr arg) { - localResult_->resultVectors.push_back(arg); - } - void AddResult(CpuMatrixPtr arg) { - localResult_->resultMatrices.push_back(arg); - } - template - void addResult(Arg arg, Args... args) { - addResult(arg); - addResult(args...); - } - - protected: - LocalOperationResult* localResult_; - }; - - protected: - DoOperationRequest request_; - std::vector inputIovs_; - struct LocalOperationResult { - std::vector resultScalars; - std::vector resultVectors; - std::vector resultMatrices; - }; - std::vector localResults_; - friend class ParameterClient2; -}; - -struct ParameterSegments { - std::string name; // name of the parameter - size_t id; // id of the parameter -}; - -/** - * The client interface for parameter server. ParameterClient2 supports 2 modes - * for managing connections to parameter servers, in the 1st mode one connection - * is shared by 2 threads that are separately responsible for sending and - * recieving activities, in the 2nd mode one connection is owned by only one - * thread, and all the sending and recieving activities run in that single - * thread. - * TODO(yanfei): - * Additional core idea to further optimizate pserver performance is - * to do sync-sgd based parameter level instead of pserver level. 
- * full-parallelization based parameter level for sync-sgd also can - * sense forwardbackward computation layer-by-layer for more deeper layer - * model. - * Firstly, pserver can do full-parallelization on all computation based - * parameter level instead of waiting for all gradients are finished and - * start to send back parameters value immediately if parameter is ready - * instead of waiting for all parameters value are ready - * Secondly, parameter client can write back parameters to GPU instead of - * waiting until all parameters are received to CPU host end. - */ -class ParameterClient2 : public BaseClient { - public: - /** Constructor. - * @param separate True if sending and recieving activities are separated - * into 2 threads, otherwise false. - * @param port Port number that parameter client runs on. - * @param numPorts Number of ports parameter clients occupies, - * numPorts * pserver number is the total number of - * connections the parameter client maintains. - */ - ParameterClient2(bool separate = false, - int port = FLAGS_port, - int numPorts = FLAGS_ports_num); - - ~ParameterClient2(); - - static int calcParameterBlockSize(const std::vector& parameters, - size_t serviceNum); - - public: - bool init(const std::vector& parameters); - - /// service functions - - /** - * @brief Sends the segments in parameter to parameter servers, then receives - * the response from the servers. - * @param[in] updateMode Indicates how parameters should be updated on the - * server side. - * @param[in] parameterType Type of parameter that will be sent. - * @param[in] segments Segments in the parameter that will be sent. - * @param[in] numSamples Number of samples this update is based on. - * @param[in] cost Cost of the batch, will be used to calculate global object - * value. - * @param[in] sendBackParameter True if the updated parameters should be sent - * back, otherwise false. - * @param[in] sendBackParameterType Send back parameter type on pserver, - * PARAMETER_VALUE by default - * @param[in] recvParameterType pserver[sendBackParameterType] will be copy to - * client[recvParameterType] - * @note Only parameterType will be sent. - */ - void sendAndReceiveParameter(ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& segments, - int64_t numSamples, - real cost, - bool sendBackParameter, - ParameterType sendBackParameterType, - ParameterType recvParameterType); - - /** - * @brief Sends all parameters to parameter servers, and receives the response - * from the servers. - */ - void sendAndReceiveParameter( - ParameterUpdateMode updateMode, - ParameterType parameterType, - int64_t numSamples, - real cost, - bool sendBackParameter, - ParameterType sendBackParameterType = PARAMETER_VALUE, - ParameterType recvParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(updateMode, - parameterType, - allSegments_, - numSamples, - cost, - sendBackParameter, - sendBackParameterType, - recvParameterType); - } - - /** - * @brief Sends the segments in parameter to parameter servers. Each - * sendParameter() must be paired with a recvParameter() in the future. - * Only parameterType will be sent. - * - * @param[in] updateMode Indicates how parameters should be updated on the - * server side. - * @param[in] parameterType Type of parameter that will be sent. - * @param[in] segments Segments in the parameter that will be sent. - * @param[in] numSamples Number of samples this update is based on. 
- * @param[in] cost Cost of the batch, will be used to calculate global object - * value. - * @param[in] sendBackParameter True if the updated parameters should be sent - * back, otherwise false. - * @param[in] batchStatus Status of the batch. - * @note This function is non-blocking. This means that parameter should - * not change between this call and recvParameter() - */ - void sendParameter(ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& segments, - int64_t numSamples, - real cost, - bool sendBackParameter, - BatchStatus batchStatus); - - void recvParameter(); - - /** - * Sends all parameters to parameter servers, recvParameter() have to be - * invoked - * afterwards. - * - * @note This function is non-blocking. This means that if parameter should - * not changes between this call and recvParameter() - */ - void sendParameter(ParameterUpdateMode updateMode, - ParameterType parameterType, - int64_t numSamples, - real cost, - bool sendBackParameter, - BatchStatus batchStatus) { - sendParameter(updateMode, - parameterType, - allSegments_, - numSamples, - cost, - sendBackParameter, - batchStatus); - } - - /// Get all parameters from parameter servers - void getParameter(ParameterType recvParameterType = PARAMETER_VALUE, - ParameterType sendBackParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true, // sendBackParameter = true - sendBackParameterType, - recvParameterType); - } - - /// Get parameters by sparse row ids from parameter servers - void getParameterSparse( - ParameterType recvParameterType = PARAMETER_VALUE, - ParameterType sendBackParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true, // sendBackParameter = true - sendBackParameterType, - recvParameterType); - } - - /// Set all parameters on parameter servers using the local parameters - void setParameter() { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - false); // sendBackParameter = false - } - /** - * Set all parameters on parameter servers, values will be zero - * means do not sending local parameters - */ - void setParameterZero() { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - false); // sendBackParameter = false - } - - /** - * @brief Wait until all gradient servers start one pass. - * - * @note This is now only used by the gradient servers for "sgd" - * algorithm. Calling this function means that the calling gradient - * server is ready to start a new pass. - */ - void waitPassStart(); - - /** - * @brief Wait until all gradient servers finish one pass. - * - * @note This is now only used by the gradient servers for "sgd" algorithm. - * Calling this function means that the calling gradient server - * finishes one pass. - */ - void waitPassFinish(); - - /// Wait until all gradient servers call this function. - void synchronize(SyncObject syncObjectId = SYNC_DEFAULT); - - /// Called when async-sgd finish pass. - void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT); - - void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) { - return synchronize(syncObjectId); - } - - /** - * @brief Execute the prepared operations on pservers, fetch the results and - * aggregate results from different pservers. 
- * @param[in] ops Prepared operations that will be executed on pservers. - * @param[in] waitForGradient If true, wait for gradient to be ready before - * starting the operations. - * @param[in] sendBackParameter If true, send back the parameter to clients - * after the operations are finished. - * @param[in] If true, and if all clients call waitPassFinish, signal all - * clients finish the pass. - */ - void doOperation(PreparedOperations& ops, - bool waitForGradient, - bool sendBackParameter, - bool releasePass = true); - - /** - * Set the configuration of pserver, including parameter config and - * optimization config - */ - void setConfig(const OptimizationConfig& optConfig, - const std::string& saveDir = "", - bool isSparseServer = false); - - /// Return true if all pservers are in the given status - bool inStatus(PServerStatus status); - bool isPassFinish() { return passFinish_; } - - /// Set pserver status - void setStatus(PServerStatus status); - - /** - * @brief Wait until all pservers are at status - * @note This function is not suitable for frequent use, - * because it sleeps 1 second each time when condition is satisfied. - */ - void waitForStatus(PServerStatus status); - - /// Create a column vector. The size is the dimension of parameter. - PServerVector createVector(); - - /// Release the PServerVector given handle. - void releaseVector(PServerVector handle); - - /** - * Create a column major matrix. The number of rows is the dimension of - * parameter. The number of columns is specifed by numCols. - */ - PServerMatrix createMatrix(int32_t numCols); - - /// Release the PServerMatrix given handle. - void releaseMatrix(PServerMatrix handle); - - // Some basic algebra functions - /// Calculate the dot product of u and v - real vectorDotProduct(PServerVector u, PServerVector v); - - /// Scale u by a - void vectorScale(PServerVector u, real a); - - /// Copy from src to dest - void vectorCopy(PServerVector src, PServerVector dst); - - /// u += v * a - void vectorAddMult(PServerVector u, PServerVector v, real a); - - /// u = v + w * a - void vectorAddMultInto(PServerVector u, - PServerVector v, - PServerVector w, - real a); - /// u = v * a - void vectorScaleInto(PServerVector u, PServerVector v, real a); - - /// Return pserver parameter value. - PServerVector getPServerParameterValue() { - PServerVector vec; - vec.handle = PARAMETER_VALUE; - return vec; - } - - /// Return pserver parameter gradient. - PServerVector getPServerParameterGradient() { - PServerVector vec; - vec.handle = PARAMETER_GRADIENT; - return vec; - } - - /** - * Tell pservers to load value vector from file. - * - * @param[in] dirName The directory that contains the value vector file. - */ - void loadValueVector(const std::string& dirName); - - /// Tell pservers to save value vector to file. 
- void saveValueVector(const std::string& dirName); - - void setTrainerId(int trainerId) { trainerId_ = trainerId; } - -#ifndef PADDLE_DISABLE_TIMER - void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; } -#endif - - protected: - template - void multiCall(const char* funcName, - const ProtoIn& request, - std::vector* responses) { - responses->resize(clients_.size()); - size_t numClients = clients_.size(); - for (size_t i = 0; i < numClients; ++i) { - clients_[i].send(funcName, request); - } - for (size_t i = 0; i < numClients; ++i) { - clients_[i].recv(&(*responses)[i]); - } - } - - private: - void destroy(); - - /** - * @brief management function for parallelizing send/recv all connections - * to all pservers. it is called under one SyncThreadPool. it - * supports to use N thread to control M connections. the receiving - * actions can be started until all sending action to all connections - * owned by current thread are finished. Different connections - * controlled - * by different threads can transfer data asynchronously. - */ - void sendParallel(int tid, - size_t numThreads, - ParameterType recvParameterType); - /// sending thread routine for asynchronously send data - void send(int threadId); - /// receiving thread routing for asynchronously receive data - void recv(int threadId); - - /** - * @brief main routine to build data for pserver - * - * @note it can prepare different kinds of parameter type data. it can - * be regarded as layer for bridging real parameters data and - * protobuf data for communication. - * TODO(yanfei): - * can abstract additional layer to encode and decode data to/from - * protobuf data. - */ - void prepareSendData( - ParameterUpdateMode updateMode, - ParameterType parameterType, // client send type - const std::vector& parameterSegments, - int64_t numSamples, - real cost, - bool sendBackParameter, - ParameterType sendBackParameterType, // send back type in pserver - BatchStatus batchStatus, - SendJob* sendJob); - - /// start necessary threads for threadPool - void initThreads(); - - protected: - /// start port number of pserver - /// it deduce all ports for dense and sparse with some rules - int port_; - /// identify the trainer id using this client - int trainerId_; - -#ifndef PADDLE_DISABLE_TIMER - uint64_t forwardbackwordTime_; -#endif - std::mutex sparseAutoGrowthMutex_; - - /// map id to parameter used for decoding protobuf data - std::unordered_map parameterMap_; - /// segments for all parameters that needed to sync - std::vector allSegments_; - - /// module for sensing sparse parameters distribution on all pservers - std::unique_ptr sparseDistribution_; - - /// thread pool for parallelizing all connections to pservers - std::unique_ptr syncThreadPool_; - - bool passFinish_; -}; - -} // namespace paddle diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp deleted file mode 100644 index f8814714c29a9776adde6a979a84241f733f65bd..0000000000000000000000000000000000000000 --- a/paddle/pserver/ParameterServer2.cpp +++ /dev/null @@ -1,1401 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ParameterServer2.h" - -#include -#include - -#include "paddle/math/SIMDFunctions.h" -#include "paddle/parameter/AverageOptimizer.h" -#include "paddle/parameter/FirstOrderOptimizer.h" -#include "paddle/parameter/OptimizerFunctions.h" -#include "paddle/parameter/OptimizerWithRegularizer.h" -#include "paddle/parameter/ParameterOptimizer.h" -#include "paddle/parameter/ParameterUpdateFunctions.h" -#include "paddle/parameter/Regularizer.h" -#include "paddle/parameter/ThreadLocalBuffer.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/StringUtil.h" - -DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); -DEFINE_double(async_lagged_ratio_min, - 1.0, - "control config_.async_lagged_grad_discard_ratio() min value"); -DEFINE_double( - async_lagged_ratio_default, - 1.5, - "if async_lagged_grad_discard_ratio is not set in trainer_config.conf" - "use it as defalut value"); - -namespace paddle { - -const std::string ParameterServer2::kRetMsgInvalidMatrixHandle = - "Invalid matrix handle"; -const std::string ParameterServer2::kRetMsgInvalidVectorHandle = - "Invalid vector handle"; -const std::string ParameterServer2::kRetMsgUnknownOperation = - "Unknown operation"; - -ParameterServer2::ParameterServer2(const std::string& addr, - int port, - int rdmaCpu) - : ProtoServer(addr, port, rdmaCpu), - dataSize_(0), - size_(0), - gradientReadyBarrier_(FLAGS_num_gradient_servers + 1), - parameterReadyBarrier_(FLAGS_num_gradient_servers + 1), - passBarrier_(FLAGS_num_gradient_servers + 1), - numPassFinishClients_(0), - allClientPassFinish_(false), - serverId_(-1), - batchId_(-1) { - /** - * register function for remote client calling, these functions - * will be mapped to a data structure for quick looking up. each - * request from trainer can contains one function name to indicate - * remote action. this architecture looks like rpc style for pserver. 
- */ - REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter); - REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData); - REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig); - REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus); - REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus); - REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation); - REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector); - REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector); - REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix); - REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix); - REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart); - REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish); - REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize); - REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass); - REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector); - REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector); - - /// thread pool for parallelizing some computations - if (FLAGS_pserver_num_threads > 1) { - syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false)); - } -} - -bool ParameterServer2::init() { - vectors_.resize(NUM_PARAMETER_TYPES); - configMap_.clear(); - - numSamplesProcessed_ = 0; - cost_ = 0; - char* mpienv = getenv("OMPI_COMM_WORLD_SIZE"); - if (mpienv != NULL) { - mpiSize_ = atoi(mpienv); - } else { - mpiSize_ = 1; - } - status_ = PSERVER_STATUS_NOT_SET; - dataMems_.resize(FLAGS_num_gradient_servers); - synchronizeBarriers_.resize(SyncObject_ARRAYSIZE); - for (auto& barrier : synchronizeBarriers_) { - barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers)); - } - - // initialization for dicarding lagging gradient - asyncUpdateSteps_ = 0; - asyncTrainerSteps_.resize(FLAGS_num_gradient_servers); - asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0); - asyncLaggedGradientsNum_ = 0; - asyncUpdateStat_.resize(static_cast(FLAGS_num_gradient_servers * - FLAGS_async_lagged_ratio_default)); - asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0); - asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers); - asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0); - asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers); - asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0); - - return true; -} - -void ParameterServer2::getStatus(const GetStatusRequest& request, - ProtoResponseCallback callback) { - (void)request; - GetStatusResponse response; - response.set_status(status_); - callback(response); -} - -void ParameterServer2::setStatus(const SetStatusRequest& request, - ProtoResponseCallback callback) { - status_ = request.status(); - SetStatusResponse response; - callback(response); -} - -void ParameterServer2::setConfig(const SetConfigRequest& request, - ProtoResponseCallback callback) { - { - std::lock_guard guard(parameterMutex_); - - serverId_ = request.server_id(); - isSparseServer_ = request.is_sparse_server(); - - if (!request.save_dir().empty()) { - mkDir(request.save_dir().c_str()); - } - - for (const auto& config : request.param_configs()) { - CHECK(!configMap_.count(config.para_id())) - << "Duplicated parameter name: " << config.name(); - configMap_[config.para_id()] = config; - CHECK_EQ(config.sparse_remote_update(), isSparseServer_); - } - - config_ = request.opt_config(); - if (config_.algorithm() == TrainAlgorithm::AsyncSGD) { - auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio(); - if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) 
{ - LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small" - << "reset to default, async_lagged_grad_discard_ratio = " - << FLAGS_async_lagged_ratio_default; - asyncLaggedRatio = FLAGS_async_lagged_ratio_default; - } - asyncLaggedThreshold_ = - static_cast(FLAGS_num_gradient_servers * asyncLaggedRatio); - LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio - << " asyncLaggedhreshold: " << asyncLaggedThreshold_; - } - if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) { - /// sparse server must NOT use local update mode - config_.set_num_batches_per_send_parameter(1); - } - - if (config_.num_batches_per_send_parameter() > 1 && - config_.center_parameter_update_method() == "average") { - /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer - /// if parameter regularization in pserver - for (auto& pair : configMap_) { - ParameterConfig& config = pair.second; - if (config_.num_batches_per_send_parameter() == - config.num_batches_regularization()) { - real scale = - config_.delta_add_rate() * config.num_batches_regularization(); - if (config_.algorithm() == "sgd") { - scale *= FLAGS_num_gradient_servers; - } - config.set_decay_rate(config.decay_rate() * scale); - if (config.decay_rate() > 0.1f) { - LOG(FATAL) << "L2 decay=" << config.decay_rate() - << " for parameter:" << config.name() - << " is too large after scale in pserver!"; - } - config.set_decay_rate_l1(config.decay_rate_l1() * scale); - if (config.decay_rate_l1() > 0.1f) { - LOG(FATAL) << "L1 decay=" << config.decay_rate_l1() - << " for parameter:" << config.name() - << " is too large after scale in pserver!"; - } - - LOG(INFO) << "parameter:" << config.name() - << " decay apply in pserver," - << " L1 decay=" << config.decay_rate_l1() - << " L2 decay=" << config.decay_rate(); - } - } - } - } - - SetConfigResponse response; - callback(response); -} - -real bufferSum(const std::vector& buffers) { - real sum = 0; - for (const auto buffer : buffers) { - for (size_t i = 0; i < buffer.size; ++i) { - sum += buffer.base[i]; - } - } - return sum; -} - -void ParameterServer2::mergeSegments(BlockSegments* segments) { - if (segments->empty()) { - return; - } - std::sort(segments->begin(), segments->end()); - auto curr = segments->begin(); - for (auto it = segments->begin(); it != segments->end(); ++it) { - if (it->first <= curr->second) { - curr->second = std::max(curr->second, it->second); - } else { - ++curr; - *curr = *it; - } - } - ++curr; - segments->erase(curr, segments->end()); -} - -void ParameterServer2::setParameter(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - (void)response; - (void)outputBuffers; - LOG(INFO) << "pserver: setParameter"; - std::lock_guard guard(parameterMutex_); - - int64_t numBlocks = blockIdMap_.size(); - CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size()); - /// total bytes for all the added blocks - int64_t totalSize = size_; - std::vector offsets; - offsets.reserve(request.blocks_size()); - std::vector blockIds; - blockIds.reserve(request.blocks_size()); - int bufferIndex = 0; - - if (!request.blocks().size()) { - LOG(WARNING) - << "--ports_num or --ports_num_for_sparse might be too large, " - << "or total dense parameter size or sparse parameters size " - << "might be too small, this psever doesn't store any parameter."; - return; - } - - for (const auto& block : request.blocks()) { - /// block size for parameter(e.g. 
128 for sparse row, 1K for dense) - uint64_t blockSize = getParameterConfig(block).parameter_block_size(); - BlockKey key(block.para_id(), block.block_id()); - if (inputBuffers.size()) { // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO - Buffer buffer = inputBuffers[bufferIndex]; - ++bufferIndex; - CHECK_EQ(buffer.size, block.block_size()) - << "data size is too big:" - << " block_size=" << block.block_size() - << " data_size=" << buffer.size; - } - - /// add a new block - if (blockIdMap_.count(key) == 0) { - blockOffsetMap_[key] = totalSize; - blockIdMap_[key] = numBlocks; - ++numBlocks; - totalSize += blockSize; - } - offsets.push_back(blockOffsetMap_[key]); - blockIds.push_back(blockIdMap_[key]); - } - - size_ = totalSize; - LOG(INFO) << "pserver: new cpuvector: size=" << size_; - if (!vectors_[PARAMETER_VALUE]) { - /// vectors_ - const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/); - for (const auto type : types) { - vectors_[type].reset(new CpuVector(size_)); - vectors_[type]->zeroMem(); - } - - blockInfos_.resize(numBlocks); - for (auto& info : blockInfos_) { - info.lock.reset(new std::mutex()); - } - } else { - CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize()) - << "Currently adding new blocks is not supported. " - << "All blocks must be added in one setParameter call"; - } - - VectorPtr buf = vectors_[PARAMETER_VALUE]; - usedSegments_.reserve(offsets.size()); - /// if offsets is empty, means parameter_block_size is too big or too many - /// nodes. - if (offsets.empty()) { - LOG(WARNING) << "in setParameter: offsets is empty"; - } - for (size_t i = 0; i < offsets.size(); ++i) { - size_t blockId = blockIds[i]; - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(request.blocks(i)); - info.config = &config; - info.offset = offsets[i]; - info.optimizer.reset(sgdOptimizerCreate( - config_, config, config.sparse_remote_update(), true /*inPserver*/)); - if (config.sparse_remote_update()) { - size_t width = config.dims(1); - CHECK_EQ(config.parameter_block_size(), width) - << "block size: " << config.parameter_block_size() - << "width : " << width; - } - info.optimizer->init(1, info.config); - usedSegments_.push_back(std::make_pair( - offsets[i], offsets[i] + request.blocks(i).block_size())); - } - mergeSegments(&usedSegments_); - - if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) { - /// copy param from trainer - for (size_t i = 0; i < offsets.size(); ++i) { - Buffer buffer = inputBuffers[i]; - real* start = buf->getPoint(offsets[i]); - CHECK_LE(offsets[i] + buffer.size, buf->getSize()); - memcpy(start, buffer.base, sizeof(real) * buffer.size); - } - } else { - CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO); - /// nothing to do, value vector zero mem already - } -} - -void ParameterServer2::addGradient(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - VLOG(1) << "pserver: addGradient"; - - { - ReadLockGuard guard(parameterMutex_); - int bufferIndex = 0; - for (const auto& block : request.blocks()) { - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - int64_t blockId = getBlockId(block); - CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - Buffer buffer = inputBuffers[bufferIndex]; - 
++bufferIndex; - - const real* gradientBuffer = buffer.base; - real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset); - - size_t size = buffer.size; - - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - if (config.sparse_remote_update()) { - CHECK_EQ(size, config.parameter_block_size()); - } else { // dense - CHECK_LE(size, config.parameter_block_size()); - } - std::lock_guard guard(*info.lock); - simd::addTo(gradientSumBuffer, gradientBuffer, size); - } - } - if (request.batch_status() == BATCH_FINISH || - request.batch_status() == BATCH_START_AND_FINISH) { - numSamplesProcessed_ += request.num_samples(); - cost_ += request.cost(); - VLOG(1) << "num samples: " << numSamplesProcessed_ - << ", new cost:" << cost_; - - /// notify doOperation gradient ready - gradientReadyBarrier_.wait(); - - /// wait doOperation finish - parameterReadyBarrier_.wait(); - VLOG(1) << "start send back"; - } -} - -bool ParameterServer2::asyncGrdientCommitCheckAndStat( - const SendParameterRequest& request) { - const auto trainerId = request.trainer_id(); - int64_t trainerSteps = asyncTrainerSteps_[trainerId]; - CHECK_GE(asyncUpdateSteps_, trainerSteps) - << " async update steps overflows " - << " trainer id: " << trainerId - << " async update steps in pserver: " << asyncUpdateSteps_ - << " async update steps in request: " << trainerSteps; - - asyncUpdateSteps_++; - bool commitGradient = true; - - int64_t delta = asyncUpdateSteps_ - trainerSteps; - if (delta >= asyncLaggedThreshold_) { - VLOG(1) << "discard Async Update: " - << " trainer id: " << trainerId - << " pserver steps: " << asyncUpdateSteps_ - << " request steps: " << trainerSteps; - asyncLaggedGradientsNum_++; - commitGradient = false; - } - /// stat on lagged steps, to get total discard distribution - if (static_cast(delta) < asyncUpdateStat_.size()) { - asyncUpdateStat_[delta]++; - } else { - asyncUpdateStat_[asyncUpdateStat_.size() - 1]++; - } - /// stat on trainerId and discard, to get trainer condition - if (commitGradient) { - asyncTrainerCommitStat_[trainerId]++; - } else { - asyncTrainerDiscardStat_[trainerId]++; - } - - return commitGradient; -} - -static ThreadLocal> localBlockBitset_; - -void ParameterServer2::asyncSGD(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - int64_t numBlocks = blockIdMap_.size(); - auto& localBlockBitset = *localBlockBitset_; - - if (isSparseServer_) { - if (localBlockBitset.empty()) { - localBlockBitset.resize(numBlocks); - } - localBlockBitset.assign(numBlocks, false); - } - - ReadLockGuard guard(parameterMutex_); - - if (request.send_back_parameter()) { - outputBuffers->reserve(request.blocks_size()); - } - - bool commitGradient = asyncGrdientCommitCheckAndStat(request); - - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - size_t bufferIndex = 0; - for (const auto& block : request.blocks()) { - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - int64_t blockId = getBlockId(block); - CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - Buffer buffer = inputBuffers[bufferIndex]; - ++bufferIndex; - - size_t size = buffer.size; - - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - - 
std::lock_guard guard(*info.lock); - /// gradients are too obsolete, will be discarded - if (commitGradient) { - info.optimizer->startBatch(numSamplesProcessed_); - - for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); - } - vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size); - info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1); - - if (auto callback = info.optimizer->needSpecialTraversal(config)) { - blockTraverse(info, config, offset, size, vecs, callback); - } - info.optimizer->finishBatch(); - } - - if (commitGradient && isSparseServer_) { - localBlockBitset[blockId] = true; - } - - if (!isSparseServer_ && request.send_back_parameter()) { // dense - int type = request.send_back_parameter_type(); - sendBackParameter(block, type, response, &buffer, outputBuffers); - } - } /// foreach block - - asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_; - - if (commitGradient && isSparseServer_) { - /// find blocks that trainer do not request update - for (int64_t blockId = 0; blockId < numBlocks; ++blockId) { - if (localBlockBitset[blockId]) { - continue; - } - - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = *info.config; - size_t size = config.parameter_block_size(); - - std::lock_guard guard(*info.lock); - info.optimizer->startBatch(numSamplesProcessed_); - if (auto callback = info.optimizer->needSpecialTraversal(config)) { - blockTraverse(info, config, info.offset, size, vecs, callback); - } - info.optimizer->finishBatch(); - } - } - - if (commitGradient && (request.batch_status() == BATCH_FINISH || - request.batch_status() == BATCH_START_AND_FINISH)) { - numSamplesProcessed_ += request.num_samples(); - } - - /// show some performance log if needed - if (request.trainer_id() == 0) { - /// batchId_ is approximately equal to "real batchId_" - batchId_++; - } -} - -void ParameterServer2::getParameter(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - (void)inputBuffers; - LOG(INFO) << "pserver: getParameter"; - ReadLockGuard guard(parameterMutex_); - for (const auto& block : request.blocks()) { - int type = request.send_back_parameter_type(); - sendBackParameter(block, type, response, outputBuffers); - } -} - -void ParameterServer2::getParameterSparse(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers) { - (void)inputBuffers; - auto& buffer = *readWriteBuffer_; - size_t numReals = 0; - for (const auto& block : request.blocks()) { - numReals += getParameterConfig(block).dims(1); - } - buffer.resize(numReals); - - VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; - - ReadLockGuard guard(parameterMutex_); - size_t offset = 0; - for (const auto& block : request.blocks()) { - size_t width = getParameterConfig(block).dims(1); - Buffer buf = {buffer.data() + offset, width}; - int type = request.send_back_parameter_type(); - sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); - offset += width; - } -} - -void ParameterServer2::sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - std::vector* outputBuffers) { - ParameterBlock* returnBlock = response->add_blocks(); - returnBlock->set_para_id(block.para_id()); - returnBlock->set_block_id(block.block_id()); - returnBlock->set_begin_pos(block.begin_pos()); - 
returnBlock->set_block_size(block.block_size()); - - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - real* valueBuffer = vectors_[parameterType]->getPoint(offset); - outputBuffers->push_back({valueBuffer, (size_t)block.block_size()}); -} - -void ParameterServer2::sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - std::vector* outputBuffers) { - ParameterBlock* returnBlock = response->add_blocks(); - returnBlock->set_para_id(block.para_id()); - returnBlock->set_block_id(block.block_id()); - returnBlock->set_begin_pos(block.begin_pos()); - returnBlock->set_block_size(block.block_size()); - - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - size_t size = buffer->size; - real* valueBuffer = vectors_[parameterType]->getPoint(offset); - /// copy to second buffer to avoid to be polluted by other request - memcpy(buffer->base, valueBuffer, sizeof(real) * size); - outputBuffers->push_back({buffer->base, size}); -} - -void ParameterServer2::sendBackParameterSparse( - const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - size_t width, - std::vector* outputBuffers) { - ParameterBlock* returnBlock = response->add_blocks(); - returnBlock->set_para_id(block.para_id()); - returnBlock->set_block_id(block.block_id()); - returnBlock->set_begin_pos(block.begin_pos()); - returnBlock->set_block_size(block.block_size()); - int64_t offset = getBlockOffset(block); - CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); - - real* valueBuffer = vectors_[parameterType]->getPoint(offset); - CHECK_EQ(buffer->size, width); - memcpy(buffer->base, valueBuffer, width * sizeof(real)); - outputBuffers->push_back(*buffer); -} - -void ParameterServer2::readAllBlocks( - MsgReader* msgReader, std::vector* buffers) { - auto& buffer = *readWriteBuffer_; - size_t numBlocks = msgReader->getNumBlocks(); - buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real), - numBlocks); - std::vector bufs(numBlocks); - buffers->clear(); - buffers->reserve(numBlocks); - buffer.resetAlignAlloc(); - for (size_t i = 0; i < numBlocks; ++i) { - size_t len = msgReader->getBlockLength(i); - CHECK_EQ(len % sizeof(real), (size_t)0); - size_t size = len / sizeof(real); - bufs[i] = buffer.nextBlock(size); - buffers->push_back({(real*)bufs[i], size}); - } - msgReader->readBlocks(bufs); -} - -void ParameterServer2::sendParameter(const SendParameterRequest& request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback) { - SendParameterResponse response; - std::vector inputBuffers; - std::vector outputBuffers; - readAllBlocks(msgReader.get(), &inputBuffers); - msgReader.reset(); - - switch (request.update_mode()) { - case PSERVER_UPDATE_MODE_SET_PARAM: - case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: - setParameter(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_GET_PARAM: - getParameter(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: - getParameterSparse(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_ASYNC_SGD: - asyncSGD(request, 
inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_ADD_GRADIENT: - addGradient(request, inputBuffers, &response, &outputBuffers); - break; - case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: - break; - } - switch (request.update_mode()) { - case PSERVER_UPDATE_MODE_ADD_GRADIENT: - (*requestVec_).push_back(request); - (*callbackVec_).push_back(callback); - if (request.batch_status() == BATCH_FINISH || - request.batch_status() == BATCH_START_AND_FINISH) { - for (size_t i = 0; i < (*requestVec_).size(); i++) { - ReadLockGuard guard(parameterMutex_); - SendParameterRequest& request = (*requestVec_)[i]; - SendParameterResponse responseTemp; - - std::vector outputIovs; - if (request.send_back_parameter()) { - CHECK(!isSparseServer_); - std::vector outputBuffersTemp; - for (const auto& block : request.blocks()) { - int type = request.send_back_parameter_type(); - sendBackParameter(block, type, &responseTemp, &outputBuffersTemp); - } - outputIovs.reserve(outputBuffersTemp.size()); - for (auto buffer : outputBuffersTemp) { - outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); - } - } - - ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i]; - callbackTemp(responseTemp, outputIovs); - } - (*requestVec_).clear(); - (*callbackVec_).clear(); - } - break; - case PSERVER_UPDATE_MODE_SET_PARAM: - case PSERVER_UPDATE_MODE_SET_PARAM_ZERO: - case PSERVER_UPDATE_MODE_GET_PARAM: - case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE: - case PSERVER_UPDATE_MODE_ASYNC_SGD: - case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER: - std::vector outputIovs; - outputIovs.reserve(outputBuffers.size()); - for (auto buffer : outputBuffers) { - outputIovs.push_back({buffer.base, buffer.size * sizeof(real)}); - } - callback(response, outputIovs); - break; - } -} - -template -void ParameterServer2::reduceAndSendData(const SendDataRequest& request, - std::unique_ptr& msgReader, - ProtoResponseCallbackEx& callback) { - SendDataResponse response; - response.set_type(request.type()); - response.set_server_id(serverId_); - - auto sendData = reinterpret_cast(dataMems_[0].get()->getBuf()); - size_t rawMemSize = dataMems_[0].get()->getSize(); - CHECK_EQ(rawMemSize % sizeof(Dtype), 0U); - size_t dataMemSize = rawMemSize / sizeof(Dtype); - for (size_t i = 1; i < dataMems_.size(); ++i) { - CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize); - auto data = reinterpret_cast(dataMems_[i].get()->getBuf()); - for (size_t j = 0; j < dataMemSize; ++j) { - sendData[j] += data[j]; - } - } - std::vector outputIovs; - auto block = response.add_blocks(); - outputIovs.push_back({sendData, rawMemSize}); - block->set_total_size(rawMemSize); - block->set_data_size(sizeof(Dtype)); - callback(response, outputIovs); -} - -void ParameterServer2::templateReduceSum(const SendDataRequest& request, - std::unique_ptr& msgReader, - ProtoResponseCallbackEx& callback) { - const auto& block = request.blocks(0); - switch (block.data_type()) { - case TRANS_FLOAT: - reduceAndSendData(request, msgReader, callback); - break; - case TRANS_DOUBLE: - reduceAndSendData(request, msgReader, callback); - break; - case TRANS_INT32: - reduceAndSendData(request, msgReader, callback); - break; - case TRANS_UINT32_T: - reduceAndSendData(request, msgReader, callback); - break; - case TRANS_INT64_T: - reduceAndSendData(request, msgReader, callback); - break; - case TRANS_UINT64_T: - reduceAndSendData(request, msgReader, callback); - break; - default: - LOG(FATAL) << "not supported"; - break; - } -} - -void ParameterServer2::sendData(const SendDataRequest& 
request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback) { - SendDataResponse response; - response.set_type(request.type()); - response.set_server_id(serverId_); - - switch (request.update_mode()) { - case DATA_UPDATE_MODE_SET_OWN: { - CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size())); - size_t totalLen = msgReader->getTotalLength(); - if (totalLen > 0) { - CHECK_EQ(msgReader->getNumBlocks(), 1U) - << "Only one block currently support now!"; - const auto& block = request.blocks(0); - if (0 == dataSize_) { - dataSize_ = block.data_size(); - } else { - CHECK_EQ(dataSize_, block.data_size()); - } - int64_t serverId = request.server_id(); - if (serverId_ < 0) { - serverId_ = serverId; - } else { - CHECK_EQ(serverId_, serverId); - } - int64_t clientId = request.client_id(); - dataMems_[clientId] = std::make_shared(totalLen); - CHECK_EQ(totalLen % sizeof(block.data_size()), 0U); - msgReader->readNextBlock(dataMems_[clientId].get()->getBuf()); - } - msgReader.reset(); - std::vector outputIovs; - callback(response, outputIovs); - break; - } - case DATA_UPDATE_MODE_GET_ALL: { - /// Currently only support DATA_REDUCE_SUM - /// And their Operations are just add - CHECK(DATA_REDUCE_SUM == request.type()); - templateReduceSum(request, msgReader, callback); - break; - } - default: { LOG(FATAL) << "not supported"; } - } -} - -void ParameterServer2::clearUnusedSegments(CpuVector* vec) { - real* data = vec->getData(); - if (usedSegments_.empty()) { - return; - } - memset(data, 0, sizeof(real) * usedSegments_[0].first); - memset(data + usedSegments_.back().second, - 0, - sizeof(real) * (size_ - usedSegments_.back().second)); - size_t n = size_ - usedSegments_.back().second; - - for (size_t i = 1; i < usedSegments_.size(); ++i) { - memset( - data + usedSegments_[i - 1].second, - 0, - sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second)); - n += usedSegments_[i].first - usedSegments_[i - 1].second; - } -} - -void ParameterServer2::parallelExecForEachBlock(ExecFunc func) { - SyncThreadPool::execHelper( - syncThreadPool_.get(), [&](int tid, size_t numThreads) { - int64_t numBlocks = blockIdMap_.size(); - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - for (int64_t blockId = tid; blockId < numBlocks; - blockId += numThreads) { - func(blockId, vecs); - } - }); -} - -void ParameterServer2::blockTraverse( - BlockInfo& info, - const ParameterConfig& config, - int64_t offset, - size_t size, - const VectorPtr vecs[], - const ParameterOptimizer::TraverseCallback& callback) { - /// setup sub bufs - for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); - } - callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU); -} - -void ParameterServer2::op_SGD(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - if (allClientPassFinish_) { - /// when all clients signal pass finished, the update - /// is empty. - return; - } - - { - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - int64_t offset = info.offset; - size_t size = config.parameter_block_size(); - - info.optimizer->startBatch(numSamplesProcessed_); - - for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); - } - info.optimizer->update( - vecs, config, config.sparse_remote_update() ? 
0 : -1LU); - vecs[PARAMETER_GRADIENT]->zeroMem(); - - if (auto callback = info.optimizer->needSpecialTraversal(config)) { - blockTraverse(info, config, offset, size, vecs, callback); - } - info.optimizer->finishBatch(); - }); - } - - batchId_++; -} - -void ParameterServer2::op_start_pass(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - info.optimizer->startPass(); - }); -} - -void ParameterServer2::op_finish_pass(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - size_t size = config.parameter_block_size(); - - /// catch up with - if (auto callback = info.optimizer->startCatchUpWith()) { - blockTraverse(info, config, info.offset, size, vecs, callback); - info.optimizer->finishCatchUpWith(); - } - - /// finish pass - info.optimizer->finishPass(); - }); - batchId_ = 0; -} - -void ParameterServer2::op_apply(const Operation& operation, - OperationResult* result) { - (void)operation; - (void)result; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - int64_t offset = info.offset; - size_t size = config.parameter_block_size(); - - // catch up with - if (auto callback = info.optimizer->startCatchUpWith()) { - blockTraverse(info, config, offset, size, vecs, callback); - info.optimizer->finishCatchUpWith(); - } - - // apply to PARAMETER_APPLY - if (auto callback = info.optimizer->apply()) { - blockTraverse(info, config, offset, size, vecs, callback); - } - }); -} - -void ParameterServer2::op_randomize(const Operation& operation, - OperationResult* result) { - LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_; - - CpuVector& valueVec = *vectors_[PARAMETER_VALUE]; - - parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) { - BlockInfo& info = blockInfos_[blockId]; - const ParameterConfig& config = getParameterConfig(blockId); - size_t size = config.parameter_block_size(); - - vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size); - Parameter::randomize(vecs[PARAMETER_VALUE], config); - }); -} - -void ParameterServer2::loadValueVector(const LoadValueRequest& request, - ProtoResponseCallback callback) { - LoadValueResponse response; - LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_; - - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); - std::string filename = request.dir_name() + buf; - - std::ifstream fs(filename, std::ios_base::binary); - CHECK(fs) << "Fail to open " << filename; - - CpuVector& vec = *vectors_[PARAMETER_VALUE]; - Parameter::Header header; - CHECK(fs.read(reinterpret_cast(&header), sizeof(header))) - << "Fail to read parameters in pserver"; - CHECK(Parameter::isHeaderFormatSupported(header.format)) - << "Incorrect format version: " << header.format; - CHECK_EQ(header.size, (size_t)size_) - << "The size (" << header.size << ") in the file does not match the size " - << "(" << size_ << ") of the pserver: " << serverId_; - CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize " - << header.valueSize; - 
CHECK(fs.read(reinterpret_cast(vec.getData()), - header.size * sizeof(real))); - - callback(response); -} - -void ParameterServer2::saveValueVector(const SaveValueRequest& request, - ProtoResponseCallback callback) { - SaveValueResponse response; - LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_; - - mkDir(request.dir_name().c_str()); - - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "/pserver.%04d", static_cast(serverId_)); - std::string filename = request.dir_name() + buf; - - std::ofstream fs(filename, std::ios_base::binary); - CHECK(fs) << "Fail to open " << filename; - - CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY] - : *vectors_[PARAMETER_VALUE]; - Parameter::Header header; - // TODO(TJ): save param headerFormat_ - header.format = PARAM_FORMAT_ORIGINAL; - header.valueSize = sizeof(real); - header.size = size_; - - CHECK_EQ(header.size, vec.getSize()); - - CHECK(fs.write(reinterpret_cast(&header), sizeof(header))) - << "Fail to write parameter in pserver: " << serverId_; - - CHECK(fs.write(reinterpret_cast(vec.getData()), - header.size * sizeof(real))) - << "Fail to write parameter in pserver: " << serverId_; - - callback(response); -} - -void ParameterServer2::op_RESET(const Operation& operation, - OperationResult* result) { - (void)result; - CpuVector* u = vectors_[operation.pvectors(0)].get(); - u->reset(operation.scalars(0)); - clearUnusedSegments(u); -} - -void ParameterServer2::op_utv(const Operation& operation, - OperationResult* result) { - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - double sum = 0; - for (int64_t i = 0; i < size; ++i) { - sum += (double)u[i] * (double)v[i]; - } - result->add_scalars(sum); -} - -void ParameterServer2::op_au_bv(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - real a = operation.scalars(0); - real b = operation.scalars(1); - for (int64_t i = 0; i < size; ++i) { - v[i] = a * u[i] + b * v[i]; - } -} - -void ParameterServer2::op_COPY(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - for (int64_t i = 0; i < size; ++i) { - v[i] = u[i]; - } -} - -void ParameterServer2::op_au(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - int64_t size = size_; - real a = operation.scalars(0); - for (int64_t i = 0; i < size; ++i) { - u[i] *= a; - } -} - -void ParameterServer2::op_au_bv_cw(const Operation& operation, - OperationResult* result) { - (void)result; - real* u = vectors_[operation.pvectors(0)]->getData(); - real* v = vectors_[operation.pvectors(1)]->getData(); - real* w = vectors_[operation.pvectors(2)]->getData(); - int64_t size = size_; - real a = operation.scalars(0); - real b = operation.scalars(1); - real c = operation.scalars(2); - for (int64_t i = 0; i < size; ++i) { - w[i] = a * u[i] + b * v[i] + c * w[i]; - } -} - -void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation, - OperationResult* result) { - (void)result; - real* dir = vectors_[operation.pvectors(0)]->getData(); - real* grad = vectors_[operation.pvectors(1)]->getData(); - real* x = 
vectors_[operation.pvectors(2)]->getData(); - int64_t size = size_; - real l1weight = operation.scalars(0); - for (int64_t i = 0; i < size; ++i) { - if (x[i] < 0) { - dir[i] = -grad[i] + l1weight; - } else if (x[i] > 0) { - dir[i] = -grad[i] - l1weight; - } else { - if (grad[i] < -l1weight) { - dir[i] = -grad[i] - l1weight; - } else if (grad[i] > l1weight) { - dir[i] = -grad[i] + l1weight; - } else { - dir[i] = 0; - } - } - } -} - -void ParameterServer2::op_fix_dir_signs(const Operation& operation, - OperationResult* result) { - (void)result; - real* dir = vectors_[operation.pvectors(0)]->getData(); - real* steepestDescDir = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - for (int64_t i = 0; i < size; ++i) { - if (dir[i] * steepestDescDir[i] <= 0) { - dir[i] = 0; - } - } -} - -void ParameterServer2::op_fix_omega_signs(const Operation& operation, - OperationResult* result) { - (void)result; - real* x = vectors_[operation.pvectors(0)]->getData(); - real* newx = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - for (int64_t i = 0; i < size; ++i) { - if (x[i] * newx[i] < 0) { - newx[i] = 0; - } - } -} - -void ParameterServer2::op_dir_deriv(const Operation& operation, - OperationResult* result) { - real* dir = vectors_[operation.pvectors(0)]->getData(); - real* grad = vectors_[operation.pvectors(1)]->getData(); - real* x = vectors_[operation.pvectors(2)]->getData(); - int64_t size = size_; - real l1weight = operation.scalars(0); - double sum = 0; - for (int64_t i = 0; i < size; ++i) { - if (dir[i] != 0) { - if (x[i] < 0) { - sum += dir[i] * (grad[i] - l1weight); - } else if (x[i] > 0) { - sum += dir[i] * (grad[i] + l1weight); - } else if (dir[i] < 0) { - sum += dir[i] * (grad[i] - l1weight); - } else if (dir[i] > 0) { - sum += dir[i] * (grad[i] + l1weight); - } - } - } - result->add_scalars(sum); -} - -void ParameterServer2::op_cost(const Operation& operation, - OperationResult* result) { - real* x = vectors_[operation.pvectors(0)]->getData(); - real* newgrad = vectors_[operation.pvectors(1)]->getData(); - int64_t size = size_; - real l1weight = operation.scalars(0); - real l2weight = operation.scalars(1); - double cost_real = cost_ / mpiSize_; - double sum_weight_l1 = 0; - double sum_weight_l2 = 0; - for (int64_t i = 0; i < size; ++i) { - sum_weight_l1 += std::abs(x[i]); - sum_weight_l2 += x[i] * x[i]; - newgrad[i] += 2.0 * l2weight * x[i]; - } - cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2; - result->add_scalars(cost_real); -} - -ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = { - nullptr, // PSERVER_OP_utu = 0; - &ParameterServer2::op_utv, // PSERVER_OP_utv = 1; - &ParameterServer2::op_au, // PSERVER_OP_au = 2; - &ParameterServer2::op_au_bv, // PSERVER_OP_au_bv = 3; - nullptr, // PSERVER_OP_aAx_bu = 4; - &ParameterServer2::op_SGD, // PSERVER_OP_SGD = 5; - &ParameterServer2::op_RESET, // PSERVER_OP_RESET = 6; - &ParameterServer2::op_COPY, // PSERVER_OP_COPY = 7; - &ParameterServer2::op_au_bv_cw, // PSERVER_OP_au_bv_cw = 8; - &ParameterServer2::op_make_steepest_desc_dir, - /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9; - &ParameterServer2::op_fix_dir_signs, // PSERVER_OP_FIX_SIGNS = 10; - &ParameterServer2::op_dir_deriv, // PSERVER_OP_DIR_DERIV = 11; - &ParameterServer2::op_fix_omega_signs, // PSERVER_OP_FIX_OMEGA_SIGNS = 12; - &ParameterServer2::op_cost, // PSERVER_OP_COST = 13 - &ParameterServer2::op_start_pass, // PSERVER_OP_START_PASS = 14 - &ParameterServer2::op_finish_pass, // PSERVER_OP_FINISH_PASS = 15 - 
&ParameterServer2::op_randomize, // PSERVER_OP_RANDOMIZE = 16 - &ParameterServer2::op_apply, // PSERVER_OP_APPLY = 17 -}; - -void ParameterServer2::doOperation(const DoOperationRequest& request, - ProtoResponseCallback callback) { - if (request.wait_for_gradient()) { - /// wait gradient update - gradientReadyBarrier_.wait(); - allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers; - } - - DoOperationResponse response; - response.set_pass_finish(allClientPassFinish_); - - for (const auto& op : request.operations()) { - OperationResult* opResult = response.add_results(); - if (op.operation() >= ARRAYSIZE(opFuncs)) { - LOG(ERROR) << "Unknown operation " << op.operation(); - response.set_return_message(kRetMsgUnknownOperation); - } - OperatorFunction opFunc = opFuncs[op.operation()]; - if (!opFunc) { - LOG(ERROR) << "Operation not implemented: " << op.operation(); - response.set_return_message(kRetMsgUnknownOperation); - } - (this->*opFunc)(op, opResult); - } - - if (request.send_back_parameter()) { - /// clean current cost - cost_ = 0; - - if (allClientPassFinish_ && request.release_pass()) { - /// This signals that all clients finish one pass, so waitPassFinish() - /// will stop waiting. - numPassFinishClients_ = 0; - } - - /// notify addGradient() to send back parameter - parameterReadyBarrier_.wait(); - } - callback(response); -} - -void ParameterServer2::waitPassStart(const WaitPassStartRequest& request, - ProtoResponseCallback callback) { - passBarrier_.wait(); - callback(WaitPassStartResponse()); -} - -void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request, - ProtoResponseCallback callback) { - numPassFinishClients_ += 1; - - while (numPassFinishClients_ != 0) { - /// notify doOperation gradient ready - gradientReadyBarrier_.wait(); - /// wait doOperation finish - parameterReadyBarrier_.wait(); - } - - callback(WaitPassFinishResponse()); -} - -void ParameterServer2::synchronize(const SynchronizeRequest& request, - ProtoResponseCallback callback) { - synchronizeBarriers_[request.sync_object_id()]->wait(); - dataSize_ = 0; - callback(SynchronizeResponse()); -} - -void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, - ProtoResponseCallback callback) { - synchronizeBarriers_[request.sync_object_id()]->wait(); - callback(SynchronizeResponse()); - - if (request.trainer_id() == 0) { - batchId_ = 0; - } -} - -void ParameterServer2::createVector(const CreateVectorRequest& request, - ProtoResponseCallback callback) { - (void)request; - CreateVectorResponse response; - LOG(INFO) << "ParameterServer2::createVector: size=" << size_; - CpuVectorPtr vec = std::make_shared(size_); - int64_t handle = -1; - { - std::lock_guard guard(parameterMutex_); - handle = vectors_.size(); - vectors_.push_back(vec); - } - response.set_handle(handle); - callback(response); -} - -void ParameterServer2::releaseVector(const ReleaseVectorRequest& request, - ProtoResponseCallback callback) { - ReleaseVectorResponse response; - CpuVectorPtr vec; - { - std::lock_guard guard(parameterMutex_); - vec.swap(vectors_[request.handle()]); - } - callback(response); -} - -void ParameterServer2::createMatrix(const CreateMatrixRequest& request, - ProtoResponseCallback callback) { - CreateMatrixResponse response; - /// We need to create column major matrix of size_ * num_cols - /// Matrix is row majoar. Need to tranpose when use it. 
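The comment just above notes that createMatrix hands back a column-major size_ x num_cols matrix that is physically backed by a row-major CpuMatrix of shape num_cols x size_, so callers must transpose their indexing. A minimal sketch of that layout trick under those assumptions — ColMajorView is an invented helper for illustration, not the real CpuMatrix API:

    #include <cstddef>
    #include <vector>

    // Hypothetical column-major view over a row-major (numCols x size) buffer;
    // element (row, col) of the logical size x numCols matrix sits at col * size + row.
    struct ColMajorView {
      std::vector<float> data;  // row-major storage: numCols rows, each of length size
      size_t size;              // logical row count (the parameter dimension)
      size_t numCols;           // logical column count

      ColMajorView(size_t size_, size_t numCols_)
          : data(size_ * numCols_, 0.f), size(size_), numCols(numCols_) {}

      float& at(size_t row, size_t col) { return data[col * size + row]; }
    };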
- CpuMatrixPtr mat = std::make_shared(request.num_cols(), size_); - int64_t handle = -1; - { - std::lock_guard guard(parameterMutex_); - handle = matrices_.size(); - matrices_.push_back(mat); - } - response.set_handle(handle); - callback(response); -} - -void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request, - ProtoResponseCallback callback) { - ReleaseMatrixResponse response; - CpuMatrixPtr mat; - { - std::lock_guard guard(parameterMutex_); - mat.swap(matrices_[request.handle()]); - } - callback(response); -} - -} // namespace paddle diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h deleted file mode 100644 index 0b8ef5c170c01ec8a5d53f01db9888f82ca68eec..0000000000000000000000000000000000000000 --- a/paddle/pserver/ParameterServer2.h +++ /dev/null @@ -1,696 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "paddle/math/Matrix.h" -#include "paddle/math/Vector.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/ParameterOptimizer.h" -#include "paddle/utils/Common.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/ThreadLocal.h" - -#include "ParameterService.pb.h" - -#include "ProtoServer.h" - -DECLARE_int32(port); - -namespace paddle { - -// @TODO(yanfei): -// if armed with high density computation resource per node, pserver could also -// utilize GPU to reduce overhead. if this mechanism is used, it could pipeline -// network receiving and GPU computation to reduce the network overhead even -// further. the pipeline could help to accelerate BIG model training. -// @TODO:(yanfei) -// for cpu and less/low gpu machine, the time exhausted by forward and backward -// could be larger than optimization at pserver. However, if armed with lots of -// gpus per node and if the model size is so large enough that limited cpu -// computation causes big optmization latency, the GPU may be required by -// pserver. - -/** - * Client interface for the parameter server - * - * it implements several rpc API for remote parameter client usage. - * for sync-sgd, client needs one controller thread to build connections - * to all pservers, these controller connections do barriers - * synchronization with these connections used for transfering data. - * each data connection uses block based fine grained synchronization - * to gain better scalability. Merging gradients from different trainers - * are concurrently executed with block units, so that some network - * overhead will be hidden in merging gradient. - * for async-sgd, the difference is that pserver will do optimization - * immediately if the gradients are ready, so that pserver needs to - * prepare separate buffer to store value for sending back to trainer - * to prevent from being polluted. 
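As a rough illustration of the block-level gradient merging described in the comment above — BlockShard and addGradientBlock are invented names, not PaddlePaddle's actual classes — each block owns its own lock and gradient accumulator, so gradients arriving over different trainer connections can be merged concurrently whenever they touch different blocks:

    #include <cstddef>
    #include <memory>
    #include <mutex>
    #include <vector>

    // Toy shard: one mutex and one gradient-sum buffer per parameter block.
    struct BlockShard {
      std::vector<std::unique_ptr<std::mutex>> locks;
      std::vector<std::vector<float>> gradSums;

      BlockShard(size_t numBlocks, size_t blockSize)
          : gradSums(numBlocks, std::vector<float>(blockSize, 0.f)) {
        for (size_t i = 0; i < numBlocks; ++i) {
          locks.emplace_back(new std::mutex());
        }
      }

      // Called concurrently by threads serving different trainer connections;
      // only the touched block is locked, so merges of distinct blocks overlap.
      void addGradientBlock(size_t blockId, const float* grad, size_t len) {
        std::lock_guard<std::mutex> guard(*locks[blockId]);
        for (size_t i = 0; i < len; ++i) {
          gradSums[blockId][i] += grad[i];
        }
      }
    };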
- */ -class ParameterServer2 : public ProtoServer { - protected: - /// parameter_ mutex. - RWLock parameterMutex_; - - typedef std::pair BlockKey; - struct BlockKeyHash { - size_t operator()(const BlockKey& key) const { - return std::hash()(key.first) + key.second; - } - }; - - // TODO(yanfei): - // if index data structure is based on parameters instead of blocks, the - // lookup performance could be better. In addition, the block memory - // access almost exhibits good locality, so index data structure and - // block data structure can be refined further, especially if gpu is used - // for pserver. - /** - * all parameters are stored in CpuVector with a blockMap_ data structure - * to index block data required by requests. - */ - typedef std::unordered_map BlockMap; - /// <(para, block), global offset(byte) in all parameters> - BlockMap blockOffsetMap_; - /// <(para, block), global idx [0, nBlocksInAllParameters]> - BlockMap blockIdMap_; - - std::vector vectors_; - std::vector matrices_; - std::vector dataMems_; - - // TODO(yanfei): - // if storing sparse_remote_update() flag in request instead of - // reading configMap_, and storing config within new block wise - // overview data structure, the config mapping, block mapping - // can be unified in single clean data structure. Use para_id - // to index parameters, use offset to index block within parameter - // and keep two index into single one. - /** - * mapping between parameter and config - * different parameter allows different config, such as decay_rate. - * for each request, it need to read config for adding gradient - * and optmization. - */ - std::unordered_map configMap_; - - /** - * to parallelize the multi-thread and multi-connnection - * computation at pserver, it use block unit to reduce - * the contention for computation, even further use block - * level optimizater control for each block for some special - * reason annotated below. - */ - struct BlockInfo { - const ParameterConfig* config; - std::unique_ptr lock; - /// global offset for all parameters - uint64_t offset; - /** - * - * Async sgd in pserver is very different from sync sgd. - * Each trainer follows startBatch, update*, finishBatch as in - * sync sgd, but all these actions are almost executed by - * multi-core and multi-thread simutaneously, so that async - * sgd optimization is based on block level in reality, then - * per block optimization is necessary indeed. In addition, - * per block optimization is also perfered for performance - * with multithreads. - */ - std::unique_ptr optimizer; - }; - std::vector blockInfos_; - - typedef std::vector> BlockSegments; - /// Because some blocks might not be fully used. We keep a - /// record of which segments are used. - BlockSegments usedSegments_; - - /// record pserver status, all status defined in ParameterService.pb - PServerStatus status_; - /// record all samples processed which could be used by optimizater - std::atomic numSamplesProcessed_; - double cost_; - int mpiSize_; - int dataSize_; - /// configuration for current parameter optimizer - OptimizationConfig config_; - - /** - * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse - * compute. And add some helper method to allocate memory aligned blocks. - * - * @param T type of element. - * @param AlignBytes the memory aligned bytes for allocated blocks. 
- */ - template - class ReadWriteBuffer - : public std::vector> { - public: - static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0, - "Type T must be able to aligned."); - - /** - * @brief IsTLargerThanAlign compiled time calculated constant for is type - * T larger than alignments. - */ - constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes; - - static_assert(std::is_pod::value, "T must be POD type."); - - /** - * @brief if AlignBytes > sizeof(T), then will calcuate how many elements - * can be stored in AlignBytes. - */ - constexpr static size_t AlignElementCount = AlignBytes / sizeof(T); - - static_assert(AlignElementCount == - (AlignElementCount & -AlignElementCount) || - AlignBytes > sizeof(T), - "AlignElementCount should be exp of 2"); - - /** - * @brief Resize Buffer, with block count that will be allocated. Each block - * will be memory aligned in AlignBytes. - * @param size The element count in all blocks. - * @param alignBlockCount The block count that will be allocated. - */ - void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) { - if (IsTLargerThanAlign) { //! So, each elements is memory aligned. - this->resize(size); - } else { - //! at most, we need such elements in buffer to make sure each block is - //! aligned. - this->resize(size + alignBlockCount * (AlignElementCount - 1)); - } - } - - /** - * @brief reset aligned allocate blocks. - */ - void resetAlignAlloc() { this->curOffset_ = 0; } - - /** - * @brief get next aligned block address. - * @param blockSize is the element count in each block. - * @return Aligned block address. - */ - T* nextBlock(size_t blockSize) { - T* r = &this->operator[](curOffset_); - curOffset_ += blockSize; - - if (!IsTLargerThanAlign) { - curOffset_ = - (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1); - } - return r; - } - - private: - size_t curOffset_; - }; - - /// to buffer the data from network for further processing to - /// reduce redundant memory allocation. - ThreadLocal> readWriteBuffer_; - - /// size of the parameter - int64_t size_; - - /// for synchronized training, check details in addGradient() - /// and doOperation() - ThreadBarrier gradientReadyBarrier_; - ThreadBarrier parameterReadyBarrier_; - ThreadBarrier passBarrier_; - ThreadLocal> requestVec_; - ThreadLocal> callbackVec_; - - std::atomic numPassFinishClients_; - bool allClientPassFinish_; - - std::vector> synchronizeBarriers_; - std::atomic serverId_; - - /** - * - * for lagged async gradient gradient commit control in Async Sgd. - * discard lagged gradients from too slow nodes, whose gradients - * exhibits bad quality. - * Algorithm: - * pserver: - * 1. initial asyncUpdaterSteps = 0, asyncTrainerSteps_[N] = 0. - * syncUpdaterSteps means - * the version of parameter value. - * 2. when pull arrives, record asyncUpdateSteps_ into - * syncTrainerSteps_[trainer_id] - * 3. when push arrives, compare asyncUpdateSteps_ with - * syncTrainerSteps_[trainer_id] - * if delta > threshold, discard current gradient, else commit - * gradient. - * 4. reset asyncUpdaterSteps_ and asyncTrainerSteps_[N] when pass - * finished - * Note: - * it can not discard all lag-gradient strictly in some special - * condition. part of gradients could be discarded if - * ConcurrentRemoteParameterUpdater is sed. 
- * this algorithm is implemented in asynSGD() - */ - int64_t asyncLaggedThreshold_; - std::atomic asyncUpdateSteps_; - std::vector asyncTrainerSteps_; - size_t asyncLaggedGradientsNum_; - /// stat all async update - std::vector asyncUpdateStat_; - /// stat per trainer_id - std::vector asyncTrainerDiscardStat_; - /// stat per trainer_id - std::vector asyncTrainerCommitStat_; - - /// only used by controller and other control cmd from trainer number 0 - std::unique_ptr syncThreadPool_; - - /// pserver for sparse remote update parameters - bool isSparseServer_; - - /// barrier performance tuning sync-sgd required - std::atomic batchId_; - - public: - struct Buffer { - real* base; - size_t size; - }; - - protected: - /// async gradient commit control - bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request); - - public: - /// disable default parameter for overloading - /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N) - /// -1 means using TCP transport instead of RDMA - ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1); - - ~ParameterServer2() {} - - static const std::string kRetMsgInvalidMatrixHandle; - static const std::string kRetMsgInvalidVectorHandle; - static const std::string kRetMsgUnknownOperation; - - /// service functions - template - void reduceAndSendData(const SendDataRequest& request, - std::unique_ptr& msgReader, - ProtoResponseCallbackEx& callback); - - void templateReduceSum(const SendDataRequest& request, - std::unique_ptr& msgReader, - ProtoResponseCallbackEx& callback); - - /** - * @brief framework for sending parameters - * - * @note different parameter data type can be sent to pserver. - * in most case, the api is used to send gradients from - * trainer to pserver. - * it also can be used to retrieve parameters from pserver - */ - void sendParameter(const SendParameterRequest& request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback); - - void sendData(const SendDataRequest& request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback); - - /** - * @brief send config to pserver - * - * @note it can help pserver to understand the configuration for - * optimization, - * logging control, duplicated initialization, etc. - */ - void setConfig(const SetConfigRequest& request, - ProtoResponseCallback callback); - - /** - * @brief get status for pserver - * - * @note used to check if parameters are ready at pserver - */ - void getStatus(const GetStatusRequest& request, - ProtoResponseCallback callback); - - /** - * @brief set status for pserver - * - * @note used to check if parameters are ready at pserver, since parameters - * at pserver are initialized by trainer - */ - void setStatus(const SetStatusRequest& request, - ProtoResponseCallback callback); - - /** - * @brief framework for doing some operation at pserver end - * - * @note if sync-sgd is used, controller will calling op_SGD action - * for gradient optimization. - * check avaiable operations in opFuncs[] - */ - void doOperation(const DoOperationRequest& request, - ProtoResponseCallback callback); - - /// Create a column vector. The size is the dimension of parameter - void createVector(const CreateVectorRequest& request, - ProtoResponseCallback callback); - - void releaseVector(const ReleaseVectorRequest& request, - ProtoResponseCallback callback); - - /// Create a column major matrix. The number of rows is the dimension of - /// parameter. The number of columns is specifed by num_cols. 
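The lagged-gradient control documented in the comments earlier in this header boils down to comparing the server's global update counter with the counter value the pushing trainer last observed, and discarding the push once the gap reaches a threshold. A minimal sketch under those assumptions — AsyncCommitPolicy and its members are invented for illustration; the real bookkeeping lives in asyncUpdateSteps_, asyncTrainerSteps_ and asyncLaggedThreshold_:

    #include <cstdint>
    #include <unordered_map>

    // Toy version of the commit/discard rule for stale asynchronous gradients.
    struct AsyncCommitPolicy {
      int64_t updateSteps = 0;                        // server-side version counter
      std::unordered_map<int, int64_t> trainerSteps;  // version each trainer last saw
      int64_t laggedThreshold;

      explicit AsyncCommitPolicy(int64_t threshold) : laggedThreshold(threshold) {}

      // A push from trainer `id` bumps the global counter; the gradient is kept
      // only if that trainer's view is not too far behind.
      bool shouldCommit(int id) {
        int64_t seen = trainerSteps[id];
        ++updateSteps;
        return (updateSteps - seen) < laggedThreshold;
      }

      // After a push is processed, remember the version the trainer is now at.
      void recordProcessed(int id) { trainerSteps[id] = updateSteps; }
    };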
- void createMatrix(const CreateMatrixRequest& request, - ProtoResponseCallback callback); - - void releaseMatrix(const ReleaseMatrixRequest& request, - ProtoResponseCallback callback); - /** - * @brief stateful control for indicationg sync pass start - * - * @note it is valuable for logging and state control, - * especially for sync-sgd control - */ - void waitPassStart(const WaitPassStartRequest& request, - ProtoResponseCallback callback); - - /** - * @brief stateful control for indicationg sync pass end - * - * @note it is valuable for logging and state control, - * especially for sync-sgd control - */ - void waitPassFinish(const WaitPassFinishRequest& request, - ProtoResponseCallback callback); - - /** - * @brief synchronize all distributed trainers - * - * @note it's general api for synchronizing trainer and pserver - */ - void synchronize(const SynchronizeRequest& request, - ProtoResponseCallback callback); - - /** - * @brief stateful control for indicating async pass is finished - * - * @note it is valuable for logging control, state reset, etc. - */ - void asyncFinishPass(const SynchronizeRequest& request, - ProtoResponseCallback callback); - - void loadValueVector(const LoadValueRequest& request, - ProtoResponseCallback callback); - - void saveValueVector(const SaveValueRequest& request, - ProtoResponseCallback callback); - - public: - /** - * @brief initialize parameter server - */ - bool init(); - - /** - * @brief set parameters at pserver - * - * @note do parameter initialization if neccessy. - */ - void setParameter(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers); - - /** - * @brief receive gradients and do optimization for async-sgd - * - * @note this api asynchronizately receives all data from all - * trainers, and immediately do optimization and return - * optimizated value for trainer. - * this above routine are block based atomic updating, - * which means different block could based different stale - * gradient. - * it will discard some lagged gradients by default for - * better convergence. - */ - void asyncSGD(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers); - - /** - * @brief merge gradients from all trainer - * - * @note this api use block based parallelization as fine grained - * parallelization which benifits lock contention and latency - * hidden for communication, also can harness multi-core - * efficiently. - * it also implements the synchronization for sync-sgd - */ - void addGradient(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers); - - /** - * @brief get dense parameters from pserver - * - * @note for some specified condition, trainer will get parameters from - * pservers. - * e.g. - * if all parameters are stored at perver end for big model training - * trainer can use it to retrieve all parameters if necessary. - */ - void getParameter(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers); - - /** - * @brief get sparse value from parameter server - * - * @note with sparse enabled, pservers own all latest value - * while trainer only retrieve value that only are needed. - * e.g. - * trainer will do prefetch action to retrieve necessary latest - * value from pserver for sparse calculation. 
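A small sketch of the sparse retrieval pattern mentioned just above, where the trainer prefetches only the parameter rows it needs and the server packs the requested rows of a row-major (numRows x width) table back to back into one reply buffer. gatherRows is an invented helper for illustration, not the getParameterSparse implementation:

    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Copy only the requested rows of a row-major parameter table into a
    // contiguous reply, mirroring how sparse values are sent row by row.
    std::vector<float> gatherRows(const std::vector<float>& table, size_t width,
                                  const std::vector<size_t>& rowIds) {
      std::vector<float> reply(rowIds.size() * width);
      for (size_t i = 0; i < rowIds.size(); ++i) {
        std::memcpy(reply.data() + i * width,
                    table.data() + rowIds[i] * width,
                    width * sizeof(float));
      }
      return reply;
    }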
- */ - void getParameterSparse(const SendParameterRequest& request, - std::vector& inputBuffers, - SendParameterResponse* response, - std::vector* outputBuffers); - - protected: - void mergeSegments(BlockSegments* segments); - - /// set the unused segments to zero - void clearUnusedSegments(CpuVector* vec); - - // TODO(yanfei): - // if read data and do optimization interleavely block by block, - // the performance could be better for gaining less network congestion. - /// read all data from connection and store it in static pre-allocated buffer - void readAllBlocks(MsgReader* msgReader, - std::vector* buffers); - - const ParameterConfig& getParameterConfig(const ParameterBlock& block) { - CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:" - << block.para_id(); - const auto it = configMap_.find(block.para_id()); - CHECK(it != configMap_.end()) << "can not find parameter id: " - << block.para_id(); - return it->second; - } - - /// it implictly check blockOffsetMap_ while retrieving blockId - const ParameterConfig& getParameterConfig(int64_t blockId) const { - CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size()) - << "block idx out of range, id: " << blockId - << " info size: " << blockInfos_.size(); - return *(blockInfos_[blockId].config); - } - - template - bool isValidVectorHandle(int64_t handle, Response* response) { - if (handle < 0 || (size_t)handle >= vectors_.size()) { - LOG(ERROR) << "Invalid vector handle " << handle; - response->set_return_message(kRetMsgInvalidVectorHandle); - return false; - } - return true; - } - - template - bool isValidMatrixHandle(int64_t handle, Response* response) { - if (handle < 0 || (size_t)handle >= matrices_.size()) { - LOG(ERROR) << "Invalid matrix handle " << handle; - response->set_return_message(kRetMsgInvalidMatrixHandle); - return false; - } - return true; - } - - /** - * @brief get block offset - * - * @note block.begin_dim is added to the block offset. - * return -1 if block cannot be found - */ - int64_t getBlockOffset(const ParameterBlock& block) const { - BlockKey key(block.para_id(), block.block_id()); - auto it = blockOffsetMap_.find(key); - if (it == blockOffsetMap_.end()) { - return -1; - } - return it->second; - } - - /// return -1 if block cannot be found - int64_t getBlockId(const ParameterBlock& block) const { - BlockKey key(block.para_id(), block.block_id()); - auto it = blockIdMap_.find(key); - if (it == blockIdMap_.end()) { - return -1; - } - return it->second; - } - - /** - * @brief prepare data for sending back - * - * @note modify reponse and outputBuffers for sending parameter - * back to client. The buffer for socket sending uses - * vectors_[parameterType] directly - * for dense with sync-sgd - */ - void sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - std::vector* outputBuffers); - - /** - * @brief prepare data for sending back - * - * @note modify response and outputBuffers for sending parameter - * back to client. The buffer for socket sending uses buffer->base - * The parameter values are copied from vectors_[parameterType] - * to buffer->base. 
- * for dense with async-sgd - */ - void sendBackParameter(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - std::vector* outputBuffers); - /** - * @brief prepare data for sending back - * - * @note specified for sparse - */ - void sendBackParameterSparse(const ParameterBlock& block, - int parameterType, - SendParameterResponse* response, - Buffer* buffer, - size_t width, - std::vector* outputBuffers); - - /** - * framework routine for block parallelization - * e.g. - * for optimization on all blocks at pserver end, this routine can facilitize - * the parallelize of do optimization on all blocks with multithreads. - */ - typedef std::function ExecFunc; - void parallelExecForEachBlock(ExecFunc func); - void blockTraverse(BlockInfo& info, - const ParameterConfig& config, - int64_t offset, - size_t size, - const VectorPtr vecs[], - const ParameterOptimizer::TraverseCallback& callback); - - public: - typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation, - OperationResult* result); - - /** - * doOperation will call following operations indirectly - * e.g. - * for sync-sgd control, the controller in remote updater will send op_SGD - * command to pserver, then send sendParameter request to pserver immediately. - * the two function at pserver end will do cooperation to achieve the sync-sgd - * gradient merge and optimization. - * the most following operations are specified for owlqn, all operations are - * under the context of doOperation function - */ - static OperatorFunction opFuncs[]; - - void op_SGD(const Operation& operation, OperationResult* result); - - void op_RESET(const Operation& operation, OperationResult* result); - - void op_utv(const Operation& operation, OperationResult* result); - - void op_au_bv(const Operation& operation, OperationResult* result); - - void op_COPY(const Operation& operation, OperationResult* result); - - void op_au(const Operation& operation, OperationResult* result); - - void op_au_bv_cw(const Operation& operation, OperationResult* result); - - void op_make_steepest_desc_dir(const Operation& operation, - OperationResult* result); - - void op_fix_dir_signs(const Operation& operation, OperationResult* result); - - void op_dir_deriv(const Operation& operation, OperationResult* result); - - void op_fix_omega_signs(const Operation& operation, OperationResult* result); - - void op_cost(const Operation& operation, OperationResult* result); - - void op_start_pass(const Operation& operation, OperationResult* result); - void op_finish_pass(const Operation& operation, OperationResult* result); - - void op_apply(const Operation& operation, OperationResult* result); - - void op_randomize(const Operation& operation, OperationResult* result); - - void op_load(const Operation& operation, OperationResult* result); - void op_save(const Operation& operation, OperationResult* result); -}; - -} // namespace paddle diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h deleted file mode 100644 index 1308d62fb1787f19123fe37d49f8e14039c5a39a..0000000000000000000000000000000000000000 --- a/paddle/pserver/ParameterServerController.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ParameterServer2.h" -#include "ParameterServerConfig.pb.h" -#include "RDMANetwork.h" -#include "paddle/utils/StringUtil.h" - -namespace paddle { - -/** - * @brief ParameterServerController is used for create, init and manage multi - * parameter server instances. The num of the instances is decided by port - * num(the ports number for parameter send) and network devices configured - * by gflags or proto. - */ -class ParameterServerController final { - public: - DISABLE_COPY(ParameterServerController); - - /** - * @brief Ctor, Create a ParameterServerController from ParameterServerConfig. - */ - explicit ParameterServerController(const ParameterServerConfig& config); - - /** - * @brief Dtor. - */ - ~ParameterServerController(); - - /** - * @brief create ParameterServerController from gflags, this is used for - * compatibility with the old usage of configuration by gflags. - */ - static ParameterServerController* createFromGflags(); - - /** - * @brief create ParameterServerController with ParameterServerConfig, remove - * gflags from ParameterServer. Init all ParameterServer2 instances according - * to - * the config. - */ - static ParameterServerController* create(const ParameterServerConfig& config); - - /** - * @brief start all ParameterServer2 instances in this - * ParameterServerController. - */ - void start(); - - /** - * @brief join and wait for all ParameterServer2 instances thread in this - * ParameterServerController. - */ - void wait(); - - private: - std::vector> parameterServers_; -}; - -} // namespace paddle diff --git a/paddle/pserver/RDMANetwork.h b/paddle/pserver/RDMANetwork.h deleted file mode 100644 index 83db6b9df71274c3a8eb3403457877b68f2b6dea..0000000000000000000000000000000000000000 --- a/paddle/pserver/RDMANetwork.h +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifndef PADDLE_DISABLE_RDMA -#include "sxi_sock.h" -#else -#define PROMPT_ERR() LOG(FATAL) << "Paddle is not compiled with rdma" -#endif -#include "paddle/utils/Logging.h" - -#include -struct sxi_sock; -struct sxi_socket; - -#ifndef MAX_VEC_SIZE -// define default MAX_VEC_SIZE -#define MAX_VEC_SIZE (1UL << 16) -#endif - -namespace paddle { -/// Namespace rdma is adaptors for sxi_sock.h. 
Make paddle not depend on it -/// when disable rdma support -namespace rdma { -inline int numCpus() { -#ifndef PADDLE_DISABLE_RDMA - return sxi_num_configured_cpus(); -#else - return 0; -#endif -} - -inline sxi_socket* ssocket(int cpuId) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_ssocket(cpuId); -#else - PROMPT_ERR(); -#endif -} - -inline int listen(sxi_socket* s) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_listen(s); -#else - PROMPT_ERR(); -#endif -} - -inline int bind(sxi_socket* s, const char* str) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_bind(s, str); -#else - PROMPT_ERR(); -#endif -} - -inline sxi_sock* accept(sxi_socket* s) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_accept(s); -#else - PROMPT_ERR(); -#endif -} - -inline sockaddr_in* getSourceAddress(sxi_sock* sock) { -#ifndef PADDLE_DISABLE_RDMA - return reinterpret_cast(&sock->sa); -#else - PROMPT_ERR(); -#endif -} - -inline int close(sxi_socket* sock) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_socket_close(sock); -#else - PROMPT_ERR(); -#endif -} - -inline int close(sxi_sock* sock) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_sock_close(sock); -#else - PROMPT_ERR(); -#endif -} - -inline void init() { -#ifndef PADDLE_DISABLE_RDMA - sxi_module_init(); -#else - PROMPT_ERR(); -#endif -} - -inline sxi_socket* csocket(int cpuId) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_csocket(cpuId); -#else - PROMPT_ERR(); -#endif -} - -inline ssize_t read(sxi_sock* channel, void* data, size_t len) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_read(channel, data, len); -#else - PROMPT_ERR(); -#endif -} - -inline ssize_t write(sxi_sock* channel, void* data, size_t len) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_write(channel, data, len); -#else - PROMPT_ERR(); -#endif -} - -inline ssize_t readv(sxi_sock* channel, iovec* iov, int count) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_readv(channel, iov, count); -#else - PROMPT_ERR(); -#endif -} - -inline ssize_t writev(sxi_sock* channel, iovec* iov, int count) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_writev(channel, iov, count); -#else - PROMPT_ERR(); -#endif -} - -inline sxi_sock* connect(sxi_socket* socket, const char* url) { -#ifndef PADDLE_DISABLE_RDMA - return sxi_connect(socket, url); -#else - PROMPT_ERR(); -#endif -} - -} // namespace rdma -} // namespace paddle diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp deleted file mode 100644 index 72e6943408a1856db214262ff0b0698a2eb89a91..0000000000000000000000000000000000000000 --- a/paddle/pserver/SocketChannel.cpp +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "SocketChannel.h" - -#include -#include -#include -#include -#include -#include -#include "RDMANetwork.h" - -#include "paddle/utils/Util.h" - -namespace paddle { - -/** - * UIO_MAXIOV is documented in writev(2), but only - * declares it on osx/ios if defined(KERNEL) - */ -#ifndef UIO_MAXIOV -#define UIO_MAXIOV 512 -#endif - -SocketChannel::~SocketChannel() { - if (tcpRdma_ == F_TCP) - close(tcpSocket_); - else - rdma::close(rdmaSocket_); - LOG(INFO) << "destory connection in socket channel, peer = " << peerName_; -} - -size_t SocketChannel::read(void* buf, size_t size) { - size_t total = 0; - while (total < size) { - ssize_t len; - if (tcpRdma_ == F_TCP) - len = ::read(tcpSocket_, (char*)buf + total, size - total); - else - len = rdma::read(rdmaSocket_, (char*)buf + total, size - total); - - CHECK(len >= 0) << " peer=" << peerName_; - if (len <= 0) { - return total; - } - total += len; - } - return total; -} - -size_t SocketChannel::write(const void* buf, size_t size) { - size_t total = 0; - while (total < size) { - ssize_t len; - if (tcpRdma_ == F_TCP) - len = ::write(tcpSocket_, (const char*)buf + total, size - total); - else - len = rdma::write(rdmaSocket_, (char*)buf + total, size - total); - - CHECK(len >= 0) << " peer=" << peerName_; - if (len <= 0) { - return total; - } - total += len; - } - return total; -} - -template -static size_t readwritev(IOFunc iofunc, - SocketType socket, - iovec* iovs, - int iovcnt, - int maxiovs, - const std::string& peerName) { - int curIov = 0; - size_t total = 0; - - for (int i = 0; i < iovcnt; ++i) { - total += iovs[i].iov_len; - } - - size_t size = 0; - size_t curIovSizeDone = 0; - - while (size < total) { - ssize_t len = - iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs)); - CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov - << " iovCnt=" << iovcnt - << " iovs[curIov].base=" << iovs[curIov].iov_base - << " iovs[curIov].iov_len=" << iovs[curIov].iov_len; - size += len; - - /// restore iovs[curIov] to the original value - iovs[curIov].iov_base = - (void*)((char*)iovs[curIov].iov_base - curIovSizeDone); - iovs[curIov].iov_len += curIovSizeDone; - - len += curIovSizeDone; - - while (curIov < iovcnt) { - if ((size_t)len < iovs[curIov].iov_len) break; - len -= iovs[curIov].iov_len; - ++curIov; - } - if (curIov < iovcnt) { - curIovSizeDone = len; - iovs[curIov].iov_base = (void*)((char*)iovs[curIov].iov_base + len); - iovs[curIov].iov_len -= len; - } - } - return size; -} - -/// rdma::readv and rdma::writev can take advantage of RDMA blocking offload -/// transfering -size_t SocketChannel::writev(const std::vector& iovs) { - if (tcpRdma_ == F_TCP) - return readwritev(::writev, - tcpSocket_, - const_cast(&iovs[0]), - iovs.size(), - UIO_MAXIOV, - peerName_); - else - return readwritev(rdma::writev, - rdmaSocket_, - const_cast(&iovs[0]), - iovs.size(), - MAX_VEC_SIZE, - peerName_); -} - -size_t SocketChannel::readv(std::vector* iovs) { - if (tcpRdma_ == F_TCP) - return readwritev(::readv, - tcpSocket_, - const_cast(&(*iovs)[0]), - iovs->size(), - UIO_MAXIOV, - peerName_); - else - return readwritev(rdma::readv, - rdmaSocket_, - const_cast(&(*iovs)[0]), - iovs->size(), - MAX_VEC_SIZE, - peerName_); -} - -void SocketChannel::writeMessage(const std::vector& userIovs) { - MessageHeader header; - header.numIovs = userIovs.size(); - - std::vector iovLengths; - iovLengths.reserve(userIovs.size()); - for (auto& iov : userIovs) { - iovLengths.push_back(iov.iov_len); - } - - std::vector iovs; - iovs.reserve(userIovs.size() + 
2); - iovs.push_back({&header, sizeof(header)}); - iovs.push_back({&iovLengths[0], - static_cast(sizeof(iovLengths[0]) * header.numIovs)}); - iovs.insert(iovs.end(), userIovs.begin(), userIovs.end()); - - header.totalLength = 0; - for (auto& iov : iovs) { - header.totalLength += iov.iov_len; - } - - CHECK(writev(iovs) == (size_t)header.totalLength); -} - -std::unique_ptr SocketChannel::readMessage() { - MessageHeader header; - - size_t len = read(&header, sizeof(header)); - if (len == 0) { - return nullptr; - } - - CHECK(len == sizeof(header)); - - std::unique_ptr msgReader(new MsgReader(this, header.numIovs)); - - CHECK_EQ(msgReader->getTotalLength() + sizeof(header) + - msgReader->getNumBlocks() * sizeof(size_t), - (size_t)header.totalLength) - << " totalLength=" << msgReader->getTotalLength() - << " numBlocks=" << msgReader->getNumBlocks(); - return msgReader; -} - -MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks) - : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) { - size_t size = numBlocks * sizeof(blockLengths_[0]); - CHECK(channel_->read(&blockLengths_[0], size) == size); -} - -void MsgReader::readBlocks(const std::vector& bufs) { - CHECK_LE(currentBlockIndex_ + bufs.size(), blockLengths_.size()); - std::vector iovs; - iovs.reserve(bufs.size()); - size_t totalLength = 0; - for (void* buf : bufs) { - iovs.push_back({buf, getNextBlockLength()}); - totalLength += getNextBlockLength(); - ++currentBlockIndex_; - } - - CHECK(channel_->readv(&iovs) == totalLength); -} - -void MsgReader::readNextBlock(void* buf) { - CHECK_LT(currentBlockIndex_, blockLengths_.size()); - CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength()); - ++currentBlockIndex_; -} - -} // namespace paddle diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h deleted file mode 100644 index 8b45ac56090ef82e77514566e7df6b366958655e..0000000000000000000000000000000000000000 --- a/paddle/pserver/SocketChannel.h +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Util.h" - -#include - -#include -#include - -struct sxi_sock; - -namespace paddle { - -class SocketChannel; -enum ChannelType { - F_TCP = 1, - F_RDMA = 2, -}; - -/// reading a set of blocks of data from SocketChannel. 
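(Aside, not part of the patch: for readers skimming the removed SocketChannel code, a minimal sketch of the wire framing that writeMessage/readMessage relied on may help. It assumes the whole message already sits in one contiguous buffer and that block lengths are 64-bit on the wire; the parseMessage/ParsedMessage names are hypothetical and were never part of the deleted API.)

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

// One parsed message: a view over each payload block inside the buffer.
struct ParsedMessage {
  std::vector<std::pair<const char*, std::size_t>> blocks;
};

// Layout assumed from the deleted code: int64 totalLength, int64 numIovs,
// then numIovs 64-bit block lengths, then the block payloads back to back.
inline bool parseMessage(const char* buf, std::size_t len, ParsedMessage* out) {
  const std::size_t headerSize = 2 * sizeof(int64_t);
  if (len < headerSize) return false;
  int64_t totalLength = 0;
  int64_t numIovs = 0;
  std::memcpy(&totalLength, buf, sizeof(totalLength));
  std::memcpy(&numIovs, buf + sizeof(totalLength), sizeof(numIovs));
  if (numIovs < 0 || totalLength < 0 ||
      static_cast<std::size_t>(totalLength) > len) return false;
  const std::size_t lengthsSize =
      static_cast<std::size_t>(numIovs) * sizeof(int64_t);
  if (headerSize + lengthsSize > len) return false;
  const char* lengths = buf + headerSize;
  const char* data = lengths + lengthsSize;
  for (int64_t i = 0; i < numIovs; ++i) {
    int64_t blockLen = 0;
    std::memcpy(&blockLen, lengths + i * sizeof(int64_t), sizeof(blockLen));
    if (blockLen < 0 || data + blockLen > buf + len) return false;
    out->blocks.emplace_back(data, static_cast<std::size_t>(blockLen));
    data += blockLen;
  }
  return true;
}

(Keeping the per-block lengths in a prefix lets the receiver size its buffers before touching the payload, which is exactly what the MsgReader class below did.)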
-class MsgReader { - public: - MsgReader(SocketChannel* channel, size_t numIovs); - ~MsgReader() { - /// ensure all data blocks have been processed - CHECK_EQ(currentBlockIndex_, blockLengths_.size()); - } - /** - * @brief number of remaining parts - */ - size_t getNumBlocks() const { - return blockLengths_.size() - currentBlockIndex_; - } - - /** - * @brief lenght of next block - */ - size_t getNextBlockLength() const { return getBlockLength(0); } - - /** - * @brief get the total length of all the remaining blocks - */ - size_t getTotalLength() const { - size_t total = 0; - for (size_t i = currentBlockIndex_; i < blockLengths_.size(); ++i) { - total += blockLengths_[i]; - } - return total; - } - - /** - * @brief Get the length for block currentBlockIndex + i - */ - size_t getBlockLength(size_t i) const { - return blockLengths_[currentBlockIndex_ + i]; - } - - /** - * @brief read blocks data and store it to buf - */ - void readBlocks(const std::vector& bufs); - void readNextBlock(void* buf); - - protected: - SocketChannel* channel_; - std::vector blockLengths_; - size_t currentBlockIndex_; -}; - -/// APIs for reading and writing byte stream data or naive iov data -/// from the APIs both RDMA and TCP exhibits byte stream style -class SocketChannel { - public: - SocketChannel(int socket, const std::string& peerName) - : tcpSocket_(socket), peerName_(peerName) { - tcpRdma_ = F_TCP; - } - SocketChannel(struct sxi_sock* socket, const std::string& peerName) - : rdmaSocket_(socket), peerName_(peerName) { - tcpRdma_ = F_RDMA; - } - - ~SocketChannel(); - - const std::string& getPeerName() const { return peerName_; } - - /** - * @brief read size bytes. - * - * @note keep reading until getting size bytes or sock is closed - * is closed - */ - size_t read(void* buf, size_t size); - - /** - * @brief write size bytes. - * - * @note keep writing until writing size bytes or sock is closed - */ - size_t write(const void* buf, size_t size); - - /** - * @brief write a set of buffers. - * - * @note keep writing until all buffers are written or sock is closed - */ - size_t writev(const std::vector& iov); - - /** - * @brief read a set of buffers. - * - * @note keep reading until all buffers are full or sock is closed. - */ - size_t readv(std::vector* iov); - - /** - * @brief write a set of buffers. - * - * @note keep writing until all buffers are passed or sock is closed - */ - void writeMessage(const std::vector& iov); - - /// return null to indicate socket is closed - std::unique_ptr readMessage(); - - protected: - struct MessageHeader { - int64_t totalLength; /// include the header - int64_t numIovs; - int64_t iovLengths[0]; - }; - - int tcpSocket_; - struct sxi_sock* rdmaSocket_; - const std::string peerName_; - enum ChannelType tcpRdma_; -}; - -} // namespace paddle diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp deleted file mode 100644 index bb247f389cc26b32ff79d36bdf5c81ba8591dc58..0000000000000000000000000000000000000000 --- a/paddle/pserver/SparseParameterDistribution.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/utils/Logging.h" - -#include "paddle/utils/Flags.h" - -#include "SparseParameterDistribution.h" - -DEFINE_bool(check_sparse_distribution_in_pserver, - false, - "check whether sparse parameter exhibts balanced distribution at " - "all pservers"); -DEFINE_bool(show_check_sparse_distribution_log, - false, - "show logs details for sparse parameter distribution in pserver"); -DEFINE_int32(check_sparse_distribution_batches, - 100, - "run sparse parameter distribution check for N batches"); -DEFINE_double( - check_sparse_distribution_ratio, - 0.6, - "if parameters dispatched to different pservers exhibit unbalanced " - " distribution for check_sparse_distribution_ratio * " - " check_sparse_distribution_batches times, crash program"); -DEFINE_double(check_sparse_distribution_unbalance_degree, - 2.0, - "the ratio of maximum data size and minimun data size for " - "different pserver"); - -namespace paddle { - -SparseParameterDistribution::SparseParameterDistribution(size_t serviceNum) { - totBytes_ = 0; - data_.resize(serviceNum); - - batchPassed_ = 0; - unbalanceCnt_ = 0; -} - -void SparseParameterDistribution::probeDistribution(int serverId, - size_t dataSize) { - if (!FLAGS_check_sparse_distribution_in_pserver || - batchPassed_ > FLAGS_check_sparse_distribution_batches) { - return; - } - - CHECK_LT((size_t)serverId, data_.size()) - << "invalid sparse parameter distribution probe"; - - data_[serverId] += dataSize; - totBytes_ += dataSize; -} - -void SparseParameterDistribution::checkAndResetDistribution() { - if (!FLAGS_check_sparse_distribution_in_pserver || - batchPassed_ >= FLAGS_check_sparse_distribution_batches) { - return; - } - - /// at runtime, prepareSendData is called by many contexts, - /// so need to check if data is avaiable. - if (!totBytes_) { - return; - } - - /// check if distribution is balanced - auto avgSize = totBytes_ / data_.size(); - auto unbalanceDegree = FLAGS_check_sparse_distribution_unbalance_degree; - for (auto& dataSize : data_) { - if (dataSize > unbalanceDegree * avgSize || - dataSize * unbalanceDegree < avgSize) { - unbalanceCnt_++; - break; - } - } - - auto printData = [&]() { - std::stringstream ss; - for (auto& dataSize : data_) { - ss << dataSize * 0.001 << "KB "; - } - ss << std::endl; - LOG(INFO) << ss.str(); - }; - - /// show all sparse data size for different pserver - if (FLAGS_show_check_sparse_distribution_log) { - LOG(INFO) << "sparse distribution:"; - printData(); - } - - totBytes_ = 0; - batchPassed_++; - - if (batchPassed_ == FLAGS_check_sparse_distribution_batches) { - LOG(INFO) << "show last parameter distribution sample:"; - printData(); - LOG(INFO) << "total unbalanced batches: " << unbalanceCnt_ - << " in passed batches: " << batchPassed_; - CHECK_LE((float)unbalanceCnt_ / (float)batchPassed_, - FLAGS_check_sparse_distribution_ratio) - << "unbalanced sparse parameter distribution for different pserver. 
" - << "it could be caused by unbalanced sparse ids distribution, try " - << "to shuffle dimensions in input samples"; - } - - std::fill(data_.begin(), data_.end(), 0); -} -} // namespace paddle diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/pserver/SparseParameterDistribution.h deleted file mode 100644 index e168f36c75e9452fff547f139a67a553cc6b796a..0000000000000000000000000000000000000000 --- a/paddle/pserver/SparseParameterDistribution.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include -#include "paddle/utils/Logging.h" - -namespace paddle { - -/* - * if sparse_remote_updater is used, different ParameterServer could - * be assigned with unbalanced gradients. the parameter value from - * ParameterServer also be not balanced. the distribution of different - * dimensions of sparse ids determines the unbalanced degree of data - * distributed among all ParameterServers. Even distribution will - * benifits cluster efficiency. - * do check the unbalanced degree of gradients at runtime, crash program - * if unbalanced distribution exhibts by default. - */ -class SparseParameterDistribution { - public: - /// serviceNum means the number of ParameterServers - explicit SparseParameterDistribution(size_t serviceNum); - ~SparseParameterDistribution() {} - /// collect data - void probeDistribution(int serverId, size_t data); - void checkAndResetDistribution(); - - private: - std::vector data_; - std::atomic totBytes_; - - /// after some batches, stop to check - int batchPassed_; - - /// stat on unbalanced distribution found - int unbalanceCnt_; -}; -} // namespace paddle diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp deleted file mode 100644 index 206cd17c379f529579c103893cfb492524bc6f8d..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/SocketTest.cpp +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/utils/Util.h" - -#include -#include -#include -#include -#include - -#include - -#include "paddle/math/Vector.h" -#include "paddle/utils/Logging.h" - -struct MessageHeader { - int64_t dataLength; -}; - -class Thread { - public: - void start(); - virtual void run() = 0; - virtual ~Thread() {} - - protected: - std::unique_ptr thread_; -}; - -void Thread::start() { - thread_.reset(new std::thread([this]() { this->run(); })); -} - -class SocketChannel { - public: - explicit SocketChannel(int socket) : socket_(socket) {} - int getSocketFd() const { return socket_; } - uint64_t readAll(void* buf, size_t size); - uint64_t writeAll(const void* buf, size_t size); - - protected: - int socket_; -}; - -uint64_t SocketChannel::readAll(void* buf, size_t size) { - uint64_t total = 0; - while (total < size) { - int64_t len = read(socket_, (char*)buf + total, size - total); - if (len <= 0) { - return total; - } - total += len; - } - return total; -} - -uint64_t SocketChannel::writeAll(const void* buf, size_t size) { - uint64_t total = 0; - while (total < size) { - int64_t len = write(socket_, (const char*)buf + total, size - total); - if (len <= 0) { - return total; - } - total += len; - } - return total; -} - -class SocketWorker : public Thread { - public: - explicit SocketWorker(int socket) : channel_(socket) {} - virtual void run(); - - // read n bytes. - int64_t readAll(char* buf, size_t n); - - // write n bytes - - protected: - SocketChannel channel_; - std::string buffer_; -}; - -class SocketServer : public Thread { - public: - explicit SocketServer(int port) - : port_(port), socket_(0), maxPendingConnections_(100) {} - - virtual void run(); - - protected: - int port_; - int socket_; - int maxPendingConnections_; -}; - -void SocketServer::run() { - int newsockfd; - socklen_t clilen; - struct sockaddr_in serv_addr, cli_addr; - - /* First call to socket() function */ - socket_ = socket(AF_INET, SOCK_STREAM, 0); - CHECK(socket_ >= 0) << "ERROR opening socket"; - - /* Initialize socket structure */ - bzero((char*)&serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = INADDR_ANY; - serv_addr.sin_port = htons(port_); - - /* Now bind the host address using bind() call.*/ - CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) - << "ERROR on binding"; - - /* Now start listening for the clients, here process will - * go in sleep mode and will wait for the incoming connection - */ - listen(socket_, maxPendingConnections_); - clilen = sizeof(cli_addr); - - while (true) { - /* Accept actual connection from the client */ - newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen); - CHECK(newsockfd >= 0) << "ERROR on accept"; - - SocketWorker* worker = new SocketWorker(newsockfd); - worker->start(); - } -} - -void SocketWorker::run() { - MessageHeader header; - - while (true) { - int64_t n = channel_.readAll(&header, sizeof(header)); - CHECK(n == sizeof(header)) << "ERROR reading from socket"; - - buffer_.resize(header.dataLength); - n = channel_.readAll(&buffer_[0], header.dataLength); - CHECK(n == header.dataLength) << "ERROR reading from socket"; - - /* Write a response to the client */ - n = channel_.writeAll(&header, sizeof(header)); - CHECK(n == sizeof(header)) << "ERROR reading from socket"; - n = channel_.writeAll(buffer_.data(), buffer_.size()); - CHECK(n == header.dataLength) << "ERROR writing to socket"; - } -} - -class SocketClient { - public: - SocketClient(const std::string& serverAddr, int serverPort); - 
SocketChannel* getChannel() const { return channel_.get(); } - - protected: - std::unique_ptr channel_; -}; - -SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { - struct sockaddr_in serv_addr; - struct hostent* server; - - // char buffer[256]; - - /* Create a socket point */ - int sockfd = socket(AF_INET, SOCK_STREAM, 0); - CHECK(sockfd >= 0) << "ERROR opening socket"; - server = gethostbyname(serverAddr.c_str()); - CHECK(server) << "ERROR, no such host: " << serverAddr; - - bzero((char*)&serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - bcopy((char*)server->h_addr, - (char*)&serv_addr.sin_addr.s_addr, - server->h_length); - serv_addr.sin_port = htons(serverPort); - - /* Now connect to the server */ - CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0) - << "ERROR connecting"; - - channel_.reset(new SocketChannel(sockfd)); -} - -DEFINE_string(server_addr, "127.0.0.1", "Server address"); -DEFINE_int64(dim, 10000000, "Data size"); -DEFINE_int32(loop_time, 100000, "test loop time"); - -using namespace paddle; // NOLINT - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - SocketServer server(FLAGS_port); - server.start(); - sleep(1); - - SocketClient client(FLAGS_server_addr, FLAGS_port); - - SocketChannel* channel = client.getChannel(); - - MessageHeader header; - - uint64_t dataSize = FLAGS_dim * sizeof(real); - -#ifdef PADDLE_WITH_CUDA - GpuVector gpuParam(FLAGS_dim); - GpuVector gpuGrad(FLAGS_dim); -#else - CpuVector gpuParam(FLAGS_dim); - CpuVector gpuGrad(FLAGS_dim); -#endif - CpuVector cpuParam(FLAGS_dim); - CpuVector cpuGrad(FLAGS_dim); - - gpuParam.rand(); - gpuGrad.rand(); - cpuParam.rand(); - cpuGrad.rand(); - - for (int i = 0; i < FLAGS_loop_time; ++i) { - cpuGrad.copyFrom(gpuGrad); - - header.dataLength = dataSize; - CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header)) - << "Client write header error"; - - CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize) - << "Client write data error"; - - /* Now read server response */ - CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header)) - << "Client read header error"; - - CHECK_EQ((uint64_t)header.dataLength, dataSize); - CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize) - << "Client read data error"; - - gpuParam.copyFrom(cpuParam); - - LOG_EVERY_N(INFO, 100) << "i=" << i; - } - exit(0); -} diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp deleted file mode 100644 index 01d179258dffaf996a57022801ee3bd60a268f77..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/test_ParameterServer2.cpp +++ /dev/null @@ -1,624 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -DECLARE_int32(num_gradient_servers); -DEFINE_string(server_addr, "127.0.0.1", "assign server address"); -DEFINE_int32(server_cpu, 0, "assign server cpu"); - -class ParameterServer2Tester : public ParameterServer2 { - public: - ParameterServer2Tester(std::string serverAddr, - int port, - int rdmaCpu = -1, - bool sepSendAndRecv = false) - : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {} - virtual ~ParameterServer2Tester() {} - void setup() { - CHECK(ParameterServer2::init()); - - parameters_.clear(); - clientConfigs_.clear(); - - clientConfigs_.resize(2); - { - ParameterConfig& config = clientConfigs_[0]; - config.set_name("para0"); - config.set_para_id(0); - config.set_size(10000); - config.set_device(-1); - config.set_learning_rate(1.0); - config.set_momentum(0.9); - } - - { - ParameterConfig& config = clientConfigs_[1]; - config.set_name("para1"); - config.set_para_id(1); - config.set_size(5000); - config.set_device(-1); - config.set_learning_rate(0.5); - config.set_momentum(0.4); - } - - for (auto& config : clientConfigs_) { - parameters_.emplace_back(new Parameter(config, /* useGpu= */ false)); - } - - size_t id = 0; - for (auto& para : parameters_) { - para->setID(id++); - } - - CHECK(client_.init(parameters_)); - OptimizationConfig optConfig; - optConfig.set_algorithm("async_sgd"); - optConfig.set_batch_size(100); - optConfig.set_learning_rate(0.1); - client_.setConfig(optConfig); - client_.setParameter(); - } - - void setConfigTest(); - void setStatusTest(); - void sendParameterTest(); - void sendDataTest(SendDataType type, size_t size); - void operationTest(); - void mergeBlockSegmentTest(); - void checkSegments(const BlockSegments& expected, const BlockSegments& segs); - void waitPassFinishTest(); - void synchronizeTest(); - - protected: - ParameterClient2 client_; - vector clientConfigs_; - vector parameters_; -}; - -std::unique_ptr g_server; - -void ParameterServer2Tester::setConfigTest() { - setup(); - - for (auto& config : clientConfigs_) { - auto it = configMap_.find(config.para_id()); - EXPECT_TRUE(it != configMap_.end()); - auto& serverConfig = it->second; - EXPECT_EQ(config.name(), serverConfig.name()); - EXPECT_EQ(config.size(), serverConfig.size()); - EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate()); - EXPECT_EQ(config.momentum(), serverConfig.momentum()); - } -} - -void ParameterServer2Tester::setStatusTest() { - setup(); - EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET)); - client_.setStatus(PSERVER_STATUS_PARAMETER_READY); - EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_); - EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY)); -} - -real sumVector(const CpuVector& vec) { - const real* data = vec.getData(); - size_t dim = vec.getSize(); - real sum = 0; - for (size_t i = 0; i < dim; ++i) { - sum += data[i]; - } - return sum; -} - -void ParameterServer2Tester::sendParameterTest() { - setup(); - - client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - false); // sendBackParameter = false - - vector parameterCopies; - - for (auto& parameter : parameters_) { - parameterCopies.emplace_back( - new Parameter(parameter->getConfig(), /* useGpu= */ false)); - parameterCopies.back() - ->getBuf(PARAMETER_VALUE) - ->copyFrom(*parameter->getBuf(PARAMETER_VALUE)); - } - - client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, - 
PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = true - - for (size_t i = 0; i != parameters_.size(); ++i) { - real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData(); - real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData(); - EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize()); - size_t size = parameters_[i]->getSize(); - real sum1 = 0, sum2 = 0; - for (size_t j = 0; j < size; ++j) { - sum1 += v1[j]; - sum2 += v2[j]; - } - EXPECT_EQ(sum1, sum2); - } -} - -void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) { - ParameterClient2 client1(true); - client1.init(parameters_); - ParameterClient2 client2(true); - client2.init(parameters_); - ParameterClient2 client3(true); - client3.init(parameters_); - - ThreadWorker worker1; - ThreadWorker worker2; - ThreadWorker worker3; - - double* testData1 = new double[size]; - double* testData2 = new double[size]; - double* testData3 = new double[size]; - double* getDataExpect = new double[size]; - double* getDataReal = new double[size]; - for (size_t i = 0; i < size; ++i) { - testData1[i] = rand(); // NOLINT TODO(yuyang18): Use rand_r instead. - testData2[i] = rand(); // NOLINT - testData3[i] = rand(); // NOLINT - getDataExpect[i] = testData1[i] + testData2[i] + testData3[i]; - } - - auto put1 = [&]() { - LOG(INFO) << "putOwnData1 start"; - client1.putOwnData(0, type, testData1, size); - LOG(INFO) << "putOwnData1 finish"; - }; - - auto get1 = [&]() { - LOG(INFO) << "sendData1 get all start"; - client1.getAllData(0, type, getDataReal, size); - for (size_t i = 0; i < size; ++i) { - CHECK_EQ(getDataReal[i], getDataExpect[i]); - } - LOG(INFO) << "sendData1 get all finish"; - }; - - auto put2 = [&]() { - LOG(INFO) << "putOwnData2 start"; - client2.putOwnData(1, type, testData2, size); - LOG(INFO) << "putOwnData2 finish"; - }; - - auto put3 = [&]() { - LOG(INFO) << "putOwnData3 start"; - client3.putOwnData(2, type, testData3, size); - LOG(INFO) << "putOwnData3 finish"; - }; - - worker1.addJob(put1); - worker1.addJob(get1); - worker2.addJob(put2); - worker3.addJob(put3); - - worker1.addJob(put1); - worker2.addJob(put2); - worker3.addJob(put3); - worker1.addJob(get1); - - worker1.wait(); - worker2.wait(); - worker3.wait(); - free(testData1); - free(testData2); - free(testData3); - free(getDataExpect); - free(getDataReal); -} - -void ParameterServer2Tester::operationTest() { - PServerVector v1, v2; - v1 = client_.createVector(); - EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle); - - v2 = client_.createVector(); - EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle); - - PreparedOperations ops; - ops.addOperation(PSERVER_OP_RESET, v1, (real)1); - ops.addOperation(PSERVER_OP_RESET, v2, (real)2); - - real res1, res2, res3; - ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1); - - ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); - ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2); - - ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1); - ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3); - client_.doOperation(ops, false, false); - - EXPECT_EQ(30000, res1); - EXPECT_EQ(15000, res2); - EXPECT_EQ(0, res3); - - PServerMatrix m1, m2; - m1 = client_.createMatrix(4); - EXPECT_EQ(0, m1.handle); - m2 = client_.createMatrix(8); - EXPECT_EQ(1, m2.handle); - - // TODO(yuyang18): add tests for other operations OP_COPY, OP_au - - client_.releaseVector(v1); - client_.releaseVector(v2); - client_.releaseMatrix(m1); - client_.releaseMatrix(m2); -} - -void 
ParameterServer2Tester::checkSegments(const BlockSegments& expected, - const BlockSegments& segs) { - EXPECT_EQ(expected.size(), segs.size()); - if (expected.size() != segs.size()) { - return; - } - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_EQ(expected[i], segs[i]); - } -} - -void ParameterServer2Tester::mergeBlockSegmentTest() { - { - BlockSegments segs{{10, 20}, {30, 45}, {50, 70}}; - mergeSegments(&segs); - checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 20}}; - mergeSegments(&segs); - checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 30}}; - mergeSegments(&segs); - checkSegments({{10, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {10, 70}, {10, 30}}; - mergeSegments(&segs); - checkSegments({{10, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 35}}; - mergeSegments(&segs); - checkSegments({{10, 45}, {50, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {10, 60}}; - mergeSegments(&segs); - checkSegments({{10, 70}}, segs); - } - { - BlockSegments segs{{30, 45}, {50, 70}, {30, 47}}; - mergeSegments(&segs); - checkSegments({{30, 47}, {50, 70}}, segs); - } -} - -void ParameterServer2Tester::waitPassFinishTest() { - ParameterClient2 client1; - ParameterClient2 client2; - ParameterClient2 client3; - - ThreadWorker worker1; - ThreadWorker worker2; - ThreadWorker worker3; - - auto init1 = [&]() { - LOG(INFO) << "init1 start"; - client1.init(parameters_); - LOG(INFO) << "init1 finish"; - }; - - auto init2 = [&]() { - LOG(INFO) << "init2 start"; - client2.init(parameters_); - LOG(INFO) << "init2 finish"; - }; - - auto init3 = [&]() { - LOG(INFO) << "init3 start"; - client3.init(parameters_); - LOG(INFO) << "init3 finish"; - }; - - auto update1 = [&]() { - LOG(INFO) << "update1 start"; - client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update1 finish"; - }; - - auto wait1 = [&]() { - LOG(INFO) << "wait1 start"; - client1.waitPassFinish(); - LOG(INFO) << "wait1 finish"; - }; - - auto update2 = [&]() { - LOG(INFO) << "update2 start"; - client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update2 finish"; - }; - - auto wait2 = [&]() { - LOG(INFO) << "wait2 start"; - client2.waitPassFinish(); - LOG(INFO) << "wait2 finish"; - }; - - auto op3 = [&]() { - LOG(INFO) << "op3 start"; - PreparedOperations ops; - ops.addOperation(PSERVER_OP_SGD); - client3.doOperation(ops, - /* waitForGradient= */ true, - /* sendBackarameter= */ true); - LOG(INFO) << "op3 finish"; - }; - - worker1.addJob(init1); - worker2.addJob(init2); - worker3.addJob(init3); - - worker1.addJob(update1); - worker2.addJob(update2); - worker3.addJob(op3); - - worker3.addJob(op3); - worker3.addJob(op3); - worker2.addJob(update2); - worker2.addJob(update2); - worker1.addJob(wait1); - - worker2.addJob(wait2); - worker3.addJob(op3); - - worker1.wait(); - worker2.wait(); - worker3.wait(); - - LOG(INFO) << "Pass 1 finished"; - - worker1.addJob(update1); - worker2.addJob(update2); - worker3.addJob(op3); - - worker1.wait(); - worker2.wait(); - worker3.wait(); - - worker3.addJob(op3); - worker3.addJob(op3); - worker1.addJob(update1); - worker1.addJob(wait1); - worker2.addJob(wait2); - - worker1.wait(); - worker2.wait(); - 
worker3.wait(); - - LOG(INFO) << "Pass 2 finished"; -} - -void ParameterServer2Tester::synchronizeTest() { - ParameterClient2 client1; - ParameterClient2 client2; - - ThreadWorker worker1; - ThreadWorker worker2; - - FLAGS_log_period_server = 2; - - auto init1 = [&]() { - LOG(INFO) << "init1 start"; - client1.init(parameters_); - client1.setTrainerId(0); - LOG(INFO) << "init1 finish"; - }; - - auto init2 = [&]() { - LOG(INFO) << "init2 start"; - client2.init(parameters_); - client2.setTrainerId(1); - LOG(INFO) << "init2 finish"; - }; - - auto update1 = [&]() { - LOG(INFO) << "update1 start"; - client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update1 finish"; - }; - - auto wait1 = [&]() { - LOG(INFO) << "wait1 start"; - client1.asyncFinishPass(); - LOG(INFO) << "wait1 finish"; - }; - - auto update2 = [&]() { - LOG(INFO) << "update2 start"; - client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD, - PARAMETER_VALUE, - 0, // numSamples = 0 - 0, // cost = 0 - true); // sendBackParameter = false - LOG(INFO) << "update2 finish"; - }; - - auto wait2 = [&]() { - LOG(INFO) << "wait2 start"; - client2.asyncFinishPass(); - LOG(INFO) << "wait2 finish"; - }; - - worker1.addJob(init1); - worker2.addJob(init2); - // call wait to reset some stats at pserver - worker1.addJob(wait1); - worker2.addJob(wait2); - - worker1.addJob(update1); - worker2.addJob(update2); - - worker2.addJob(update2); - worker2.addJob(update2); - worker1.addJob(wait1); - - worker2.addJob(wait2); - - worker1.wait(); - worker2.wait(); - LOG(INFO) << "Pass 1 finished"; - - worker1.addJob(update1); - worker2.addJob(update2); - - worker1.wait(); - worker2.wait(); - - worker1.addJob(update1); - worker2.addJob(update2); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(update1); - worker1.addJob(wait1); - worker2.addJob(wait2); - - worker1.wait(); - worker2.wait(); - LOG(INFO) << "Pass 2 finished"; -} - -TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); } - -TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); } - -TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); } - -TEST(ParameterServer2, operation) { g_server->operationTest(); } - -TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); } - -TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); } - -TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); } - -TEST(ParameterServer2, sendData) { - // Set gserver and pserver all 3, so that the test is sufficient. 
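(Aside, not part of the patch: the sendData test below saves, overrides and manually restores several gflags; the same bookkeeping can be written as a small scope guard. This is only an illustrative sketch, and the FlagGuard name is hypothetical, not an existing Paddle or gflags helper.)

#include <cstdint>
#include <utility>

// Restores a flag to its previous value when the guard goes out of scope.
template <typename T>
class FlagGuard {
 public:
  FlagGuard(T* flag, T newValue) : flag_(flag), saved_(*flag) {
    *flag_ = std::move(newValue);
  }
  ~FlagGuard() { *flag_ = saved_; }
  FlagGuard(const FlagGuard&) = delete;
  FlagGuard& operator=(const FlagGuard&) = delete;

 private:
  T* flag_;
  T saved_;
};

// Hypothetical usage mirroring the flag juggling in the test below:
//   FlagGuard<int32_t> ports(&FLAGS_ports_num, 3);
//   FlagGuard<int32_t> servers(&FLAGS_num_gradient_servers, 3);
//   FlagGuard<int32_t> port(&FLAGS_port, FLAGS_port + 1);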
- int oldFlagsPortsNUm = FLAGS_ports_num; - int oldFlagsNumGradientServers = FLAGS_num_gradient_servers; - int oldFlagsPort = FLAGS_port; - FLAGS_ports_num = 3; - FLAGS_num_gradient_servers = 3; - FLAGS_port = FLAGS_port + 1; - std::unique_ptr g_server1; - std::unique_ptr g_server2; - std::unique_ptr g_server3; - if (FLAGS_rdma_tcp == "rdma") { - g_server1.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); - g_server1->start(); - g_server2.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1)); - g_server2->start(); - g_server3.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2)); - g_server3->start(); - } else { // tcp - g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); - g_server1->start(); - g_server2.reset( - new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1)); - g_server2->start(); - g_server3.reset( - new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2)); - g_server3->start(); - } - - g_server2->init(); - g_server3->init(); - sleep(2); - g_server1->setup(); - g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24); - sleep(2); - g_server1->sendDataTest(DATA_REDUCE_SUM, 2); - sleep(2); - g_server1.reset(); - g_server2.reset(); - g_server3.reset(); - - FLAGS_ports_num = oldFlagsPortsNUm; - FLAGS_num_gradient_servers = oldFlagsNumGradientServers; - FLAGS_port = oldFlagsPort; -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - - FLAGS_num_gradient_servers = 2; - - if (FLAGS_rdma_tcp == "rdma") { - g_server.reset(new ParameterServer2Tester( - FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); - } else { - g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); - } - - g_server->start(); - - sleep(2); - - int ret = RUN_ALL_TESTS(); - - g_server.reset(); - - exit(ret); -} diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp deleted file mode 100644 index a66b14a1cc58d11988e4936a9c35d98b8bf5edc1..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/test_ProtoServer.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "ParameterService.pb.h" -#include "paddle/math/Vector.h" -#include "paddle/pserver/ProtoServer.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -DEFINE_string(server_addr, "127.0.0.1", "Server address"); -DEFINE_int64(dim, 50000000, "Data size"); -DEFINE_bool(test_proto_server, true, "whether to test ProtoServer"); -DEFINE_bool(benchmark, false, "Do benchmark. 
Skip some tests"); - -using namespace paddle; // NOLINT - -class MyServer : public ProtoServer { - public: - explicit MyServer(int port, int rdmaCpu = -1) - : ProtoServer(FLAGS_server_addr, port, rdmaCpu), - status_(PSERVER_STATUS_NOT_SET) { - REGISTER_SERVICE_FUNCTION(MyServer, getStatus); - REGISTER_SERVICE_FUNCTION(MyServer, setStatus); - REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx); - } - void getStatus(const GetStatusRequest& request, - ProtoResponseCallback callback) { - (void)request; - GetStatusResponse response; - response.set_status(status_); - callback(response); - } - - void getStatusEx(const GetStatusRequest& request, - std::unique_ptr msgReader, - ProtoResponseCallbackEx callback) { - (void)request; - GetStatusResponse response; - response.set_status(status_); - buffer_.resize(msgReader->getNextBlockLength()); - msgReader->readNextBlock(&buffer_[0]); - callback(response, {{&buffer_[0], buffer_.size()}}); - } - - void setStatus(const SetStatusRequest& request, - ProtoResponseCallback callback) { - SetStatusResponse response; - status_ = request.status(); - callback(response); - } - - protected: - PServerStatus status_; - std::string buffer_; -}; - -TEST(ProtoServer, regular) { - ProtoClient* client; - if (FLAGS_rdma_tcp == "rdma") - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); - else - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); - { - GetStatusRequest request; - GetStatusResponse response; - auto msgReader = client->sendAndRecv("getStatus", request, &response); - EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET); - EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0); - } - - { - SetStatusRequest request; - SetStatusResponse response; - request.set_status(PSERVER_STATUS_PARAMETER_READY); - client->sendAndRecv("setStatus", request, &response); - } - - { - GetStatusRequest request; - GetStatusResponse response; - client->sendAndRecv("getStatus", request, &response); - EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY); - } - - delete client; -} - -TEST(ProtoServer, extended) { -#ifdef PADDLE_WITH_CUDA - ProtoClient* client; - if (FLAGS_rdma_tcp == "rdma") - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); - else - client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP); - int64_t dataSize = FLAGS_dim * sizeof(real); - - GpuVector gpuParam(FLAGS_dim); - GpuVector gpuGrad(FLAGS_dim); - CpuVector cpuParam(FLAGS_dim); - CpuVector cpuGrad(FLAGS_dim); - - gpuParam.rand(); - gpuGrad.rand(); - cpuParam.rand(); - cpuGrad.rand(); - - for (int k = 0; k < 4; ++k) { - for (int i = 0; i < 10; ++i) { - cpuGrad.copyFrom(gpuGrad); - if (FLAGS_test_proto_server) { - GetStatusRequest request; - GetStatusResponse response; - { - REGISTER_TIMER("sendAndRecv"); - auto msgReader = - client->sendAndRecv("getStatusEx", - request, - {{cpuGrad.getData(), (size_t)dataSize}}, - &response); - - EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1); - EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize); - msgReader->readNextBlock(cpuParam.getData()); - } - if (!FLAGS_benchmark) { - real* v1 = cpuGrad.getData(); - real* v2 = cpuParam.getData(); - real sum1 = 0, sum2 = 0; - for (int j = 0; j < FLAGS_dim; ++j) { - sum1 += v1[j]; - sum2 += v2[j]; - } - EXPECT_EQ(sum1, sum2); - } - } - gpuParam.copyFrom(cpuParam); - - LOG_EVERY_N(INFO, 10) << "i=" << i; - } - globalStat.printAllStatus(); - globalStat.reset(); - } - - delete client; -#endif -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - 
testing::InitGoogleTest(&argc, argv); - MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1); - server.start(); - usleep(10000); - - return RUN_ALL_TESTS(); -} diff --git a/paddle/pserver/test/test_ProtoServer.sh b/paddle/pserver/test/test_ProtoServer.sh deleted file mode 100755 index 970c90b494c2a256cf22f3de7b7ea7964fed58ab..0000000000000000000000000000000000000000 --- a/paddle/pserver/test/test_ProtoServer.sh +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -x -for ((port=12340;port<=12360;port++)) -do - port_used_num=`netstat -a |grep $port|wc -l` - if [ $port_used_num -eq 0 ] - then - echo $port; - pserver/test/test_ProtoServer --port=$port - if [ $? -eq 0 ] - then - exit 0 - else - echo "test_ProtoServer run wrong" - exit 1 - fi -fi -done -echo "test_ProtoServer port not found" -exit 1 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e8b305326702cf04b752bb2eb413f848daa5ec7b..d173b41e86f61954954b6a5ea9957d2e172deca0 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -22,7 +22,7 @@ function print_usage() { echo -e "\n${RED}Usage${NONE}: ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]" - + echo -e "\n${RED}Options${NONE}: ${BLUE}build${NONE}: run build for x86 platform ${BLUE}build_android${NONE}: run build for android platform @@ -106,6 +106,8 @@ function cmake_gen() { -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} + -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -133,7 +135,8 @@ EOF -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ - -DWITH_ANAKIN=ON + -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} } function abort(){ @@ -198,7 +201,7 @@ function build_android() { fi ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API - + cat < origin.spec + python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec + python ${PADDLE_ROOT}/tools/diff_api.py origin.spec new.spec + deactivate +} + + function single_test() { TEST_NAME=$1 if [ -z "${TEST_NAME}" ]; then @@ -331,14 +348,14 @@ EOF function bind_test() { # the number of process to run tests NUM_PROC=6 - + # calculate and set the memory usage for each process MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`) export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE - + # get the CUDA device count CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) - + for (( i = 0; i < $NUM_PROC; i++ )); do cuda_list=() for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do @@ -547,6 +564,7 @@ function main() { cicheck) cmake_gen ${PYTHON_ABI:-""} build + assert_api_not_changed run_test gen_capi_package gen_fluid_inference_lib diff --git 
a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index a1f446817e0cbc1b4391398a82b0846d01bbec2c..22644818994134d4797edfae8d156a005c103d52 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -6,6 +6,6 @@ if(WITH_TESTING) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) if(NOT MOBILE_INFERENCE) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags) endif() endif() diff --git a/paddle/testing/TestMain.cpp b/paddle/testing/TestMain.cpp index 3e14532d1878fa374a5a2241c7b8319da2dc79d3..1811dbbd1a5f3f6078e7acd24b55d13a242c98bf 100644 --- a/paddle/testing/TestMain.cpp +++ b/paddle/testing/TestMain.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/utils/Util.h" +#include "paddle/legacy/utils/Util.h" int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp index cfb8c713d96008a74287fb1248657c30f3b81164..fa8efc20f59addb4526d2cbeaf34f161307c588a 100644 --- a/paddle/testing/TestUtil.cpp +++ b/paddle/testing/TestUtil.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "TestUtil.h" #include -#include "paddle/math/SparseMatrix.h" +#include "paddle/legacy/math/SparseMatrix.h" DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length"); diff --git a/paddle/testing/TestUtil.h b/paddle/testing/TestUtil.h index ec86469aebbafbf5406a21e6825eda6c105a6b9d..98b864e3c56f1938075bd039ba13a49ec457de50 100644 --- a/paddle/testing/TestUtil.h +++ b/paddle/testing/TestUtil.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include -#include "paddle/math/Matrix.h" +#include "paddle/legacy/math/Matrix.h" namespace paddle { diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 507479c8622c8d33722e08bba018ad1ba5452e15..cfea2059c3ce20fb44732d990e9708ad6f8d81a1 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" -#include "paddle/fluid/framework/init.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/init.h" int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); @@ -30,7 +30,9 @@ int main(int argc, char** argv) { new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn")); + new_argv.push_back(strdup( + "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb")); + new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp deleted file mode 100644 index 56c38015fb2398f8b39fac6b5a5d4af1c2fd56aa..0000000000000000000000000000000000000000 --- a/paddle/trainer/MergeModel.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "ParamUtil.h" -#include "Trainer.h" -#include "paddle/pserver/ParameterServer2.h" -#include "paddle/utils/PythonUtil.h" - -DEFINE_string(model_dir, "", "Directory for separated model files"); -DEFINE_string(config_file, "", "Config file for the model"); -DEFINE_string(model_file, "", "File for merged model file"); - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -int main(int argc, char** argv) { - initMain(argc, argv); - initPython(argc, argv); - - if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() || - FLAGS_model_file.empty()) { - LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 " - "--config_file=config.py --model_file=out.paddle"; - return 0; - } - - string confFile = FLAGS_config_file; -#ifndef PADDLE_WITH_CUDA - FLAGS_use_gpu = false; -#endif - auto config = std::make_shared(confFile); - unique_ptr gradientMachine(GradientMachine::create(*config)); - gradientMachine->loadParameters(FLAGS_model_dir); - - ofstream os(FLAGS_model_file); - - string buf; - config->getConfig().SerializeToString(&buf); - int64_t size = buf.size(); - os.write((char*)&size, sizeof(size)); - CHECK(os) << "Fail to write to " << FLAGS_model_file; - os.write(buf.data(), buf.size()); - vector& parameters = gradientMachine->getParameters(); - for (auto& para : parameters) { - para->save(os); - CHECK(os) << "Fail to write to " << FLAGS_model_file; - } - os.close(); - - return 0; -} diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp deleted file mode 100644 index 410ac6d95c4d65ce6fb25c05351bb8ddb24473f4..0000000000000000000000000000000000000000 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "NewRemoteParameterUpdater.h" -#include "Trainer.h" -#include "paddle/utils/Stat.h" - -DECLARE_int32(trainer_id); -DECLARE_string(save_dir); - -namespace paddle { -NewRemoteParameterUpdater::NewRemoteParameterUpdater( - const OptimizationConfig &config, const std::string pserverSpec) - : trainerConfig_(config), - parameterClient_(-1), - newParameters_(nullptr), - newGradients_(nullptr), - pserverSpec_(pserverSpec) {} - -NewRemoteParameterUpdater::NewRemoteParameterUpdater( - const OptimizationConfig &config, - const std::string pserverSpec, - const bool useEtcd) - : trainerConfig_(config), - parameterClient_(-1), - newParameters_(nullptr), - newGradients_(nullptr), - pserverSpec_(pserverSpec), - useEtcd_(useEtcd) {} - -void NewRemoteParameterUpdater::init( - const std::vector ¶meters) { - ParameterUpdater::init(parameters); - - // create parameter server client. - if (useEtcd_) { - parameterClient_ = - paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str()); - } else { - parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), - FLAGS_trainer_id == 0); - } - - // init new parameter and gradient. - newParameters_ = initNewParameter(PARAMETER_VALUE); - newGradients_ = initNewParameter(PARAMETER_GRADIENT); - - // init parameter, one trainer will get the opportunity to int parameter and - // send them to parameter server. Others will get the initialized parameter - // from parameter server - if (paddle_begin_init_params(parameterClient_)) { - LOG(INFO) << "paddle_begin_init_params start"; - // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig. - // This makes golang pserver compatible with handy V1 demos. - // TODO(wuyi): Refine or remove these ugly converting lines - OptimizerConfig optimizerConfigV2; - if (trainerConfig_.learning_method() == "momentum") { - optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); - } else if (trainerConfig_.learning_method() == "adagrad") { - optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad); - optimizerConfigV2.mutable_adagrad()->set_epsilon( - trainerConfig_.ada_epsilon()); - } else if (trainerConfig_.learning_method() == "adadelta") { - optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad); - optimizerConfigV2.mutable_adadelta()->set_epsilon( - trainerConfig_.ada_epsilon()); - optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou()); - } else if (trainerConfig_.learning_method() == "adam") { - optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam); - optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1()); - optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2()); - optimizerConfigV2.mutable_adam()->set_epsilon( - trainerConfig_.adam_epsilon()); - } else { - LOG(ERROR) << "got unsupported v1 optimizer config: " - << trainerConfig_.learning_method(); - optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); - } - - if (trainerConfig_.learning_rate_schedule() == "constant") { - optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); - optimizerConfigV2.mutable_const_lr()->set_learning_rate( - trainerConfig_.learning_rate()); - } else if (trainerConfig_.learning_rate_schedule() == "linear") { - optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear); - optimizerConfigV2.mutable_linear_lr()->set_learning_rate( - trainerConfig_.learning_rate()); - optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a( - trainerConfig_.learning_rate_decay_a()); - 
optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b( - trainerConfig_.learning_rate_decay_b()); - } else { - LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: " - << trainerConfig_.learning_rate_schedule() << ", set to const"; - optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); - optimizerConfigV2.mutable_const_lr()->set_learning_rate( - trainerConfig_.learning_rate()); - } - - // overwrite optimizerConfigV2 for per-parameter(layer) configs - for (int i = 0; i < parameterSize(); ++i) { - // FIXME(typhoonzero): paramConfig always have default values, - // how to check if it's default? - // TODO(typhoonzero): log output: optimizerConfigV2.DebugString(); - LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString(); - // send param and config to pserver - std::string bytes = optimizerConfigV2.SerializeAsString(); - const char *array = bytes.data(); - int size = (int)bytes.size(); - paddle_init_param( - parameterClient_, *newParameters_[i], (void *)array, size); - } - paddle_finish_init_params(parameterClient_); - LOG(INFO) << "paddle_begin_init_params done"; - } else { - paddle_get_params(parameterClient_, newParameters_, parameterSize()); - } - - LOG(INFO) << "NewRemoteParameterUpdater initialized"; -} - -void NewRemoteParameterUpdater::updateImpl(Parameter *para) {} - -void NewRemoteParameterUpdater::finishBatch(real cost) { - // send gradient to parameter server. - paddle_send_grads(parameterClient_, newGradients_, parameterSize()); - // get the updated parameter from parameterClient. - paddle_get_params(parameterClient_, newParameters_, parameterSize()); - - // clear gradient after update parameter. - for (auto ¶ : parameters_) { - para->getBuf(PARAMETER_GRADIENT)->zeroMem(); - } -} - -void NewRemoteParameterUpdater::startPass() {} - -bool NewRemoteParameterUpdater::finishPass() { return true; } -} // namespace paddle diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h deleted file mode 100644 index 02693c675e6f5cb574e52e9681963a5904676028..0000000000000000000000000000000000000000 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "OptimizerConfig.pb.h" -#include "ParameterUpdater.h" -#include "libpaddle_pserver_cclient.h" -#include "paddle/pserver/ParameterClient2.h" -#include "paddle/utils/Queue.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -/** - * New remote parameter updater for dense parameters that use cclient of go. 
- */ -class NewRemoteParameterUpdater : public ParameterUpdater { - public: - NewRemoteParameterUpdater(const OptimizationConfig& config, - const std::string pserverSpec); - NewRemoteParameterUpdater(const OptimizationConfig& config, - const std::string pserverSpec, - const bool useEtcd); - ~NewRemoteParameterUpdater() { - releaseNewParameter(newParameters_); - releaseNewParameter(newGradients_); - if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_); - } - - /** - * initialize the internal parameter client and itself. - */ - virtual void init(const std::vector& parameters); - /** - * @brief start batch - * - * @note one batch training exhibits stateful feature to help - * to do performance tuning, sgd optimization if necessary. - */ - virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; } - - /** - * send parameters to pservers and get returned parameters - * from all pservers if necessary. - */ - virtual void finishBatch(real cost); - virtual void startPass(); - virtual bool finishPass(); - - protected: - /** - * work need to do after finishBatch - */ - virtual void updateImpl(Parameter* para); - - private: - int parameterSize() { return (int)parameters_.size(); } - - /** - * init parameter of go paddle pserver cclient. - * @param new_params - * @param type - */ - paddle_parameter** initNewParameter(ParameterType type) { - paddle_parameter** new_params = - (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize()); - for (int i = 0; i < parameterSize(); ++i) { - new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter)); - memset(new_params[i], 0, sizeof(paddle_parameter)); - } - - for (int i = 0; i < parameterSize(); ++i) { - ParameterPtr param = parameters_[i]; - new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32; - new_params[i]->name = (char*)param->getName().c_str(); - new_params[i]->content = - (unsigned char*)(param->getBuf(type).get()->getData()); - new_params[i]->content_len = - (int)param->getBuf(type).get()->getSize() * sizeof(real); - } - return new_params; - } - - void releaseNewParameter(paddle_parameter** newParams) { - if (newParams != nullptr) { - for (int i = 0; i < parameterSize(); ++i) { - free(newParams[i]); - } - free(newParams); - } - } - - protected: - const OptimizationConfig& trainerConfig_; - /// internal parameter client object for exchanging data with pserver - paddle_pserver_client parameterClient_; - /// the parameters for new pserver client - paddle_parameter** newParameters_; - /// the gradinets for new pserver client - paddle_parameter** newGradients_; - /// the specification of parameter server "host1:port,host1:port" - std::string pserverSpec_; - /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr - bool useEtcd_; -}; - -} // namespace paddle diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp deleted file mode 100644 index ffbca42e106591ddeb2cefcfafbeb408c544371b..0000000000000000000000000000000000000000 --- a/paddle/trainer/ParamUtil.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "ParamUtil.h" - -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -#include "TesterConfig.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" - -namespace paddle { - -ParameterUtil::ParameterUtil( - const std::shared_ptr &config, - std::unique_ptr &&intconfig, - const GradientMachinePtr &gradientMachine, - const std::shared_ptr ¶meterUpdater) { - config_ = config; - intConfig_ = std::move(intconfig); - gserver_ = gradientMachine; - pUpdater_ = parameterUpdater; -} - -bool ParameterUtil::loadParameters(int passId, bool local, bool remote) { - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "pass-%05d", passId); - std::string doneFile = path::join(config_->getSaveDir(), buf, "done"); - if (!fileExist(doneFile.c_str())) return false; - loadParametersWithPath(path::join(config_->getSaveDir(), buf), local, remote); - return true; -} - -void ParameterUtil::loadParametersWithPath(const std::string &dir, - bool local, - bool remote) { - if (local) { - gserver_->loadParameters(dir); - } - if (remote && pUpdater_) { - pUpdater_->loadParametersRemote(dir); - } -} - -void ParameterUtil::saveParametersOnePass(int passId, int passInnerId) { - pUpdater_->apply(); - saveParameters(passId, passInnerId); - if (intConfig_->save_only_one_ && passId >= intConfig_->saving_period_) { - deleteParameters(passId - intConfig_->saving_period_); - } - pUpdater_->restore(); -} - -void ParameterUtil::saveParameters(int passId, int passInnerId) { - constexpr int kBufLen = 100; - char buf[kBufLen]; - if (passInnerId > 0) { - snprintf(buf, kBufLen, "pass-%05d-%03d", passId, passInnerId); - } else { - snprintf(buf, kBufLen, "pass-%05d", passId); - } - - std::string basePath = config_->getSaveDir(); - if (basePath.find('/') == std::string::npos) { - basePath = "./" + basePath; - } - mkDirRecursively(basePath.c_str()); - - std::string saveDir = path::join(basePath, buf); - mkDir(saveDir.c_str()); - if (!intConfig_->load_save_param_pserver_) { - pUpdater_->getParametersRemote(true /*full parameter*/, - true /*after apply*/); - } - - gserver_->saveParameters(saveDir); - if (intConfig_->load_save_param_pserver_) { - pUpdater_->saveParametersRemote(saveDir); - } - std::string doneFile = path::join(saveDir, "done"); - touchFile(doneFile.c_str()); - std::ofstream out(doneFile); - version::printVersion(out); - out.close(); - VLOG(1) << "save dir " << saveDir; - saveConfigWithPath(saveDir); -} - -void ParameterUtil::deleteParameters(int passId, int passInnerId) { - constexpr int kBufLen = 100; - char buf[kBufLen]; - const std::string &saveDir = config_->getSaveDir(); - if (passInnerId > 0) { - snprintf(buf, - kBufLen, - "%s/pass-%05d-%03d", - saveDir.c_str(), - passId, - passInnerId); - } else { - snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId); - } - mkDir(saveDir.c_str()); - LOG(INFO) << "delete dir " << buf; - rmDir(buf); -} - -void 
ParameterUtil::saveConfigWithPath(const std::string &path) { - std::string src; - // save config in some path - if (!intConfig_->config_.empty()) { - src = intConfig_->config_; - } else { - bool ok; - src = config_->getConfigName(&ok); - if (!ok) { - return; - } - } - copyFileToPath(src, path); - - // save other import config file name to path.txt - std::string ss = path::join(path, "path.txt"); - std::ofstream os(ss); - std::string fileName = path::basename(src); - CHECK(os.write(fileName.c_str(), fileName.length())) - << "Fail to write config file name " << ss; - VLOG(1) << "fileName " << fileName; - os.close(); - - // copy other import config files - for (int i = 0; i < config_->getConfig().config_files_size(); ++i) { - copyFileToPath(config_->getConfig().config_files(i), path); - } -} - -} // namespace paddle diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h deleted file mode 100644 index 10746b4d58e3a82c081987a6aaad9e0b42272a03..0000000000000000000000000000000000000000 --- a/paddle/trainer/ParamUtil.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Util.h" - -#include - -#include "hl_gpu.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" - -#include -#include -#include "ParameterUpdater.h" -#include "TrainerConfig.pb.h" -#include "TrainerConfigHelper.h" - -namespace paddle { - -/** - * Configuration for parameter utils. - */ -struct ParameterUtilConfig { - DISABLE_COPY(ParameterUtilConfig); - - ParameterUtilConfig(bool save_only_one, - int saving_period, - bool load_save_parameters_in_pserver, - std::string config) - : save_only_one_(save_only_one), - saving_period_(saving_period), - load_save_param_pserver_(load_save_parameters_in_pserver), - config_(config) {} - - bool save_only_one_; - int saving_period_; - bool load_save_param_pserver_; - std::string config_; -}; - -/** - * ParameterUtil - * Utility class for loading and saving parameters - */ -class ParameterUtil { - public: - /** - * Ctor. - * - * @param config - * @param intconfig - * @param gradientMachine - * @param parameterUpdater - * @return - */ - ParameterUtil(const std::shared_ptr &config, - std::unique_ptr &&intconfig, - const GradientMachinePtr &gradientMachine, - const std::shared_ptr ¶meterUpdater); - - /// Load parameter from the saved parameter file as pass passId - /// if loadsave_parameters_in_pserver is set, some parameters MUST - /// load in pserver, which is "remote". - /// loadParameters can choose to load local/remote parameter, or both. 
- bool loadParameters(int passId, bool local = true, bool remote = false); - - /// load parameters given path info - void loadParametersWithPath(const std::string &dir, - bool local = true, - bool remote = false); - - /// Save parameter to dist for pass passId - /// passInnerId means saving times in one pass, some users want to - /// save parameters when have processed some batches in one pass - /// passInnerId = 0 means do not need to save in one inner pass - void saveParameters(int passId, int passInnerId = 0); - - /// save parameters for one pass, when passInnerId > 0 means saving - /// the passInnerId times in one pass - void saveParametersOnePass(int passId, int passInnerId = 0); - - /// delete parameter from disk via passId - void deleteParameters(int passId, int passInnerId = 0); - - /// save config given path info - void saveConfigWithPath(const std::string &path); - - /** - * Try to load parameter from config. - * @return true if can load from trainer config. - */ - inline bool tryLoadParametersFromConfig() { - auto &c = config_->getConfig(); - if (!c.init_model_path().empty()) { - loadParametersWithPath(c.init_model_path()); - return true; - } else if (c.start_pass() > 0) { - CHECK(loadParameters(c.start_pass() - 1)); - return true; - } else { - return false; - } - } - - private: - std::shared_ptr config_; - std::unique_ptr intConfig_; - GradientMachinePtr gserver_; - std::shared_ptr pUpdater_; -}; - -} // namespace paddle diff --git a/paddle/trainer/ParameterUpdater.cpp b/paddle/trainer/ParameterUpdater.cpp deleted file mode 100644 index 4e9e890c85945aedd7e604f52a06902191c95d4c..0000000000000000000000000000000000000000 --- a/paddle/trainer/ParameterUpdater.cpp +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ParameterUpdater.h" - -#include "paddle/utils/Logging.h" - -#include "paddle/utils/Thread.h" - -namespace paddle { - -static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1; -static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2; - -SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager( - const OptimizationConfig& optConfig) - : SgdLocalUpdater(optConfig, false /*with averager*/) { - CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu()); - averager_.reset(AverageOptimizer::create(optConfig, - new DummyOptimizer(optConfig), - false /*sparse*/, - true /*apply*/)); - updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); }); -} - -void SgdUpdaterWithCpuAverager::init( - const std::vector& parameters) { - SgdLocalUpdater::init(parameters); - averager_->init(parameters_.size(), nullptr); - copyEvents_.resize(parameters_.size()); - for (auto& parameter : parameters) { - SetDevice device(parameter->getDeviceId()); - cpuParameters_.emplace_back(new Parameter(parameter->getConfig(), - /* useGpu= */ false, - /* doInit= */ false)); - if (parameter->useGpu()) { - cpuParameters_.back()->enableType(PARAMETER_APPLY); - } else { - cpuParameters_.back()->enableSharedType( - PARAMETER_APPLY, parameter->getBuf(PARAMETER_VALUE)); - } - for (ParameterType type : averager_->getParameterTypes()) { - cpuParameters_.back()->enableType(type); - } - - hl_create_event(©Events_[nonStaticParaIDMap_[parameter->getID()]]); - } -} - -SgdUpdaterWithCpuAverager::~SgdUpdaterWithCpuAverager() { - for (auto& event : copyEvents_) { - hl_destroy_event(event); - } -} - -void SgdUpdaterWithCpuAverager::updateImpl(Parameter* para) { - SgdLocalUpdater::updateImpl(para); - - if (para->useGpu()) { - size_t pid = nonStaticParaIDMap_[para->getID()]; - Parameter* cpuPara = cpuParameters_[pid].get(); - cpuPara->getBuf(PARAMETER_VALUE) - ->copyFrom(*para->getBuf(PARAMETER_VALUE), kDeviceToHostStream); - hl_stream_record_event(kDeviceToHostStream, copyEvents_[pid]); - } - - updateWorker_.addJob( - std::bind(&SgdUpdaterWithCpuAverager::updateFunc, this, para)); -} - -void SgdUpdaterWithCpuAverager::updateFunc(Parameter* para) { - SetDevice setDevice(para->getDeviceId()); - size_t pid = nonStaticParaIDMap_[para->getID()]; - Parameter* cpuPara = cpuParameters_[pid].get(); - if (para->useGpu()) { - hl_event_synchronize(copyEvents_[pid]); - } - averager_->update(cpuPara->getBufs(), cpuPara->getConfig(), -1LU); -} - -void SgdUpdaterWithCpuAverager::finishBatch(real cost) { - SgdLocalUpdater::finishBatch(cost); - - updateWorker_.wait(); - for (auto para : cpuParameters_) { - if (auto callback = averager_->needSpecialTraversal(para->getConfig())) { - callback(para->getBufs(), para->getConfig(), -1LU); - } - } - averager_->finishBatch(); -} - -void SgdUpdaterWithCpuAverager::apply() { - // backup gpu value - for (auto& para : parameters_) { - SetDevice setDevice(para->getDeviceId()); - para->getBuf(PARAMETER_GRADIENT) - ->copyFrom(*para->getBuf(PARAMETER_VALUE), kHostToDeviceStream); - } - - // apply on cpu parameter - if (auto callback = averager_->apply()) { - for (auto para : cpuParameters_) { - callback(para->getBufs(), para->getConfig(), -1LU); - } - } - - // copy to gpu value - for (auto& para : parameters_) { - SetDevice setDevice(para->getDeviceId()); - size_t pid = nonStaticParaIDMap_[para->getID()]; - Parameter* cpuPara = cpuParameters_[pid].get(); - if (parameters_[pid]->useGpu()) { - para->getBuf(PARAMETER_VALUE) - ->copyFrom(*cpuPara->getBuf(PARAMETER_APPLY), kHostToDeviceStream); - } - } - 
hl_stream_synchronize(kHostToDeviceStream); - for (auto& para : parameters_) { - para->setValueUpdated(); - } -} - -void SgdUpdaterWithCpuAverager::restore() { - // restore on cpu parameter - if (auto callback = averager_->restore()) { - for (auto para : cpuParameters_) { - callback(para->getBufs(), para->getConfig(), -1LU); - } - } - - // restore gpu value - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(PARAMETER_VALUE)->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); - para->getBuf(PARAMETER_GRADIENT)->zeroMem(); - para->setValueUpdated(); - } -} - -} // namespace paddle diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h deleted file mode 100644 index ef7ab92eca77bab2a8481561713f8034d2b8505d..0000000000000000000000000000000000000000 --- a/paddle/trainer/ParameterUpdater.h +++ /dev/null @@ -1,265 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Thread.h" -#include "paddle/utils/Util.h" - -#include "paddle/parameter/AverageOptimizer.h" -#include "paddle/parameter/FirstOrderOptimizer.h" -#include "paddle/parameter/OptimizerFunctions.h" -#include "paddle/parameter/OptimizerWithRegularizer.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/ParameterUpdaterBase.h" - -#include "TrainerConfig.pb.h" -#include "paddle/gserver/layers/Layer.h" - -#include -#include - -namespace paddle { - -/** - * @brief Parameter Updater for SGD, and local(not cluster) run. - */ -class SgdLocalUpdater : public ParameterUpdater { - public: - /** - * @brief Ctor. Initialize optimizer locally by optConfig. - * @param optConfig optimization config. - * @param withAverager with average optimizer or not, default is true. - */ - explicit SgdLocalUpdater(const OptimizationConfig& optConfig, - bool withAverager = true) - : numSamplesProcessed_(0) { - auto baseOptimizer = ParameterOptimizer::create(optConfig); - optimizer_.reset(withAverager - ? AverageOptimizer::create(optConfig, baseOptimizer) - : baseOptimizer); - CHECK(optimizer_) << "fail to create optimizer: " - << optConfig.learning_method(); - auto types = optimizer_->getParameterTypes(); - for (auto type : types) { - addParameterType(type); - } - } - - /** - * @brief Initialize parameters and optimizer_. - * For example, - * If optimizer need hassien vector, then parameter's hassien will - * be initialized. - * @param parameters The parameter need to be initialized. - */ - virtual void init(const std::vector& parameters) { - ParameterUpdater::init(parameters); - optimizer_->init(parameters_.size(), nullptr); - // check no L1 decay in parameter configs - CHECK(std::find_if(parameters.begin(), - parameters.end(), - [](const ParameterPtr& para) { - return para->getConfig().decay_rate_l1() > 0.0f; - }) == parameters.end()) - << "SgdLocalUpdater cannot support L1 decay in parameter"; - } - - /** - * @brief Start a batch with current mini-batch size - * @param current mini-batch size. 
- * @return Always PASS_TRAIN. - */ - virtual PassType startBatch(int64_t batchSize) { - numSamplesProcessed_ += batchSize; - optimizer_->startBatch(numSamplesProcessed_); - return PASS_TRAIN; - } - - /** - * @brief finish a mini-batch. - */ - virtual void finishBatch(real cost) { optimizer_->finishBatch(); } - - /** - * @brief start a pass. - */ - virtual void startPass() { optimizer_->startPass(); } - - /** - * @brief finish a pass. - * @param cost sum cost during one pass. - * @return true if accept (used for owlqn). - */ - virtual bool finishPass() { - optimizer_->finishPass(); - return ParameterUpdater::finishPass(); - } - - /** - * @brief apply model average. - */ - virtual void apply() { - if (auto callback = optimizer_->apply()) { - for (auto para : parameters_) { - SetDevice device(para->getDeviceId()); - callback(para->getBufs(), para->getConfig(), -1UL); - } - } - } - - /** - * @brief restore parameter value before model average - */ - virtual void restore() { - if (auto callback = optimizer_->restore()) { - for (auto para : parameters_) { - SetDevice device(para->getDeviceId()); - callback(para->getBufs(), para->getConfig(), -1UL); - } - } - } - - protected: - /** - * @brief update method. Update value from gradient. - * @param para parameter that will be updated. - */ - virtual void updateImpl(Parameter* para) { - optimizer_->update(para->getBufs(), para->getConfig()); - if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) { - callback(para->getBufs(), para->getConfig(), -1UL); - } - - para->setValueUpdated(); - para->getBuf(PARAMETER_GRADIENT)->zeroMem(); - } - - std::unique_ptr optimizer_; - - /** - * @brief total number of samples processed. - */ - int64_t numSamplesProcessed_; -}; - -/** - * @brief SgdCpuUpdater is used only in recursive neural network - * @deprecated - */ -class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated { - public: - explicit SgdCpuUpdater(const OptimizationConfig& optConfig) - : SgdLocalUpdater(optConfig), - Deprecated( - "SgdCpuUpdater is used only in recursive neural network, " - "and recursive neural network is deprecated in paddle. " - "Use it all by your own.") {} - - /** - * @brief update all parameter on finish batch. - * @param cost - */ - virtual void finishBatch(real cost) { - for (auto para : parameters_) { - SgdLocalUpdater::update(para.get()); - } - optimizer_->finishBatch(); - } - - protected: - /** - * @brief do nothing. - * @param para - */ - virtual void updateImpl(Parameter* para) {} -}; - -/** - * @brief Sgd Local Updater With average in cpu. - * - * It will do model average in cpu to reduce gpu memory comsuption. - */ -class SgdUpdaterWithCpuAverager : public SgdLocalUpdater { - public: - /** - * @brief Ctor. - * - * SgdUpdaterWithCpuAverager will do everything as a - * SgdLocalUpdater, then copy parameter from GPU to CPU, and do model - * average in cpu. - */ - explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig); - ~SgdUpdaterWithCpuAverager(); - - /** - * @brief init. Initialize cpu parameters, model average optimizer. 
- * @param parameters - */ - virtual void init(const std::vector& parameters); - - virtual PassType startBatch(int64_t batchSize) { - averager_->startBatch(-1UL); - return SgdLocalUpdater::startBatch(batchSize); - } - virtual void finishBatch(real cost); - - virtual void startPass() { - averager_->startPass(); - SgdLocalUpdater::startPass(); - } - virtual bool finishPass() { - averager_->finishPass(); - return SgdLocalUpdater::finishPass(); - } - - /// apply the averaged parameter to PARAMETER_VALUE - /// use PARAETER_GRADIENT for backing up PARAMETER_VALUE - virtual void apply(); - - /** - * @brief Restore parameter before apply(). - */ - virtual void restore(); - - protected: - virtual void updateImpl(Parameter* para); - - void updateFunc(Parameter* para); - - protected: - std::unique_ptr averager_; - - /** - * @brief The thread worker which do model average. - * - * For each parameter, GPU->CPU parameter is async, and do model average in - * another thread. Because the training process don't need model average while - * training, and model average only used in evaluation stage and saving stage. - * So the model average is totally async. - */ - ThreadWorker updateWorker_; - - /** - * @brief The parameter mirror in cpu. - */ - std::vector cpuParameters_; - - /** - * @brief GPU -> CPU copy event. Model average will wait after copy done. - */ - std::vector copyEvents_; -}; - -} // namespace paddle diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp deleted file mode 100644 index 7314266cb24da9b9e9f0f1cbe61ed363247f51fe..0000000000000000000000000000000000000000 --- a/paddle/trainer/RemoteParameterUpdater.cpp +++ /dev/null @@ -1,843 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "RemoteParameterUpdater.h" -#include "Trainer.h" -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/Stat.h" - -DECLARE_int32(trainer_id); -DECLARE_string(save_dir); - -namespace paddle { - -static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1; -static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2; -static const int kFinishBatchPid = -1; - -const std::string RemoteParameterUpdater::kAverage = "average"; -const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average"; - -RemoteParameterUpdater::RemoteParameterUpdater( - const OptimizationConfig& config, - int expectedPassCount, - std::unique_ptr&& localUpdater) - : config_(config), - localUpdater_(std::move(localUpdater)), - numBatches_(0), - passCount_(0), - expectedPassCount_(expectedPassCount), - separateSendAndRecv_(false), - isFirstPass_(true), - useApplyInPserver_(false) { - addParameterType(PARAMETER_MOMENTUM); -} - -void RemoteParameterUpdater::init(const std::vector& parameters) { - ParameterUpdater::init(parameters); - - if (localUpdater_) { - localUpdater_->init(parameters); - - for (auto& parameter : parameters) { - parameter->enableType(PARAMETER_DELTA); - } - - CHECK(config_.center_parameter_update_method() == kAverage || - config_.center_parameter_update_method() == kElasticAverage) - << "unknown center_parameter_update_method"; - - // modify delta_add_rate - CHECK_GT(FLAGS_num_gradient_servers, 1) - << "FLAGS_num_gradient_servers should be set in trainer args."; - real delta_add_rate = config_.delta_add_rate() / FLAGS_num_gradient_servers; - config_.set_delta_add_rate(delta_add_rate); - LOG(INFO) << "center parameter in pserver," - << " modify delta_add_rate=" << delta_add_rate; - } - - if (!FLAGS_use_gpu) { - cpuParameters_ = parameters; - } else { - for (auto& parameter : parameters) { - cpuParameters_.emplace_back(new Parameter(parameter->getConfig(), - /* useGpu= */ false)); - cpuParameters_.back()->setID(parameter->getID()); - if (localUpdater_) { - cpuParameters_.back()->enableType(PARAMETER_DELTA); - } - } - } - - parameterClient_.reset(new ParameterClient2(separateSendAndRecv_)); - parameterClient_->init(cpuParameters_); - parameterClient_->setTrainerId(FLAGS_trainer_id); - - if (FLAGS_trainer_id == 0) { - parameterClient_->setConfig(config_); - copyParametersFromDevice(PARAMETER_VALUE); - parameterClient_->setParameter(); - parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY); - } else { - parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY); - parameterClient_->getParameter(); - copyParametersToDevice(PARAMETER_VALUE); - } - if (FLAGS_trainer_id == 0 && - (config_.algorithm() != TrainAlgorithm::AsyncSGD)) { - startController(); - useApplyInPserver_ = useApplyInPserver(config_); - } -} - -void RemoteParameterUpdater::startController() { - controllerThread_.reset(new std::thread([this]() { this->controller(); })); -} - -void RemoteParameterUpdater::controller() { - ParameterClient2 client(false); - client.init(cpuParameters_); - while (true) { - /*start pass*/ { - client.waitPassStart(); - - PreparedOperations ops; - ops.addOperation(PSERVER_OP_START_PASS); - client.doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false, - /* releasePass= */ false); - } - - while (true) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_SGD); - client.doOperation(ops, - /* waitForGradient= */ true, - /* sendBackarameter= */ true, - /* releasePass= */ false); - if (client.isPassFinish()) { - break; - } - } - - /*finish 
pass*/ { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_FINISH_PASS); - client.doOperation(ops, - /* waitForGradient= */ true, - /* sendBackarameter= */ true, - /* releasePass= */ true); - } - - passCount_++; - if (passCount_ == expectedPassCount_) { - break; - } - } -} - -void RemoteParameterUpdater::copyParametersToDevice( - ParameterType parameterType) { - if (!FLAGS_use_gpu) { - return; - } - int numParameters = cpuParameters_.size(); - for (int i = 0; i < numParameters; ++i) { - parameters_[i] - ->getBuf(parameterType) - ->copyFrom(*cpuParameters_[i]->getBuf(parameterType)); - if (parameterType == PARAMETER_VALUE) { - parameters_[i]->setValueUpdated(); - } - } -} - -void RemoteParameterUpdater::copyParametersFromDevice( - ParameterType parameterType) { - if (!FLAGS_use_gpu) { - return; - } - int numParameters = cpuParameters_.size(); - for (int i = 0; i < numParameters; ++i) { - cpuParameters_[i] - ->getBuf(parameterType) - ->copyFrom(*parameters_[i]->getBuf(parameterType)); - } -} - -void RemoteParameterUpdater::updateImpl(Parameter* para) { - REGISTER_TIMER("update"); - if (localUpdater_) { - localUpdater_->update(para); - } -} - -void RemoteParameterUpdater::finishBatch(real cost) { - if (localUpdater_) { - localUpdater_->finishBatch(cost); - } - - const std::string& algorithm = config_.algorithm(); - ParameterUpdateMode mode; - if (algorithm == TrainAlgorithm::AsyncSGD) { - mode = PSERVER_UPDATE_MODE_ASYNC_SGD; - } else if (algorithm == TrainAlgorithm::SGD) { - mode = PSERVER_UPDATE_MODE_ADD_GRADIENT; - } else { - LOG(FATAL) << "Unknown algorithm: " << algorithm; - } - - ParameterType sendType; - bool sendBackParameter = true; - if (localUpdater_) { - ++numBatches_; - if (numBatches_ % config_.num_batches_per_send_parameter() != 0) { - return; - } - - if (config_.center_parameter_update_method() == kElasticAverage) { - parameterClient_->getParameter(PARAMETER_DELTA); - copyParametersToDevice(PARAMETER_DELTA); - sendBackParameter = false; // no need send back after send - - // calc delta - for (auto& para : parameters_) { - // DELTA = LOCAL_VALUE - CENTER_VALUE/*store in DELTA*/ - para->getBuf(PARAMETER_DELTA) - ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f); - - // when delta send to pserver, pserver will do: - // CENTER_VALUE += alpha * (LOCAL_VALUE - CENTER_VALUE) - } - } else { - // calc delta - for (auto& para : parameters_) { - // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/ - para->getBuf(PARAMETER_DELTA) - ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f); - } - } - - sendType = PARAMETER_DELTA; - - } else { - // In this case, we perform SGD on pserver. 
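// For illustration only: a minimal sketch of the elastic-averaging exchange that
// finishBatch() performs with Paddle's Parameter buffers when
// center_parameter_update_method == "elastic_average". The function name and the
// plain float vectors are hypothetical; alpha corresponds to delta_add_rate.

#include <cstddef>
#include <vector>

// One exchange for a single parameter: the trainer sends delta = local - center,
// the pserver pulls the center toward the local copy, and the trainer pulls its
// local copy toward the center by the same rate.
void elasticAverageStep(std::vector<float>& localValue,
                        std::vector<float>& centerValue,
                        float alpha) {
  for (std::size_t i = 0; i < localValue.size(); ++i) {
    float delta = localValue[i] - centerValue[i];  // PARAMETER_DELTA sent out
    centerValue[i] += alpha * delta;  // what the pserver does with the delta
    localValue[i] -= alpha * delta;   // what the trainer does after the send
  }
}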
- sendType = PARAMETER_GRADIENT; - } - - copyParametersFromDevice(sendType); - - { - REGISTER_TIMER("sendAndRecv_dense"); - parameterClient_->sendAndReceiveParameter(mode, - sendType, - batchSize_, - 0, // cost = 0 - sendBackParameter); - } - - if (sendBackParameter) { - copyParametersToDevice(PARAMETER_VALUE); - } - - if (localUpdater_) { - if (config_.center_parameter_update_method() == kElasticAverage) { - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - // LOCAL_VALUE += -alpha * (LOCAL_VALUE - CENTER_VALUE) - para->getBuf(PARAMETER_VALUE) - ->add(*para->getBuf(PARAMETER_DELTA), -config_.delta_add_rate()); - } - - } else { // average - // copy value to delta - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); - } - } - } else { - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(sendType)->zeroMem(); - } - } -} - -void RemoteParameterUpdater::startPass() { - if (config_.algorithm() == TrainAlgorithm::SGD) { - parameterClient_->waitPassStart(); - } else { - // sync could benifits reducing lagged trainer for async-sgd - // even if sync could not remove all lagged trainer for the - // sake of file loading, buffer etc. - parameterClient_->asyncStartPass(); - } - - if (localUpdater_) { - localUpdater_->startPass(); - numBatches_ = 0; - - if (config_.center_parameter_update_method() == kElasticAverage) { - if (!isFirstPass_) { - // restore local value from delta - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(PARAMETER_VALUE) - ->copyFrom(*para->getBuf(PARAMETER_DELTA)); - } - } - } else { // average - // copy value to delta - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); - } - } - } -} - -bool RemoteParameterUpdater::finishPass() { - if (localUpdater_) { - localUpdater_->finishPass(); - } - - if (config_.algorithm() == TrainAlgorithm::SGD) { - parameterClient_->waitPassFinish(); - } else { - parameterClient_->asyncFinishPass(); - } - if (localUpdater_) { - if (config_.center_parameter_update_method() == kElasticAverage) { - // backup local value to delta as we will get - // the remote parameter for saving/testing - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); - } - } - } - parameterClient_->getParameter(); - copyParametersToDevice(PARAMETER_VALUE); - - isFirstPass_ = false; - return true; -} - -void RemoteParameterUpdater::apply() { - if (useApplyInPserver_) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_APPLY); - parameterClient_->doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false); - parameterClient_->getParameter( - /* recvParameterType= */ PARAMETER_VALUE, - /* sendBackParameterType= */ PARAMETER_APPLY); - copyParametersToDevice(PARAMETER_VALUE); - } -} - -void RemoteParameterUpdater::restore() { - if (useApplyInPserver_) { - parameterClient_->getParameter(); - copyParametersToDevice(PARAMETER_VALUE); - } -} - -ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater( - OptimizationConfig config, - int passCount, - std::unique_ptr&& localUpdater) - : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) { - sendThread_.reset(new std::thread([this]() { this->send(); })); - recvThread_.reset(new std::thread([this]() { 
this->recv(); })); - - stopping_ = false; - oneBatchFinished_ = false; - separateSendAndRecv_ = true; -} - -ConcurrentRemoteParameterUpdater::~ConcurrentRemoteParameterUpdater() { - stopping_ = true; - sendQueue_.enqueue(0); - sendThread_->join(); - recvQueue_.enqueue(0); - recvThread_->join(); -} - -void ConcurrentRemoteParameterUpdater::finishBatch(real cost) { - if (localUpdater_) { - localUpdater_->finishBatch(cost); - - if (!needToUpdateRemotely()) { - ++numBatches_; - return; - } - } - - sendQueue_.enqueue(kFinishBatchPid); - - finishBatchCond_.wait([this]() { return oneBatchFinished_; }); - oneBatchFinished_ = false; - { - REGISTER_TIMER("sync_hostToDeviceStream"); - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - hl_stream_synchronize(kHostToDeviceStream); - } - } - - if (localUpdater_) { - ++numBatches_; - } -} - -// Use para=NULL to signal the end of one batch -void ConcurrentRemoteParameterUpdater::send(Parameter* para) { - const std::string& algorithm = config_.algorithm(); - ParameterUpdateMode mode; - if (algorithm == TrainAlgorithm::AsyncSGD) { - mode = PSERVER_UPDATE_MODE_ASYNC_SGD; - } else if (algorithm == TrainAlgorithm::SGD) { - mode = PSERVER_UPDATE_MODE_ADD_GRADIENT; - } else { - LOG(FATAL) << "Unknown algorithm: " << algorithm; - } - ParameterType sendType; - if (localUpdater_) { - sendType = PARAMETER_DELTA; - } else { - // In this case, we perform SGD on pserver. - sendType = PARAMETER_GRADIENT; - } - std::vector paraSegment; - if (para == NULL) { - parameterClient_->sendParameter( - mode, - sendType, - paraSegment, - batchSize_, - 0, // cost=0 - true, // sendBackParameter = true - batchStatus_); // batchStatus_ = BATCH_FINISH - - } else { - ParameterSegments paraSegTemp; - paraSegment.reserve(1); - paraSegTemp.name = para->getName(); - paraSegTemp.id = para->getID(); - paraSegment.push_back(paraSegTemp); - { - SetDevice device(para->getDeviceId()); - REGISTER_TIMER("copySingleParaFromDevice"); - copySingleParaFromDevice(para, sendType); - hl_stream_synchronize(kDeviceToHostStream); - } - parameterClient_->sendParameter(mode, - sendType, - paraSegment, - batchSize_, - 0, // cost=0 - true, // sendBackParameter = true - batchStatus_); - if (batchStatus_ == BATCH_START) batchStatus_ = BATCH_ON; - } -} -void ConcurrentRemoteParameterUpdater::recv(Parameter* para) { - parameterClient_->recvParameter(); - if (para != NULL) { - REGISTER_TIMER("copySingleParaToDevice"); - SetDevice device(para->getDeviceId()); - copySingleParaToDevice(para, PARAMETER_VALUE); - - if (localUpdater_) { - para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE)); - } else { - // if cpu, parameter should not changes until recvParameter(). 
- // if gpu, zero mem when send finish - if (!FLAGS_use_gpu) { - para->getBuf(PARAMETER_GRADIENT)->zeroMem(); - } - } - } -} - -void ConcurrentRemoteParameterUpdater::recv() { - if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id); - StatPtr stat = getStat("recv"); - FOR_TIMING(Timer timer); - while (true) { - int pid; - { - REGISTER_TIMER("recv_dequeue"); - pid = recvQueue_.dequeue(); - } - if (pid == kFinishBatchPid) { - Parameter* para = NULL; - FOR_TIMING(timer.start()); - recv(para); - FOR_TIMING(timer.stop()); - FOR_TIMING(stat->addSample(timer.get())); - FOR_TIMING(timer.reset()); - finishBatchCond_.notify_all([this] { oneBatchFinished_ = true; }); - } else { - if (stopping_) break; - Parameter* para = parameters_[pid].get(); - FOR_TIMING(timer.start()); - recv(para); - FOR_TIMING(timer.stop()); - oneBatchFinished_ = false; - } - } -} - -void ConcurrentRemoteParameterUpdater::send() { - if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id); - StatPtr stat = getStat("send"); - FOR_TIMING(Timer timer); - while (true) { - int pid; - { - REGISTER_TIMER("send_dequeue"); - pid = sendQueue_.dequeue(); - } - if (pid == kFinishBatchPid) { - batchStatus_ = BATCH_FINISH; - if (!localUpdater_) { - // if cpu, parameter should not changes until recvParameter(). - // if gpu, zeroMem() at the end of batch so that it won't - // interfere with computation. - if (FLAGS_use_gpu) { - REGISTER_TIMER("para_zeroMem"); - for (auto& para : parameters_) { - SetDevice device(para->getDeviceId()); - para->getBuf(PARAMETER_GRADIENT)->zeroMem(); - } - } - } - Parameter* para = NULL; - FOR_TIMING(timer.start()); - send(para); - FOR_TIMING(timer.stop()); - FOR_TIMING(stat->addSample(timer.get())); - FOR_TIMING(timer.reset()); - recvQueue_.enqueue(pid); - } else { - if (stopping_) break; - Parameter* para = parameters_[pid].get(); - if (localUpdater_) { - // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/ - para->getBuf(PARAMETER_DELTA) - ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f); - } - FOR_TIMING(timer.start()); - send(para); - FOR_TIMING(timer.stop()); - recvQueue_.enqueue(nonStaticParaIDMap_[para->getID()]); - } - } -} - -void ConcurrentRemoteParameterUpdater::updateImpl(Parameter* para) { - REGISTER_TIMER("update"); - if (localUpdater_) { - localUpdater_->update(para); - if (!needToUpdateRemotely()) { - return; - } - } - sendQueue_.enqueue(nonStaticParaIDMap_[para->getID()]); -} - -void ConcurrentRemoteParameterUpdater::copySingleParaToDevice( - Parameter* para, ParameterType parameterType) { - if (!FLAGS_use_gpu) { - return; - } - int i = nonStaticParaIDMap_[para->getID()]; - para->getBuf(parameterType) - ->copyFrom(*cpuParameters_[i]->getBuf(parameterType), - kHostToDeviceStream); - if (parameterType == PARAMETER_VALUE) { - para->setValueUpdated(); - } -} - -void ConcurrentRemoteParameterUpdater::copySingleParaFromDevice( - Parameter* para, ParameterType parameterType) { - if (!FLAGS_use_gpu) { - return; - } - int i = nonStaticParaIDMap_[para->getID()]; - cpuParameters_[i] - ->getBuf(parameterType) - ->copyFrom(*para->getBuf(parameterType), kDeviceToHostStream); -} - -SparseRemoteParameterUpdater::SparseRemoteParameterUpdater( - const OptimizationConfig& config, int expectedPassCount, bool testing) - : config_(config), - passCount_(0), - expectedPassCount_(expectedPassCount), - testing_(testing), - useApplyInPserver_(false) {} - -void SparseRemoteParameterUpdater::init( - const std::vector& parameters) { - ParameterUpdater::init(parameters); - - parameterClient_.reset(new ParameterClient2( - false, FLAGS_port + 
FLAGS_ports_num, FLAGS_ports_num_for_sparse)); - parameterClient_->init(parameters_); - parameterClient_->setTrainerId(FLAGS_trainer_id); - - if (FLAGS_trainer_id == 0) { - parameterClient_->setConfig( - config_, FLAGS_save_dir, true /*is_sparse_server*/); - if (parameters[0]->isFullSize()) { - parameterClient_->setParameter(); - } else { // init in pserver - parameterClient_->setParameterZero(); - } - } - if (FLAGS_trainer_id == 0 && !testing_ && - config_.algorithm() == TrainAlgorithm::SGD) { - startController(); - useApplyInPserver_ = useApplyInPserver(config_); - } -} - -void SparseRemoteParameterUpdater::startController() { - controllerThread_.reset(new std::thread([this]() { this->controller(); })); -} - -void SparseRemoteParameterUpdater::controller() { - ParameterClient2 client( - false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse); - client.init(parameters_); - - while (true) { - /*start pass*/ { - client.waitPassStart(); - - PreparedOperations ops; - ops.addOperation(PSERVER_OP_START_PASS); - client.doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false, - /* releasePass= */ false); - } - - while (true) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_SGD); - client.doOperation(ops, - /* waitForGradient= */ true, - /* sendBackarameter= */ true, - /* releasePass= */ false); - if (client.isPassFinish()) { - break; - } - } - - /*finish pass*/ { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_FINISH_PASS); - client.doOperation(ops, - /* waitForGradient= */ true, - /* sendBackarameter= */ true, - /* releasePass= */ true); - } - - passCount_++; - if (passCount_ == expectedPassCount_) { - break; - } - } -} - -PassType SparseRemoteParameterUpdater::startBatch(int64_t batchSize) { - batchSize_ = batchSize; - return PASS_TRAIN; -} - -void SparseRemoteParameterUpdater::finishBatch(real cost) { - const std::string& algorithm = config_.algorithm(); - ParameterUpdateMode mode; - if (algorithm == TrainAlgorithm::AsyncSGD) { - mode = PSERVER_UPDATE_MODE_ASYNC_SGD; - } else if (algorithm == TrainAlgorithm::SGD) { - mode = PSERVER_UPDATE_MODE_ADD_GRADIENT; - } else { - LOG(FATAL) << "Unknown algorithm: " << algorithm; - } - - ParameterType sendType = PARAMETER_GRADIENT; - - REGISTER_TIMER("sendSparseParam"); - parameterClient_->sendAndReceiveParameter(mode, - sendType, - batchSize_, - 0, // cost = 0 - false); // sendBackParameter - - // grad zero move to sgd grad machine, before merge grad sparse remote -} - -void SparseRemoteParameterUpdater::startPass() { - if (config_.algorithm() == TrainAlgorithm::SGD) { - parameterClient_->waitPassStart(); - } else { - if (FLAGS_trainer_id == 0) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_START_PASS); - parameterClient_->doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false); - } - parameterClient_->asyncStartPass(); - } -} - -bool SparseRemoteParameterUpdater::finishPass() { - if (config_.algorithm() == TrainAlgorithm::SGD) { - parameterClient_->waitPassFinish(); - } else { - if (FLAGS_trainer_id == 0) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_FINISH_PASS); - parameterClient_->doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false); - } - parameterClient_->asyncFinishPass(); - } - - return true; -} - -// Trainer will call getParametersRemote at batch start or before save, -// so we do not get values in apply() and restore(). 
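// For illustration only: the control flow of the trainer-0 controller() thread,
// reduced to a standalone sketch. PserverClient is a hypothetical stand-in for
// ParameterClient2 (stubbed to simulate three mini-batches per pass); the enum
// values mirror the PSERVER_OP_* operations sent via doOperation().

#include <cstdint>

struct PserverClient {
  int batchesLeft = 0;
  void waitPassStart() { batchesLeft = 3; }  // stub: three batches per pass
  // The three booleans mirror waitForGradient / sendBackParameter / releasePass.
  void doOperation(int /*op*/, bool, bool, bool) {}
  bool isPassFinish() { return --batchesLeft <= 0; }
};

enum PserverOp { OP_START_PASS, OP_SGD, OP_FINISH_PASS };

// One outer iteration per pass; one SGD operation per mini-batch until the
// parameter servers report that the pass has finished.
void controllerLoop(PserverClient& client, std::int64_t expectedPassCount) {
  for (std::int64_t pass = 0; pass < expectedPassCount; ++pass) {
    client.waitPassStart();
    client.doOperation(OP_START_PASS, false, false, false);
    while (true) {
      client.doOperation(OP_SGD, true, true, false);
      if (client.isPassFinish()) break;
    }
    client.doOperation(OP_FINISH_PASS, true, true, true);
  }
}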
-void SparseRemoteParameterUpdater::apply() { - if (useApplyInPserver_) { - PreparedOperations ops; - ops.addOperation(PSERVER_OP_APPLY); - parameterClient_->doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false); - } -} - -void SparseRemoteParameterUpdater::restore() {} - -void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize, - bool apply) { - ParameterType sendBackParameterType = - (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE; - std::function getParams; - std::function applyL1; - if (fullSize) { - getParams = [&] { - parameterClient_->getParameter( - /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); - }; - applyL1 = [](Parameter& para, real decayRate) { - para.getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); - }; - } else { - getParams = [&] { - parameterClient_->getParameterSparse( - /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); - }; - applyL1 = [](Parameter& para, real decayRate) { - para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); - }; - } - { - REGISTER_TIMER("getParamDenseAndSparse"); - getParams(); - if (config_.shrink_parameter_value() > 0) { - for (auto& para : parameters_) { - if (para->getConfig().decay_rate_l1() > 0) { - applyL1(*para, config_.shrink_parameter_value()); - } - } - } - } -} - -void SparseRemoteParameterUpdater::randParametersRemote() { - CHECK_EQ(FLAGS_trainer_id, 0); - - PreparedOperations ops; - ops.addOperation(PSERVER_OP_RANDOMIZE); - parameterClient_->doOperation(ops, - /* waitForGradient= */ false, - /* sendBackarameter= */ false); -} - -void SparseRemoteParameterUpdater::loadParametersRemote( - const std::string& dirName) { - if (FLAGS_trainer_id == 0) { - parameterClient_->loadValueVector(dirName); - } - - if (testing_) { - // we do not use synchronize() here, - // because test mode may run only one tester - if (FLAGS_trainer_id == 0) { - parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY); - } else { - parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY); - } - } -} - -void SparseRemoteParameterUpdater::saveParametersRemote( - const std::string& dirName) { - if (FLAGS_trainer_id == 0) { - parameterClient_->saveValueVector(dirName); - } -} - -void SparseRemoteParameterUpdaterComposite::init( - const std::vector& parameters) { - parameters_ = parameters; - - std::vector parametersArray[NUMBER_UPDATERS]; - - for (auto& para : parameters_) { - if (para->isSparseRemoteUpdate()) { - parametersArray[UPDATER_SPARSE_REMOTE].push_back(para); - } else { - parametersArray[UPDATER_NORMAL].push_back(para); - } - } - CHECK(!parametersArray[UPDATER_SPARSE_REMOTE].empty()); - CHECK(!parametersArray[UPDATER_NORMAL].empty()); - - syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) { - updaters_[tid]->init(parametersArray[tid]); - }); - - parameterTypes_ = updaters_[UPDATER_NORMAL]->getParameterTypes(); -} - -std::vector> - ParameterUpdaterCreators::constructors_; - -} // namespace paddle diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h deleted file mode 100644 index 3a40a46354efd6b92278884c8f5b72504a3ff283..0000000000000000000000000000000000000000 --- a/paddle/trainer/RemoteParameterUpdater.h +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "ParameterUpdater.h" -#include "paddle/pserver/ParameterClient2.h" -#include "paddle/utils/Queue.h" -#include "paddle/utils/Util.h" - -namespace paddle { - -// TODO(yanfei): -// I think that the biggest feature of rdma is packet lossless control -// feature instead of high bandwiths, zero copy and gpu-direct rdma in -// theroy. -// But zero-copy and gpu-direct rdma features can help to reduce latency -// caused by os system. -// So, for some specified cluster, such as high density gpu cluster, -// gpu-direct and zero copy could help to improve cluster communication -// performance. -// - -/** - * Normal remote parameter updater for dense parameters. - * - * It first packs all parameters for all pservers using ParameterClient - * module, then wait for merged parameters data from all pservers. - * The synchronization pattern specified by sync-sgd or async-sgd is - * achieved by all pservers with the help of the controller within this - * remote parameter updater. - * This module indeedly bridges the gradient machines and parameter servers. - * It helps to transfer the parameters from acceleration device to cpu end - * for network. It contains additional parameters copy buffers for - * acceleration devices at cpu end, such as gpu, otherwise it will - * directly use original parameters data to update pservers. - * - * This remote parameter updater does not use pipeline mechanism to hide - * copy latency from gpu to cpu buffer. In addition the overlapped between - * backward and communication is not supported. - */ -class RemoteParameterUpdater : public ParameterUpdater { - public: - RemoteParameterUpdater( - const OptimizationConfig& config, - int expectedPassCount, - std::unique_ptr&& localUpdater = nullptr); - ~RemoteParameterUpdater() { - if (controllerThread_) { - controllerThread_->join(); - } - } - - /** - * initialize the internal parameter client and itself. - */ - virtual void init(const std::vector& parameters); - /** - * @brief start batch - * - * @note one batch training exhibits stateful feature to help - * to do performance tuning, sgd optimization if necessary. - */ - virtual PassType startBatch(int64_t batchSize) { - if (localUpdater_) { - localUpdater_->startBatch(batchSize); - } - batchSize_ = batchSize; - batchStatus_ = BATCH_START; - return PASS_TRAIN; - } - - /** - * send parameters to pservers and get returned parameters - * from all pservers if necessary. it will implictly - * cooperate with controller thread for sync-sgd. - */ - virtual void finishBatch(real cost); - virtual void startPass(); - virtual bool finishPass(); - -#ifndef PADDLE_DISABLE_TIMER - virtual void setForwardbackwardTime(uint64_t delta) { - parameterClient_->setForwardbackwardTime(delta); - } -#endif - - virtual void apply(); - virtual void restore(); - - protected: - /** - * control all pservers with all trainers for sync-sgd - */ - virtual void controller(); - - /** - * work need to do after finishBatch - */ - virtual void updateImpl(Parameter* para); - - void startController(); - - /** - * @brief copy parameters from cpu host to device, such as gpu. 
- * - * @note return if all data are transfered. - */ - void copyParametersToDevice(ParameterType parameterType); - - /** - * @brief copy parameters from device to cpu host - * - * @note return if all data are transfered - */ - void copyParametersFromDevice(ParameterType parameterType); - - protected: - /// Optimization config used to guide initialization and finishBatch - OptimizationConfig config_; - /// internal parameter client object for exchanging data with pserver - std::unique_ptr parameterClient_; - /// internal shadow buffer at cpu host end, use original parameters_ - /// if no acceleration devices are used. - std::vector cpuParameters_; - /// local updater for aggregating multi-batches local delta - std::unique_ptr localUpdater_; - /// the size of mini-batch - int64_t batchSize_; - /// batches passed - int64_t numBatches_; - /// for stateful control - BatchStatus batchStatus_; - /// controller thread for sync-sgd - std::unique_ptr controllerThread_; - /// passed already finished - int64_t passCount_; - /// expected passes to finished - int64_t expectedPassCount_; - /// use normal synchronization communication if True - bool separateSendAndRecv_; - /// true if it's first pass - bool isFirstPass_; - bool useApplyInPserver_; - - static const std::string kAverage; - static const std::string kElasticAverage; -}; - -// TODO(yanfei): -// do parameters level synchronization Optimization at pserver end with -// ConcurrentRemoteParameterUpdater to get more parallelization, at last -// to really hide pserver latency in backward computation. -// -/** - * This updater add additional optimization for overlapping synchronization - * from pservers with backward computation. - * - * Parameter can be sent to pservers when related backward stage is finished. - * This concurrent udpater does data copy from acceleration device to host - * memory aynchronously. In addition internal parameter client reads data in - * host memory and send them to all pservers in next stage. So this class - * help to pipeline device-to-host copy and host-to-network to hide network - * latency in backward stage. - * It contains separate send and recv thread for pipeline usage. - */ -class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater { - public: - ConcurrentRemoteParameterUpdater( - OptimizationConfig config, - int expectedPassCount, - std::unique_ptr&& localUpdater); - ~ConcurrentRemoteParameterUpdater(); - - /** - * @brief send paraemeters to all pservers - * - * @note it just signal the end signal to internal parameter client - * to finished the aynchronous send action. In addition it also - * do synchronization for all asynchronous host-to-device copy. 
- */ - virtual void finishBatch(real cost); - - protected: - virtual void updateImpl(Parameter* para); - /// internal thread called in send thread - void send(Parameter* para); // para == NULL indicate end of a minibatch - /// internal function called in recv thread - void recv(Parameter* para); - /** - * @brief send thread for relaying data from gradient to parameter client - * - * @note just pipe data to internal parameter client for pipeline - */ - void send(); - /** - * @brief recv thread for relaying data from internal parameter client to - * host memory - * - * @note it contains the asynchronous data copy form host to device - */ - void recv(); - /// copy specified parameter from host to device - void copySingleParaToDevice(Parameter* para, ParameterType parameterType); - /// copy specified parameter from device to host - void copySingleParaFromDevice(Parameter* para, ParameterType parameterType); - bool needToUpdateRemotely() { - return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0; - } - - private: - /// send thread used for overlapping - std::unique_ptr sendThread_; - /// recv thread used for overlapping - std::unique_ptr recvThread_; - /// buffer queue for overlapping - Queue sendQueue_; - /// buffer queue for overlapping - Queue recvQueue_; - /// flags indicating to stop - bool stopping_; - /// conditional variable for threads synchronization between the - /// thread calling finishBatch and internal recv thread - LockedCondition finishBatchCond_; - bool oneBatchFinished_; -}; - -// TODO(yanfei): -// merge sparse updater with dense updater, and could help to reduce -// the synchronization between sparse and dense udpater. it could also -// reduce the threads for managing all connections. -/** - * This class is specified for updating sparse parameters. - * - * It allows part of parameter to be exchanged with all pservers. - * If sparse input assigned, part gradients of first hidden layer - * could remained zero which can not need to be exchanged within - * all pservers. This is the key optimization point for this updater - * - * For updating sparse parameters, all latest parameters are stored - * in pservers instead of keeping full copy at train end, so need to - * prefetch parameters weight value which can be changed in next-batch - * before doing next forwardbackward. Also, with above fact that the - * parameters can be stored in pserver instead of trainer, we can - * fetch specified parmeters if necessary, and can support huge - * parameters which is larger enough than the RAM size in single - * node. - * - * Internally, this updater will direct internal parameter client - * to encapsulate sparse specified message for all pservers. 
- */ -class SparseRemoteParameterUpdater : public ParameterUpdater { - public: - SparseRemoteParameterUpdater(const OptimizationConfig& config, - int expectedPassCount, - bool testing); - ~SparseRemoteParameterUpdater() { - if (controllerThread_) { - controllerThread_->join(); - } - } - - /// initialization - virtual void init(const std::vector& parameters); - - /// stateful batch control - virtual PassType startBatch(int64_t batchSize); - /// send all sparse related parameters to all pservers - virtual void finishBatch(real cost); - virtual void startPass(); - virtual bool finishPass(); - - virtual void apply(); - virtual void restore(); - - /// load parameters from pservers - virtual void loadParametersRemote(const std::string& dirName); - /// save parameters to pservers - virtual void saveParametersRemote(const std::string& dirName); - /** - * @brief get latest sparse parameters value from all pservers - * - * @note call it before next mini-batch - */ - virtual void getParametersRemote(bool fullSize, bool apply); - virtual void randParametersRemote(); -#ifndef PADDLE_DISABLE_TIMER - virtual void setForwardbackwardTime(uint64_t delta) { - parameterClient_->setForwardbackwardTime(delta); - } -#endif - - protected: - /// update implimentation, not implemented - virtual void updateImpl(Parameter* para) {} - - /// internal controller routine for controller thread - virtual void controller(); - - /// start controller thread - void startController(); - - protected: - /// optimization config - OptimizationConfig config_; - /// internal parameter client - std::unique_ptr parameterClient_; - int64_t batchSize_; - std::unique_ptr controllerThread_; - int64_t passCount_; - int64_t expectedPassCount_; - bool testing_; - bool useApplyInPserver_; -}; - -/** - * Class for supporting normal updater and sparse updater - * - * Not all parts of one model are sparse, so it exists dense updater - * for normal layers while sparse updater is for sparse layers. - * - * it directly call internal dense and sparse udpater individually. - */ -class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite { - public: - enum { - UPDATER_SPARSE_REMOTE = 0, // execute in sync thread pool(tid:0) - UPDATER_NORMAL = 1, // execute in Owner thread(tid:1) - NUMBER_UPDATERS = 2, - }; - /** - * @brief create one dense updater and one sparse updater - * - * @note use syncThreadPool to synchronize these two updaters - */ - SparseRemoteParameterUpdaterComposite( - const OptimizationConfig& config, - int expectedPassCount, - bool testing, - std::unique_ptr&& normalUpdater) { - updaters_.resize(NUMBER_UPDATERS); - updaters_[UPDATER_SPARSE_REMOTE].reset( - new SparseRemoteParameterUpdater(config, expectedPassCount, testing)); - updaters_[UPDATER_NORMAL] = std::move(normalUpdater); - - syncThreadPool_.reset(new SyncThreadPool(NUMBER_UPDATERS - 1)); - } - - /// initialization of dense and sparse updaters - virtual void init(const std::vector& parameters); -}; - -class ParameterUpdaterCreators { - public: - /** - * @brief add a creator to create custom ParameterUpdater while training. - * The creator is a function with type (alogrithm, optConfig, isLocal, - * numPasses) -> ParameterUpdater*. Trainer will use this - * ParameterUpdater if creator can create a no nullptr - * ParameterUpdater. Return nullptr will use trainer's default - * updaters. - * - * @param creator method which can create ParameterUpdater. 
- */ - static void addCreator( - const std::function& creator) { // NOLINT explicit move closing ) in this line - // for readability - constructors_.push_back(creator); - } - - /** - * @brief Try to create an updater by given algo, optConfig, isLocal, - * numPasses. Return nullptr if cannot create anyone. - * @param algo algorithm string. - * @param optConfig optimization config. - * @param isLocal is in local mode or not. - * @param numPasses total passes that trainer will train. - * @return nullptr if fail, not nullptr if we can create an updater. - */ - static ParameterUpdater* tryCreateUpdater(const std::string& algo, - const OptimizationConfig& optConfig, - bool isLocal, - size_t numPasses) { - for (auto& c : constructors_) { - if (auto updater = c(algo, optConfig, isLocal, numPasses)) { - return updater; - } - } - return nullptr; - } - - private: - static std::vector> - constructors_; -}; - -} // namespace paddle diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp deleted file mode 100644 index 16e676d60248dfe6d443c50fbf34970e63c1f412..0000000000000000000000000000000000000000 --- a/paddle/trainer/Tester.cpp +++ /dev/null @@ -1,380 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Tester.h" - -#include -#include - -#include -#include -#include -#include - -#include - -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -#include "TesterConfig.h" -#include "paddle/gserver/gradientmachines/GradientMachineMode.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" - -namespace paddle { - -Tester::Tester(const std::shared_ptr& config, - std::unique_ptr&& intconfig, - const GradientMachinePtr& gradientMachine, - const std::shared_ptr& parameterUpdater, - std::shared_ptr testDataProvider) - : config_(config), - intconfig_(std::move(intconfig)), - gradientMachine_(gradientMachine), - parameterUpdater_(parameterUpdater), - testDataProvider_(testDataProvider) { - if (config_->getOptConfig().use_sparse_remote_updater()) { - LOG(FATAL) << "It's prohibited to set sparse_remote_update " - << "when doing train and test jobs in the same " - << "process. 
You could run paddle --job=test in " - << "a separate process."; - } - testEvaluator_.reset(gradientMachine_->makeEvaluator()); - if (intconfig_->distributeTest) { - testParameterClient_.reset(new ParameterClient2(true)); - } - - if (testParameterClient_) { - testParameterClient_->init(gradientMachine_->getParameters()); - } - - std::unique_ptr paramConfig( - new ParameterUtilConfig(intconfig_->saveOnlyOne, - intconfig_->savingPeriod, - intconfig_->loadsaveParametersInPserver, - intconfig_->config)); - - paramUtil_.reset(new ParameterUtil( - config_, std::move(paramConfig), gradientMachine_, parameterUpdater_)); -} - -void Tester::startTestPeriod() { - if (testDataProvider_) { - testDataProvider_->reset(); - } - testEvaluator_->start(); - testContext_.cost = 0; - testContext_.numSamples = 0; - - parameterUpdater_->apply(); - if (intconfig_->prevBatchState) { - gradientMachine_->getState(*intconfig_->trainState); - gradientMachine_->setState(*intconfig_->testState); - } -} - -void Tester::testOneDataBatch(const DataBatch& dataBatch, - std::vector* outArgs) { - testContext_.cost += - forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs); - testContext_.numSamples += dataBatch.getSize(); -} - -void Tester::testOnePeriod() { - DataBatch dataBatch; - int64_t batchSize = config_->getOptConfig().batch_size(); - std::vector outArgs; - startTestPeriod(); - while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) { - testOneDataBatch(dataBatch, &outArgs); - } - finishTestPeriod(); -} - -void Tester::finishTestPeriod() { - if (intconfig_->prevBatchState) { - gradientMachine_->resetState(); - } - testEvaluator_->finish(); - CHECK_GT(testContext_.numSamples, 0) - << "There is no samples in your test batch. Possibly " - "wrong implementation of DataProvidor.reset()"; - LOG(INFO) << " Test samples=" << testContext_.numSamples - << " cost=" << testContext_.cost / testContext_.numSamples - << " Eval: " << *testEvaluator_; - parameterUpdater_->restore(); - if (intconfig_->prevBatchState) { - gradientMachine_->getState(*intconfig_->testState); - gradientMachine_->setState(*intconfig_->trainState); - } -} - -int64_t Tester::testOneBatchById(int64_t batchId) { - DataBatch dataBatch; - int32_t batchSize = config_->getOptConfig().batch_size(); - - testDataProvider_->getNextBatch(batchSize, &dataBatch); - - int64_t actualBatchSize = dataBatch.getSize(); - if (actualBatchSize == 0) { - return 0; - } - - std::vector outArgs; - - stats_ += std::pair{ - actualBatchSize, - forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)}; - - if (((batchId + 1) % intconfig_->logPeriod) == 0) { - LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false); - } - - return actualBatchSize; -} - -real Tester::forwardOneBatch(const DataBatch& dataBatch, - Evaluator* evaluator, - std::vector* pOutArgs) { - auto& outArgs = *pOutArgs; - const std::vector& inArgs = dataBatch.getStreams(); - if (intconfig_->loadsaveParametersInPserver) { - REGISTER_TIMER("prefetch"); - gradientMachine_->prefetch(inArgs); - parameterUpdater_->getParametersRemote(false /*full parameter*/, - true /*after apply*/); - } - - gradientMachine_->forward(inArgs, &outArgs, PASS_TEST); - - // write features if set this flag and outArgs is not empty - std::string featFile = intconfig_->featFile; - if (!featFile.empty() && outArgs.empty()) { - size_t numOutputs = outArgs.size(); - std::vector featMatrices; - featMatrices.resize(numOutputs); - for (size_t i = 0; i < numOutputs; ++i) { - featMatrices[i] = 
Matrix::create(outArgs[i].value->getHeight(), - outArgs[i].value->getWidth(), - false, - false); // CPU data buffer - featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT); - } - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - FILE* fp = fopen(featFile.c_str(), "ab+"); - CHECK(!ferror(fp)) << "Fail to open " << featFile; - - size_t sampleNum = featMatrices[0]->getHeight(); - for (size_t i = 0; i < sampleNum; ++i) { - for (size_t j = 0; j < numOutputs; ++j) { - size_t dim = featMatrices[j]->getWidth(); - fwrite(featMatrices[j]->getData() + i * dim, sizeof(real), dim, fp); - } - } - fclose(fp); - } - if (evaluator) { - gradientMachine_->eval(evaluator); - } - - // Save the output layers if predict_output_dir is not empty - std::string predictOutputDir = intconfig_->predictOutputDir; - if (!predictOutputDir.empty() && !outArgs.empty()) { - CHECK(intconfig_->testing) << "Only valid in test mode"; - if (!os_.is_open()) { - // TODO(yuyang18): Refactor these lines. - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "rank-%05d", intconfig_->trainerId); - mkDir(predictOutputDir.c_str()); - std::string filename = path::join(predictOutputDir, buf); - os_.open(filename, std::ofstream::trunc); - CHECK(os_.is_open()) << "Failed to open file " << filename; - } - printOutput(outArgs, os_); - return 0.0; // In this case, there is no meaning to calculate cost - } - - return Argument::sum(outArgs); -} - -void Tester::testOnePassBatch(int passId) { - stats_.reset(); - const std::vector inArgs; - gradientMachine_->forward(inArgs, nullptr, PASS_TEST); - int64_t num; - real cost; - gradientMachine_->getStats(cost, num); - stats_ += std::pair{num, cost}; - gradientMachine_->onPassEnd(); - - LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false); -} - -void Tester::testOnePass(int passId) { - stats_.reset(); - int64_t batchId = 0; - int num = 0; - if (intconfig_->prevBatchState) { - gradientMachine_->resetState(); - } - - testEvaluator_->start(); - - do { - num = testOneBatchById(batchId); - ++batchId; - } while (num > 0); - - gradientMachine_->onPassEnd(); - testEvaluator_->finish(); - - LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false) - << " Eval: " << *testEvaluator_; - - if (intconfig_->distributeTest) { - testEvaluator_->distributeEval(testParameterClient_.get()); - if (0 == intconfig_->trainerId) { - LOG(INFO) << "distribute eval: " << *testEvaluator_; - } - } -} - -void Tester::test() { - CHECK(testDataProvider_) << "TestData is not specified"; - testDataProvider_->setSkipShuffle(); - testDataProvider_->reset(); - gradientMachine_->start(); - - // For evaluation - std::vector modelList; - std::string modelListFromConfig = intconfig_->modelList; - std::string initModelPath = intconfig_->initModelPath; - if (!modelListFromConfig.empty()) { - loadFileList(modelListFromConfig, modelList); - intconfig_->testPass = 0; - intconfig_->numPasses = modelList.size(); - intconfig_->savingPeriod = 1; - CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation"; - } else if (!initModelPath.empty()) { - modelList.push_back(initModelPath); - intconfig_->testPass = 0; - intconfig_->numPasses = 1; - intconfig_->savingPeriod = 1; - CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation"; - } - - for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) { - int passId = i; - if (passId % intconfig_->savingPeriod == 0) { - if (intconfig_->testWait) { - while (paramUtil_->loadParameters( - passId, true /*local*/, true 
/*remote*/) == false) { - LOG(INFO) << "Waiting for parameters of pass " << passId; - sleep(60); // sleep 60s - } - } else { - if (modelList.size() == 0) { - CHECK_EQ(paramUtil_->loadParameters( - passId, true /*local*/, true /*remote*/), - true); - } else { - paramUtil_->loadParametersWithPath( - modelList[i], true /*local*/, true /*remote*/); - } - } - if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) { - testOnePassBatch(passId); - } else { - testOnePass(passId); - } - if (passId + intconfig_->savingPeriod < intconfig_->numPasses) { - // if there is at least 1 more pass to test, then call reset, - // otherwise not. - testDataProvider_->reset(); - } - } - } - - gradientMachine_->finish(); -} - -void Tester::printOutput(const std::vector& outArgs, - std::ostream& os) { - size_t numOutputs = outArgs.size(); - size_t numIns = outArgs[0].getBatchSize(); - if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) { - cpuMat_.resize(numOutputs, nullptr); - cpuVec_.resize(numOutputs, nullptr); - } - - for (size_t i = 0; i < numOutputs; ++i) { - if (outArgs[i].value != nullptr) { - if (outArgs[i].value->useGpu()) { - if (dynamic_cast(outArgs[i].value.get())) { - size_t dim = outArgs[i].value->getWidth(); - Matrix::resizeOrCreate(cpuMat_[i], numIns, dim, false, false); - cpuMat_[i]->copyFrom(*outArgs[i].value); - } else if (dynamic_cast(outArgs[i].value.get())) { - auto sparseMat = - dynamic_cast(outArgs[i].value.get()); - cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(), - sparseMat->getWidth(), - sparseMat->getElementCnt(), - sparseMat->getValueType(), - sparseMat->format_, - false, /* trans */ - false); /* useGpu */ - hl_stream_t stream = HPPL_STREAM_DEFAULT; - cpuMat_[i]->copyFrom(*sparseMat, stream); - } else { - LOG(WARNING) << "Not supported gpu matrix type"; - } - } - } else if (outArgs[i].ids != nullptr) { - if (outArgs[i].ids->useGpu()) { - IVector::resizeOrCreate(cpuVec_[i], outArgs[i].ids->getSize(), false); - cpuVec_[i]->copyFrom(*outArgs[i].ids); - } - } else if (outArgs[i].strs != nullptr) { - continue; - } else { - LOG(WARNING) << "outArgs[" << i << "] has no data to print"; - } - } - - for (size_t i = 0; i < numIns; ++i) { - for (size_t j = 0; j < numOutputs; ++j) { - if (outArgs[j].value != nullptr) { - if (outArgs[j].value->useGpu()) { - cpuMat_[j]->printOneRow(os, i); - } else { - outArgs[j].value->printOneRow(os, i); - } - } else if (outArgs[j].ids != nullptr) { - if (outArgs[j].ids->useGpu()) { - cpuVec_[j]->printOneElement(os, i); - } else { - outArgs[j].ids->printOneElement(os, i); - } - } else if (outArgs[j].strs != nullptr) { - os << (*outArgs[j].strs)[i] << ";"; - } - } - os << std::endl; - } -} -} // namespace paddle diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h deleted file mode 100644 index 801c77e3116369732bf4b03107adce6a71dc2184..0000000000000000000000000000000000000000 --- a/paddle/trainer/Tester.h +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Util.h" - -#include - -#include "hl_gpu.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" - -#include "TrainerConfig.pb.h" - -#include -#include -#include "ParamUtil.h" -#include "ParameterUpdater.h" -#include "TesterConfig.h" -#include "TrainerInternalConfig.h" - -namespace paddle { - -/** - * Neural Network test logics code. - * It is a private class for Trainer. - */ -class Tester { - public: - /** - * Ctor - * @param config Trainer Config. - * @param intconfig Tester Config. - * @param gradientMachine Gradient machine(neuralnetwork) that will be tested. - * @param parameterUpdater Parameter Updater. Not for updating parameter, just - * for getting parameter from parameter-server. - * @param testDataProvider Test data provider. - */ - Tester(const std::shared_ptr& config, - std::unique_ptr&& intconfig, - const GradientMachinePtr& gradientMachine, - const std::shared_ptr& parameterUpdater, - std::shared_ptr testDataProvider); - - /** - * test one period. - * - * One period means 2 things. - * if test_period !=0 and not test_all_data_in_one_period, then - * will test test_period * batch_size data. - * else - * will test whole test data. - * - * It is convenience to test small set of data when test data set is large and - * is training at same time. - */ - void testOnePeriod(); - void startTestPeriod(); - void finishTestPeriod(); - void testOneDataBatch(const DataBatch& dataBatch, - std::vector* outArgs); - - /** - * Test for given data batch. - * @param dataBatch Data batch. - * @param evaluator Evaluator - * @return cost - */ - real forwardOneBatch(const DataBatch& dataBatch, - Evaluator* evaluator, - std::vector* outArgs); - - /** - * performance the full pass of test given test data provider - */ - void test(); - - protected: - std::shared_ptr testParameterClient_; - std::shared_ptr config_; - std::unique_ptr intconfig_; - GradientMachinePtr gradientMachine_; - std::shared_ptr parameterUpdater_; - std::unique_ptr testEvaluator_; - std::unique_ptr paramUtil_; - DataProviderPtr testDataProvider_; - TrainerStats stats_; - - // Used for saving the values of output layers - std::ofstream os_; - std::vector cpuMat_; - std::vector cpuVec_; - struct { - int64_t numSamples; - real cost; - } testContext_; - - private: - /** - * Test one batch by batchId. It is only used for testOnePass. - * - * Durning testOnePass, each log_period will print cost statistics. - * - * @param batchId current batch id (from 0) - * @return num of tested samples. Zero if end of pass. - */ - int64_t testOneBatchById(int64_t batchId); - - /** - * Test whole pass in one batch. - * - * - * @param passId current pass id (from 0) - */ - void testOnePassBatch(int passId); - - /** - * test for one pass in several mini-batches. - * - * Used for sgd method. - * - * @param passId current pass id (from 0) - */ - void testOnePass(int passId); - - /** - * print the outArgs to a stream - * - * used for save feature file - * - * @param [in] outArgs output arguments for network. - * @param [in,out] os output stream. 
- */ - void printOutput(const std::vector& outArgs, std::ostream& os); -}; - -} // namespace paddle diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h deleted file mode 100644 index 68d4c931ff2df8e24acaa9fe6b35bfd613197c72..0000000000000000000000000000000000000000 --- a/paddle/trainer/TesterConfig.h +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Util.h" - -#include - -#include "hl_gpu.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" - -#include "TrainerConfig.pb.h" - -#include -#include -#include "ParameterUpdater.h" - -namespace paddle { - -/** - * TesterConfig - * general configs for training - */ -struct TesterConfig { - /** - * indicate test period - */ - int testPeriod; - - /** - * indicate whether to save previous batch state - */ - bool prevBatchState; - - /** - * log period - */ - int logPeriod; - - /** - * loadsave parameters in pserver - */ - bool loadsaveParametersInPserver; - - /** - * feat file - */ - std::string featFile; - - /** - * predict output dir - */ - std::string predictOutputDir; - - /** - * trianer id - */ - int trainerId; - - /** - * distribute test - */ - bool distributeTest; - - /** - * training state - */ - MachineState* trainState; - - /** - * test state - */ - MachineState* testState; - - /** - * model list - */ - std::string modelList; - - /** - * test passes - */ - int testPass; - - /** - * num passes - */ - int numPasses; - - /** - * saving period - */ - int savingPeriod; - - /** - * test wait - */ - int testWait; - - /** - * init model path - */ - std::string initModelPath; - - /** - * save only one - */ - bool saveOnlyOne; - - /** - * testing mode - */ - bool testing; - - /** - * mode - */ - int mode; - - /** - * config loc - */ - std::string config; -}; - -} // namespace paddle diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp deleted file mode 100644 index 3c85c3aaac68fc29da90c24d1208887a17009d5f..0000000000000000000000000000000000000000 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ /dev/null @@ -1,309 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ThreadParameterUpdater.h" - -#include "paddle/utils/Logging.h" - -#include "paddle/math/SparseRowMatrix.h" -#include "paddle/parameter/ThreadLocalBuffer.h" -#include "paddle/utils/Thread.h" - -DECLARE_int32(trainer_count); - -namespace paddle { - -SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig) - : config_(optConfig), numSamplesProcessed_(0) { - // fill types - auto types = sgdOptimizerGetTypes(optConfig, false /*inPserver*/); - for (auto type : types) { - addParameterType(type); - } -} - -void SgdThreadUpdater::init(const std::vector& parameters) { - ParameterUpdater::init(parameters); - - // calc max parameter id - size_t maxId = 0; - for (auto& para : parameters_) { - maxId = std::max(maxId, para->getID()); - } - - optimizers_.resize(maxId + 1); - for (auto& para : parameters_) { - int pid = para->getID(); - optimizers_[pid].reset(sgdOptimizerCreate(config_, - para->getConfig(), - para->isGradSparseUpdate(), - false /*inPserver*/)); - size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0; - optimizers_[pid]->init(numRows, ¶->getConfig()); - if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) { - // For trainer_count=1, the gradient machine is NeuralNetwork, which does - // not create parameter buf for PARAMETER_GRADIENT for sparse update in - // Parameter::enableType(). But gradient parameter buf is still used - // in SgdThreadUpdater. We need to explicitly create it. - // - // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT - // as a temp buffer. - para->enableBufType(PARAMETER_GRADIENT); - } - } -} - -void SgdThreadUpdater::startPass() { - for (auto& para : parameters_) { - int pid = para->getID(); - optimizers_[pid]->startPass(); - } -} - -bool SgdThreadUpdater::finishPass() { - catchUpWith(); - - for (auto& para : parameters_) { - int pid = para->getID(); - optimizers_[pid]->finishPass(); - } - return true; -} - -void SgdThreadUpdater::updateImpl(Parameter* para) { - if (!para->useGpu()) return; - SetDevice setDevice(para->getDeviceId()); - ParameterOptimizer* optimizer = optimizers_[para->getID()].get(); - optimizer->update(para->getBufs(), para->getConfig()); - if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { - callback(para->getBufs(), para->getConfig(), -1LU); - } - - para->setValueUpdated(); - para->clearGradient(); -} - -void SgdThreadUpdater::threadTraverse( - const ParameterOptimizer::TraverseCallback& callback, - int tid, - size_t numThreads, - Parameter* para) { - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - if (para->isGradSparseUpdate()) { - size_t height = para->getConfig().dims(0); - size_t width = para->getConfig().dims(1); - for (size_t i = tid; i < height; i += numThreads) { - // setup sub bufs - for (auto type : parameterTypes_) { - vecs[type]->subVecFrom(*para->getBuf(type), i * width, width); - } - callback(vecs, para->getConfig(), i); - } - } else { // dense - // setup sub bufs - auto interval = calcSplitArrayInterval( - para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); - for (auto type : parameterTypes_) { - vecs[type]->subVecFrom(*para->getBuf(type), interval); - } - - callback(vecs, para->getConfig(), -1LU); - } -} - -void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) { - bool hasCpuPara = false; - bool hasGpuPara = false; - for (auto& para : parameters_) { - if (para->useGpu()) { - hasGpuPara = true; - } else { - hasCpuPara = true; - } - } - - auto cpuTraverse = [&](int tid, size_t numThreads) { - for 
(auto& para : parameters_) { - if (auto callback = getTraverseCallback(para.get())) { - threadTraverse(callback, tid, numThreads, para.get()); - } - } - }; - auto gpuTraverse = [&](int tid, size_t numThreads) { - for (auto& para : parameters_) { - if (para->useGpu()) { - if (auto callback = getTraverseCallback(para.get())) { - SetDevice setDevice(para->getDeviceId()); - callback(para->getBufs(), para->getConfig(), -1LU); - } - } - } - }; - - if (hasCpuPara && hasGpuPara) { - getGlobalSyncThreadPool()->exec(cpuTraverse, gpuTraverse); - } else if (hasCpuPara) { - getGlobalSyncThreadPool()->exec(cpuTraverse); - } else if (hasGpuPara) { - gpuTraverse(0, 0); - } -} - -void SgdThreadUpdater::catchUpWith() { - traverse([this](Parameter* para) { - return optimizers_[para->getID()]->startCatchUpWith(); - }); - - for (auto& para : parameters_) { - int pid = para->getID(); - optimizers_[pid]->finishCatchUpWith(); - } -} - -void SgdThreadUpdater::apply() { - catchUpWith(); - - traverse( - [this](Parameter* para) { return optimizers_[para->getID()]->apply(); }); -} - -void SgdThreadUpdater::restore() { - traverse([this](Parameter* para) { - return optimizers_[para->getID()]->restore(); - }); -} - -PassType SgdThreadUpdater::startBatch(int64_t batchSize) { - numSamplesProcessed_ += batchSize; - for (auto& para : parameters_) { - int pid = para->getID(); - optimizers_[pid]->startBatch(numSamplesProcessed_); - } - return PASS_TRAIN; -} - -void SgdThreadUpdater::finishBatch(real cost) { - getGlobalSyncThreadPool()->exec([&](int tid, size_t numThreads) { - for (auto& para : parameters_) { - if (para->isGradSparseUpdate()) { - threadUpdateSparse(tid, numThreads, para.get()); - } else if (!para->useGpu()) { - threadUpdateDense(tid, numThreads, para.get()); - } - } - }); - - for (auto& para : parameters_) { - int pid = para->getID(); - optimizers_[pid]->finishBatch(); - } -} - -void SgdThreadUpdater::threadUpdateSparse(int tid, - size_t numThreads, - Parameter* para) { - int pid = para->getID(); - ParameterOptimizer* optimizer = optimizers_[pid].get(); - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - - size_t height = para->getConfig().dims(0); - size_t width = para->getConfig().dims(1); - - if (dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get())) { - // From MultiGradientMachine - SparseRowIdsCpuMatrix* mainMat = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - std::vector& sparseIds = mainMat->getIds(tid); - - for (auto id : sparseIds) { - // setup sub bufs - for (auto type : parameterTypes_) { - vecs[type]->subVecFrom(*para->getBuf(type), id * width, width); - } - optimizer->update(vecs, para->getConfig(), id); - vecs[PARAMETER_GRADIENT]->zeroMem(); - } - sparseIds.clear(); - } else if (dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get())) { - // From NeuralNetwork - SparseRowCpuMatrix* mainMat = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); - - std::vector& localIndices = - mainMat->getIndexDictHandle()->localIndices; - - auto interval = - calcSplitArrayInterval(localIndices.size(), tid, numThreads); - for (size_t i = interval.first; i < interval.second; ++i) { - auto id = localIndices[i]; - real* row = mainMat->getLocalRow(i); - // setup sub bufs - for (auto type : parameterTypes_) { - if (type == PARAMETER_GRADIENT) { - vecs[type]->subVecFrom(row, 0, width); - } else { - vecs[type]->subVecFrom(*para->getBuf(type), id * width, width); - } - } - optimizer->update(vecs, para->getConfig(), id); - vecs[PARAMETER_GRADIENT]->zeroMem(); - } - // For numThreads > 1, 
MultiGradientMachine is used, which goes - // to the above branch. - CHECK_EQ(numThreads, 1UL); - mainMat->clearIndices(); - } else { - auto& m = *para->getMat(PARAMETER_GRADIENT).get(); - LOG(FATAL) << "Internal error: " << para->getName() << " " - << typeid(m).name(); - } - - if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { - for (size_t i = tid; i < height; i += numThreads) { - // setup sub bufs - for (auto type : parameterTypes_) { - vecs[type]->subVecFrom(*para->getBuf(type), i * width, width); - } - callback(vecs, para->getConfig(), i); - } - } -} - -void SgdThreadUpdater::threadUpdateDense(int tid, - size_t numThreads, - Parameter* para) { - int pid = para->getID(); - ParameterOptimizer* optimizer = optimizers_[pid].get(); - VectorPtr* vecs = parameter::getThreadLocalBuffer(); - - auto interval = calcSplitArrayInterval( - para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); - - // setup sub bufs - for (auto type : parameterTypes_) { - vecs[type]->subVecFrom(*para->getBuf(type), interval); - } - - // update - optimizer->update(vecs, para->getConfig()); - vecs[PARAMETER_GRADIENT]->zeroMem(); - - if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { - callback(vecs, para->getConfig(), -1LU); - } -} - -} // namespace paddle diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h deleted file mode 100644 index b5e6a7ce3c8457364b10c921bca3386fbb6f6cbf..0000000000000000000000000000000000000000 --- a/paddle/trainer/ThreadParameterUpdater.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/parameter/AverageOptimizer.h" -#include "paddle/parameter/FirstOrderOptimizer.h" -#include "paddle/parameter/OptimizerFunctions.h" -#include "paddle/parameter/OptimizerWithRegularizer.h" -#include "paddle/parameter/Parameter.h" -#include "paddle/parameter/Regularizer.h" -#include "paddle/utils/Util.h" - -#include -#include - -namespace paddle { - -/** - * \brief A parameter updater that uses multiple threads to update parameters. - This parameter updater handles GPU and CPU updates differently, - because at the current moment, the merging on CPU is happening on the - main thread, and the its parameter size can be much larger than the one GPU. - Thus, for GPU, the parameter updates happens in updateImpl() function, which - is called by gradient machines as a callback function supplied to backward() - and forwardBackward(). - For CPU, the parameter updates happens in separate threads maintained by this - class. - */ -class SgdThreadUpdater : public ParameterUpdater { - public: - explicit SgdThreadUpdater(const OptimizationConfig& optConfig); - virtual ~SgdThreadUpdater() {} - - // Use the startPass() function of the base optimizer. - virtual void startPass(); - - // Use the finishPass() function of the base optimizer. 
- virtual bool finishPass(); - - virtual void init(const std::vector& parameters); - virtual PassType startBatch(int64_t batchSize); - // Call finishBatch for each optimizer. - virtual void finishBatch(real cost); - virtual void catchUpWith(); - virtual void apply(); - virtual void restore(); - - protected: - // This is the function that will be eventualy called by the GradientMachine. - // used only for GPU update. - virtual void updateImpl(Parameter* para); - OptimizationConfig config_; - int64_t numSamplesProcessed_; - - // One optimizers for each parameter. - std::vector> optimizers_; - - // The update function for CPU sparse parameters. - void threadUpdateSparse(int tid, size_t numThreads, Parameter* para); - - // The update function for CPU dense parameters. - void threadUpdateDense(int tid, size_t numThreads, Parameter* para); - // The update function for after update operations, such as averager. - void threadTraverse(const ParameterOptimizer::TraverseCallback& callback, - int tid, - size_t numThreads, - Parameter* para); - typedef std::function - GetTraverseCallback; - void traverse(GetTraverseCallback getTraverseCallback); -}; - -} // namespace paddle diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp deleted file mode 100644 index 3e4a2b5fa8a3981f6362edc1dc61ae1616e257ef..0000000000000000000000000000000000000000 --- a/paddle/trainer/Trainer.cpp +++ /dev/null @@ -1,653 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "Trainer.h" - -#include - -#include -#include -#include -#include - -#include - -#include "paddle/utils/Common.h" -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -#include "RemoteParameterUpdater.h" -#include "TesterConfig.h" -#include "ThreadParameterUpdater.h" -#include "TrainerConfigHelper.h" -#include "paddle/gserver/gradientmachines/GradientMachineMode.h" -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" - -DEFINE_string(config, "", "Trainer config file"); - -DEFINE_int32(test_period, - 0, - "if equal 0, do test on all test data at the end of " - "each pass. While if equal non-zero, do test on all test " - "data every test_period batches"); -DEFINE_bool(test_all_data_in_one_period, - false, - "This option was deprecated, since we will always do " - "test on all test set "); - -DEFINE_bool(local, true, "Train in local mode or not"); - -DEFINE_int32(average_test_period, - 0, - "Do test on average parameter every so" - " many batches. MUST be devided by FLAGS_log_period." 
- " Default 0 means do not test average parameter"); - -DEFINE_int32(saving_period, 1, "Save parameteres every so many passes"); -DEFINE_int64(saving_period_by_batches, - 0, - "Save parameters every so many batches in one pass"); -DEFINE_string(save_dir, "", "Directory for saving model parameter"); -DEFINE_int32(start_pass, - 0, - "Start training from this pass. " - "Will load parameter from the previous pass"); -DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test"); -DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist"); -DEFINE_bool(with_cost, true, "enable cost layer or not"); -DEFINE_bool(distribute_test, false, "test in distribute mode"); - -DEFINE_int32(num_passes, 100, "train for so many passes"); - -DEFINE_string(config_args, - "", - "arguments passed to config file." - "Format: key1=value1,key2=value2"); - -DEFINE_bool(save_only_one, - false, - "Save only parameters in last pass, remove previous."); - -DEFINE_string(feat_file, "", "File name of extracted feature."); -DEFINE_string(predict_output_dir, - "", - "Directory that saves the predicted results of output layers"); -DEFINE_string(model_list, "", "File that saves the model list when evaluation"); - -namespace paddle { - -void Trainer::init(const std::shared_ptr& config, - bool testing, - const std::shared_ptr& gradientMachine, - const std::shared_ptr& dataProvider, - const std::shared_ptr& testDataProvider) { - this->stats_ = std::make_shared(); - - config_ = config; - - config_->updateConfigFromFlags(); - - testing_ = testing; - - // in testing, mode_ may GradientMachine::kTesting or - // GradientMachine::kSgdSparseCpuTraining - - if (FLAGS_local) { - CHECK(!FLAGS_loadsave_parameters_in_pserver) - << "local and loadsave_parameters_in_pserver can not both true"; - if (config_->getOptConfig().use_sparse_remote_updater()) { - config_->disableRemoteSparseUpdaterForEachParams(); - LOG(INFO) << "ignore sparse_remote_update=true due to --local=true"; - } - } - if (FLAGS_loadsave_parameters_in_pserver) { - CHECK(config_->getOptConfig().use_sparse_remote_updater()) - << "no parameter to load from pserver, please check network config"; - } - if (testing && !FLAGS_loadsave_parameters_in_pserver) { - if (config_->getOptConfig().use_sparse_remote_updater()) { - config_->disableRemoteSparseUpdater(); - LOG(INFO) << "because parameter is loaded local," - << "tester ignore sparse_remote_update flag"; - } - } - - CHECK(TrainAlgorithm::isValid(config_->getOptConfig().algorithm())) - << "invalid algorithm configuration: " - << config_->getOptConfig().algorithm(); - - bool useSparseUpdater = false; - for (auto& paraConfig : config_->getModelConfig().parameters()) { - if (paraConfig.sparse_update() || paraConfig.sparse_remote_update()) { - useSparseUpdater = true; - } - } - - if (FLAGS_use_mkldnn) { - CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer"; - } - - if (testing) { - LOG(INFO) << "trainer: in testing mode"; - if (config_->getOptConfig().use_sparse_remote_updater() || - FLAGS_trainer_count > 1) { - mode_ = GradientMachine::kSgdSparseCpuTraining; - LOG(INFO) << "trainer mode: SgdSparseCpuTraining"; - } else { - mode_ = GradientMachine::kTesting; - LOG(INFO) << "trainer mode: Testing"; - } - } else if (IGradientMachineMode::tryGetMode( - (int*)&mode_, - config_->getOptConfig().algorithm(), - FLAGS_trainer_count, - FLAGS_local, - FLAGS_use_gpu)) { - LOG(INFO) << "Custom trainer mode."; - } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD || - 
config_->getOptConfig().algorithm() == - TrainAlgorithm::AsyncSGD) && - useSparseUpdater) { - mode_ = GradientMachine::kSgdSparseCpuTraining; - LOG(INFO) << "trainer mode: SgdSparseCpuTraining"; - } else { - mode_ = GradientMachine::kNormal; - LOG(INFO) << "trainer mode: Normal"; - } - - // initialize trainer internal - trainerInternal_.init(config_, - gradientMachine, - TrainerInternalConfig::createFromMode(mode_), - stats_, - testing); - std::unique_ptr paramConfig( - new ParameterUtilConfig(FLAGS_save_only_one, - FLAGS_saving_period, - FLAGS_loadsave_parameters_in_pserver, - FLAGS_config)); - - paramUtil_.reset( - new paddle::ParameterUtil(config_, - std::move(paramConfig), - trainerInternal_.getGradientMachine(), - trainerInternal_.getParameterUpdater())); - - bool gpuData = - FLAGS_use_gpu && (!FLAGS_parallel_nn) && - (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count)); - - dataProvider_ = dataProvider; - if (!dataProvider_ && config_->hasDataConfig() && !testing_) { - dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); - } - if (!testDataProvider_) { - // No evaluator_ if there is testDataProvider but no dataProvider. - evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); - currentEvaluator_.reset( - trainerInternal_.getGradientMachine()->makeEvaluator()); - if (FLAGS_average_test_period > 0 && FLAGS_trainer_id == 0 && - config_->getOptConfig().average_window() > 0) { - CHECK_EQ(FLAGS_average_test_period % FLAGS_log_period, 0) - << "FLAGS_average_test_period must be divided by FALGS_log_period"; - averageEvaluator_.reset( - trainerInternal_.getGradientMachine()->makeEvaluator()); - } - } - - testDataProvider_ = testDataProvider; - if (!testDataProvider_ && config_->hasTestDataConfig()) { - testDataProvider_.reset( - DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); - } - if (testDataProvider_) { - createTester(); - } - - if (!testing && - (trainerInternal_.getGradientMachine()->hasStaticParameters())) { - CHECK(!FLAGS_loadsave_parameters_in_pserver) - << "is_static and loadsave_parameters_in_pserver can not both true"; - } - if (testing) { - // will load per pass for tester - } else if (paramUtil_->tryLoadParametersFromConfig()) { - // load from config already. 
- } else { - trainerInternal_.getGradientMachine()->randParameters(); - } - - // Only non static parameters need to be updated - std::vector& parameters = - trainerInternal_.getGradientMachine()->getNonStaticParameters(); - if (trainerInternal_.getParameterUpdater()) { - trainerInternal_.getParameterUpdater()->init(parameters); - - if (FLAGS_loadsave_parameters_in_pserver && FLAGS_trainer_id == 0) { - if (testing) { - // will load per pass for tester - } else if (!config_->getConfig().init_model_path().empty() && - (FLAGS_local || FLAGS_trainer_id == 0)) { - paramUtil_->loadParametersWithPath( - config_->getConfig().init_model_path(), - false /*local*/, - true /*remote*/); - } else if (config_->getConfig().start_pass() > 0 && - (FLAGS_local || FLAGS_trainer_id == 0)) { - CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1, - false /*local*/, - true /*remote*/)); - } else { - trainerInternal_.getParameterUpdater()->randParametersRemote(); - } - } - } - - // set current evaluator and evalutor - trainerInternal_.setCurrentEvaluator(currentEvaluator_.get()); - trainerInternal_.setEvaluator(evaluator_.get()); -} - -void Trainer::train(size_t numPasses) { - startTrain(); - for (size_t i = 0; i < numPasses; ++i) { - if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) { - trainOnePassBatch(config_->getConfig().start_pass() + i); - } else { - trainOnePass(); - } - if (i < numPasses - 1) { - dataProvider_->reset(); - } - } - - finishTrain(); -} - -static double genPerturbation(real* d, real* grad, size_t dim) { - auto& reng = ThreadLocalRandomEngine::get(); - std::uniform_real_distribution dist(-1, 1); - double gradNorm = 0, dNorm = 0; - for (size_t i = 0; i < dim; ++i) { - d[i] = dist(reng); - dNorm += d[i] * d[i]; - gradNorm += grad[i] * grad[i]; - } - if (gradNorm > 0) { - real s = 0.5 * sqrt(gradNorm / dNorm); - for (size_t i = 0; i < dim; ++i) { - d[i] = s * d[i] + grad[i]; - } - } - double delta = 0; - for (size_t i = 0; i < dim; ++i) { - delta += grad[i] * d[i]; - } - return delta; -} - -real Trainer::checkGradient() { - trainerInternal_.getGradientMachine()->start(); - std::vector& parameters = - trainerInternal_.getGradientMachine()->getNonStaticParameters(); - DataBatch dataBatch; - int32_t batchSize = config_->getOptConfig().batch_size(); - - dataProvider_->getNextBatch(batchSize, &dataBatch); - - CHECK(dataBatch.getSize()) << "No data from data provider"; - std::vector& inArgs = dataBatch.getStreams(); - std::vector outArgs; - - trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real cost = Argument::sum(outArgs); - LOG(INFO) << "original cost=" << cost; - trainerInternal_.getGradientMachine()->backward(); - - real maxDiff = 0; - char fill = ' '; - for (auto& parameter : parameters) { - CpuVector oldPara(parameter->getSize()); - CpuVector newPara(parameter->getSize()); - oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE)); - real* newp = newPara.getData(); - real* oldp = oldPara.getData(); - CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT)); - real* grad = cpuGrad.getData(); - size_t dim = parameter->getSize(); - std::vector d(dim); - - double delta = genPerturbation(d.data(), grad, dim); - - // use a step such that delta / cost is FLAGS_checkgrad_eps - real step = - (delta != 0) ? 
cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps; - delta *= step; - for (size_t i = 0; i < dim; ++i) { - newp[i] = oldp[i] + step * d[i]; - } - - parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); - parameter->setValueUpdated(); - trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real newCost1 = Argument::sum(outArgs); - - for (size_t i = 0; i < dim; ++i) { - newp[i] = oldp[i] - step * d[i]; - } - - parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara); - parameter->setValueUpdated(); - trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC); - real newCost2 = Argument::sum(outArgs); - - real trueDelta = 0.5 * (newCost1 - newCost2); - real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1; - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(fill) - << std::setw(20) << parameter->getName() - << "step=" << std::setw(15) << step << "cost1=" << std::setw(10) - << newCost1 << "cost2=" << std::setw(10) << newCost2 - << "true_delta=" << std::setw(15) << trueDelta - << "analytic_delta=" << std::setw(15) << delta << "diff=" << diff - << (std::abs(diff) > 0.01 ? " ***" : ""); - - maxDiff = std::max(maxDiff, std::abs(diff)); - - // restore parameter - parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara); - parameter->setValueUpdated(); - - fill = (fill == ' ') ? '.' : ' '; - } - return maxDiff; -} - -void Trainer::startTrain() { - trainPassContext_.passId = config_->getConfig().start_pass(); - srand(config_->getConfig().start_pass() + 1); - if (dataProvider_) { - dataProvider_->reset(); - } - - trainerInternal_.getGradientMachine()->start(); -} - -void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); } - -void Trainer::startTrainPass() { - stats_->reset(); - trainPassContext_.batchId = 0; - trainPassContext_.avgTestCost = 0; - trainPassContext_.numAvgTests = 0; - trainPassContext_.passInnerId = 1; - - trainerInternal_.getParameterUpdater()->startPass(); - evaluator_->start(); - if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->resetState(); - trainerInternal_.getGradientMachine()->getState(testState_); - } -} - -void Trainer::trainOneDataBatch(DataBatch& dataBatch) { - int num = dataBatch.getSize(); - if (averageEvaluator_) { - int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period; - if (mod >= FLAGS_average_test_period - FLAGS_log_period) { - if (mod == FLAGS_average_test_period - FLAGS_log_period) { - averageEvaluator_->start(); - } - trainerInternal_.getParameterUpdater()->apply(); - if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->getState(trainState_); - } - trainPassContext_.avgTestCost += tester_->forwardOneBatch( - dataBatch, averageEvaluator_.get(), &forwardOutput_); - if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->setState(trainState_); - } - trainPassContext_.numAvgTests += num; - trainerInternal_.getParameterUpdater()->restore(); - } - } - { - REGISTER_TIMER("TrainBatch"); - trainerInternal_.trainOneBatch( - trainPassContext_.batchId, dataBatch, &forwardOutput_); - } - - if (averageEvaluator_ && - trainPassContext_.batchId % FLAGS_average_test_period == - FLAGS_average_test_period - 1) { - averageEvaluator_->finish(); - LOG(INFO) << " Averaged parameter:" - << " cost=" - << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests - << " Eval: " << *averageEvaluator_; - trainPassContext_.numAvgTests = 0; - trainPassContext_.avgTestCost = 0; - } - - ++trainPassContext_.batchId; - - if (trainPassContext_.batchId % 
FLAGS_log_period == 0) { - FOR_TIMING(globalStat.setThreadInfo(true)); - FOR_TIMING(globalStat.printAllStatus()); - FOR_TIMING(globalStat.reset()); - } - - if (testDataProvider_ && FLAGS_test_period > 0 && - trainPassContext_.batchId % FLAGS_test_period == 0) { - tester_->testOnePeriod(); - } - - if (FLAGS_saving_period_by_batches > 0 && - trainPassContext_.batchId > - FLAGS_saving_period_by_batches * trainPassContext_.passInnerId && - 0 == FLAGS_trainer_id) { - trainerInternal_.getParameterUpdater()->catchUpWith(); - if (testDataProvider_) { - tester_->testOnePeriod(); - } - paramUtil_->saveParametersOnePass(trainPassContext_.passId, - trainPassContext_.passInnerId); - ++trainPassContext_.passInnerId; - } -} - -void Trainer::finishTrainPass() { - if (trainPassContext_.batchId == 0) { - // This means no more data from DataProvider - return; - } - - trainerInternal_.finishTrainPass(trainPassContext_.passId, - trainPassContext_.batchId); - - FOR_TIMING(globalStat.setThreadInfo(true)); - FOR_TIMING(globalStat.printAllStatus()); - FOR_TIMING(globalStat.reset()); - - if (testDataProvider_) { - tester_->testOnePeriod(); - } - - if (trainPassContext_.passId % FLAGS_saving_period == 0 && - FLAGS_trainer_id == 0) { - paramUtil_->saveParametersOnePass(trainPassContext_.passId); - } - ++trainPassContext_.passId; -} - -void Trainer::trainOnePass() { - startTrainPass(); - size_t batchSize = config_->getOptConfig().batch_size(); - while (true) { - DataBatch dataBatch; - - int num = 0; - { - REGISTER_TIMER("getTrainBatch"); - num = dataProvider_->getNextBatch(batchSize, &dataBatch); - } - if (num == 0) break; - CHECK_EQ(num, dataBatch.getSize()); - trainOneDataBatch(dataBatch); - } - - finishTrainPass(); -} - -void Trainer::trainOnePassBatch(int passId) { - this->stats_->reset(); - - trainerInternal_.getParameterUpdater()->startPass(); - const std::vector inArgs; - { - REGISTER_TIMER("onePass"); - trainerInternal_.getGradientMachine()->forwardBackward( - inArgs, nullptr, PASS_TRAIN, nullptr); - } - - real cost = .0; - int64_t num = 0; - trainerInternal_.getGradientMachine()->getStats(cost, num); - *stats_ += {num, cost}; - - trainerInternal_.getGradientMachine()->onPassEnd(); - - bool accepted = trainerInternal_.getParameterUpdater()->finishPass(); - - globalStat.setThreadInfo(true); - globalStat.printAllStatus(); - globalStat.reset(); - - LOG(INFO) << " Pass=" << passId - << " AcceptedPass=" << (accepted ? 
acceptedPassId_ : -1) - << stats_->getStats(false /*withCurrentCost*/); - - if (accepted) { - if (acceptedPassId_ % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) { - paramUtil_->saveParameters(acceptedPassId_); - } - acceptedPassId_++; - if (FLAGS_save_only_one && acceptedPassId_ >= FLAGS_saving_period) { - paramUtil_->deleteParameters(acceptedPassId_ - FLAGS_saving_period); - } - } -} - -real Trainer::calcGradient(const DataBatch& dataBatch, - const Vector& value, - Vector& gradient) { - CHECK_EQ(value.getSize(), gradient.getSize()); - std::vector& parameters = - trainerInternal_.getGradientMachine()->getParameters(); - - clearGradient(); - - size_t offset = 0; - size_t valueSize = value.getSize(); - - for (auto& para : parameters) { - CHECK_LE(offset + para->getSize(), valueSize); - VectorPtr val = - Vector::create(para->getSize(), value.getMemoryHandle(), offset); - para->getBuf(PARAMETER_VALUE)->copyFrom(*val); - para->setValueUpdated(); - offset += para->getSize(); - } - - CHECK_EQ(offset, valueSize); - - std::vector inArgs = dataBatch.getStreams(); - std::vector outArgs; - - trainerInternal_.getGradientMachine()->forwardBackward( - inArgs, &outArgs, PASS_TRAIN); - real cost = Argument::sum(outArgs); - - offset = 0; - for (auto& para : parameters) { - VectorPtr grad = - Vector::create(para->getSize(), gradient.getMemoryHandle(), offset); - if (para->getBuf(PARAMETER_GRADIENT)) { - grad->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); - } - offset += para->getSize(); - } - - return cost; -} - -void Trainer::clearGradient() { - std::vector& parameters = - trainerInternal_.getGradientMachine()->getNonStaticParameters(); - for (auto& parameter : parameters) { - parameter->clearGradient(); - } -} - -int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); } - -void Trainer::createTester() { - tester_.reset(new paddle::Tester(config_, - createTesterConfig(), - trainerInternal_.getGradientMachine(), - trainerInternal_.getParameterUpdater(), - testDataProvider_)); -} - -void Trainer::test() { tester_->test(); } - -std::unique_ptr Trainer::createTesterConfig() { - TesterConfig* conf = new TesterConfig; - if (FLAGS_test_period) { - LOG(WARNING) << "The meaning of --test_period is changed: " - << "if equal 0, do test on all test data at the end of " - << "each pass. 
While if equal non-zero, do test on all test " - << "data every test_period batches "; - } - if (FLAGS_test_all_data_in_one_period) { - LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since " - << "we will always do test on all test set "; - } - conf->testPeriod = FLAGS_test_period; - conf->prevBatchState = FLAGS_prev_batch_state; - conf->logPeriod = FLAGS_log_period; - conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver; - conf->featFile = FLAGS_feat_file; - conf->predictOutputDir = FLAGS_predict_output_dir; - conf->trainerId = FLAGS_trainer_id; - conf->distributeTest = FLAGS_distribute_test; - conf->config = FLAGS_config; - conf->modelList = FLAGS_model_list; - conf->testPass = FLAGS_test_pass; - conf->numPasses = FLAGS_num_passes; - conf->savingPeriod = FLAGS_saving_period; - conf->testWait = FLAGS_test_wait; - conf->initModelPath = FLAGS_init_model_path; - conf->saveOnlyOne = FLAGS_save_only_one; - conf->testing = testing_; - conf->mode = mode_; - conf->trainState = &trainState_; - conf->testState = &testState_; - return std::unique_ptr(conf); -} - -ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); } -} // namespace paddle diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h deleted file mode 100644 index 78127b7be5cef34f51a4b540852c139625b571dd..0000000000000000000000000000000000000000 --- a/paddle/trainer/Trainer.h +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Util.h" - -#include - -#include "hl_gpu.h" -#include "paddle/gserver/dataproviders/DataProvider.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" - -#include -#include -#include "ParamUtil.h" -#include "ParameterUpdater.h" -#include "Tester.h" -#include "TrainerConfigHelper.h" -#include "TrainerInternal.h" - -DECLARE_int32(num_passes); - -namespace paddle { - -/** - * Trainer Class - * - * Trainer combines GradientMachine, ParameterUpdater, DataProvider together to - * train/test a NeuralNetwork. - */ -class Trainer { - public: - /** - * Ctor. - * @return - */ - Trainer() : acceptedPassId_(0) {} - - virtual ~Trainer() {} - - /** - * initialize a new trainer using config - * - * @param config TrainerConfig. - * @param testing true if only for testing - * @param gradientMachine GradientMachine that will be trained. - * nullptr if create from config. - * @param dataProvider Train Data Provider. null if create from config. - * @param testDataProvider Test Data Provider. null if create from config. - */ - virtual void init( - const std::shared_ptr& config, - bool testing = false, - const std::shared_ptr& gradientMachine = nullptr, - const std::shared_ptr& dataProvider = nullptr, - const std::shared_ptr& testDataProvider = nullptr); - - /** - * Train until num_passes reached. - * One pass means neural network train through all training data. - * - * @param numPasses the number of traning pass. 
- * @note Durning neural network training, the num passes may set a very large - * value, and kill training process when result is good enough. - */ - void train(size_t numPasses = (size_t)FLAGS_num_passes); - - /** - * compare the gradient from bp with finite difference - * @return the maximal difference - */ - real checkGradient(); - - void startTrain(); - void finishTrain(); - void startTrainPass(); - void finishTrainPass(); - void trainOneDataBatch(DataBatch& dataBatch); - void time(); - - /** - * given a dataBatch and the current parameter value - * calculate its gradient and return the cost. - * - * TODO(yuyang18): I think this method is deprecated and buggy. Should it be - * removed? - */ - real calcGradient(const DataBatch& dataBatch, - const Vector& value, - Vector& gradient); - - /** - * Get Trainer Config. - */ - const TrainerConfig& getConfig() const { return config_->getConfig(); } - - /** - * Get Train Data Provider - */ - const DataProviderPtr& getDataProvider() { return dataProvider_; } - - /** - * Get Gradient Machine. - */ - const GradientMachinePtr& getGradientMachine() { - return trainerInternal_.getGradientMachine(); - } - - /** - * Get batch size in optimization config. - * @note This method didn't return the actual batch size. Just batch size - * set in the optimization config. The actual batch size in one trainer may - * less than batch size in config due to there are not enough data. - */ - int getBatchSize(); - - /** - * Do test job - */ - void test(); - - /** - * Get parameter util ptr - * - * TODO(yuyang18): Make it return a smart pointer. - */ - ParameterUtil* getParameterUtilPtr(); - - protected: - /** - * Train one pass of data. - * - * SGD Method. - */ - void trainOnePass(); - - /** - * Train one pass in one batch. - * - */ - void trainOnePassBatch(int passId); - - /** - * set parameter gradient to zero - */ - void clearGradient(); - - void createTester(); - - private: - std::unique_ptr createTesterConfig(); - - protected: - std::shared_ptr config_; - std::shared_ptr stats_; - - DataProviderPtr dataProvider_; - DataProviderPtr testDataProvider_; - MachineState trainState_; - MachineState testState_; - - struct TrainPassContext { - int64_t batchId; - real avgTestCost; - int64_t numAvgTests; - int passId; - int passInnerId; - }; - std::vector forwardOutput_; - - TrainPassContext trainPassContext_; - - std::unique_ptr evaluator_; - std::unique_ptr currentEvaluator_; - std::unique_ptr averageEvaluator_; - // training mode - // used to decide which GradientMachine and ParameterUpdater to create - GradientMachine::CreateMode mode_; - int testing_; - int acceptedPassId_; - - // trainer tester - std::unique_ptr tester_; - - // parameter util - std::unique_ptr paramUtil_; - - // trainer Internal - TrainerInternal trainerInternal_; -}; - -} // namespace paddle diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp deleted file mode 100644 index 173653c81688fe4606731c68ea1854268b3f4590..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerBenchmark.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#undef PADDLE_DISABLE_TIMER - -#include "Trainer.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -DECLARE_int32(test_period); - -DEFINE_bool(feed_data, false, "Wether to read data from DataProvider."); - -namespace paddle { - -void Trainer::time() { - startTrain(); - - trainerInternal_.getParameterUpdater()->startPass(); - evaluator_->start(); - - DataBatch dataBatch; - int32_t batchSize = config_->getOptConfig().batch_size(); - int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch); - CHECK_EQ(num, batchSize) << "The sample number is less than batch size " - << num << " != " << batchSize; - - CHECK(dataBatch.getSize()) << "No data from data provider"; - - std::vector outputs; - // burning time - LOG(INFO) << "Burning time..."; - for (int n = 0; n < 10; ++n) { - trainerInternal_.trainOneBatch(n, dataBatch, &outputs); - } - LOG(INFO) << "Burning time end."; - - for (int n = 0; n < FLAGS_test_period; n++) { - if (FLAGS_feed_data) { - REGISTER_TIMER("GetData"); - num = dataProvider_->getNextBatch(batchSize, &dataBatch); - } - - if (num != batchSize) { - break; - } - - { - REGISTER_TIMER("FwdBwd"); - trainerInternal_.trainOneBatch(n, dataBatch, &outputs); - } - } - globalStat.setThreadInfo(true); - globalStat.printSegTimerStatus(); - globalStat.reset(); - - finishTrain(); -} - -} // namespace paddle diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp deleted file mode 100644 index 2b68d89e48a3efd5de205ce33643b7e6320a4303..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ /dev/null @@ -1,199 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "TrainerConfigHelper.h" -#include "ParamUtil.h" -#include "TrainerConfig.pb.h" -#include "paddle/utils/Flags.h" -#include "paddle/utils/PythonUtil.h" - -DECLARE_string(config); -DECLARE_string(init_model_path); -DECLARE_int32(start_pass); -DECLARE_string(save_dir); -DECLARE_int32(trainer_id); -DECLARE_bool(local); -DECLARE_bool(with_cost); -DECLARE_bool(with_gpu); -DECLARE_bool(parallel_nn); -DECLARE_string(config_args); -DECLARE_bool(use_mkldnn); -DECLARE_bool(use_mkl_packed); - -const char *kConfigParserModuleName = "paddle.trainer.config_parser"; -const char *kConfigParserFuncName = "parse_config_and_serialize"; - -namespace paddle { - -struct TrainerConfigHelperPrivate { - TrainerConfig conf; -}; - -TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) - : m(new TrainerConfigHelperPrivate()) { - std::ostringstream configArgs; - configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local - << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu - << ",parallel_nn=" << FLAGS_parallel_nn - << ",use_mkldnn=" << FLAGS_use_mkldnn - << ",use_mkl_packed=" << FLAGS_use_mkl_packed - << ",cudnn_version=" << hl_get_cudnn_lib_version(); - if (!FLAGS_config_args.empty()) { - configArgs << "," << FLAGS_config_args; - } - - VLOG(3) << "Parsing trainer config " << configFilePath; - std::string configProtoStr = - callPythonFunc(kConfigParserModuleName, - kConfigParserFuncName, - {configFilePath, configArgs.str()}); - CHECK(m->conf.ParseFromString(configProtoStr)); -} - -TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) - : m(new TrainerConfigHelperPrivate()) { - m->conf = config; -} - -TrainerConfigHelper::~TrainerConfigHelper() { delete m; } - -const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } - -TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; } - -const OptimizationConfig &TrainerConfigHelper::getOptConfig() const { - return m->conf.opt_config(); -} - -const ModelConfig &TrainerConfigHelper::getModelConfig() const { - return m->conf.model_config(); -} - -const DataConfig *TrainerConfigHelper::getDataConfigPtr() const { - if (m->conf.has_data_config()) { - return &m->conf.data_config(); - } else { - return nullptr; - } -} - -const DataConfig &TrainerConfigHelper::getTestDataConfig() const { - CHECK(m->conf.has_test_data_config()); - return m->conf.test_data_config(); -} - -bool TrainerConfigHelper::hasDataConfig() const { - return m->conf.has_data_config(); -} - -bool TrainerConfigHelper::hasTestDataConfig() const { - return m->conf.has_test_data_config(); -} - -void TrainerConfigHelper::updateConfigFromFlags() { - if (!FLAGS_save_dir.empty()) { - m->conf.set_save_dir(FLAGS_save_dir); - } - if (!FLAGS_init_model_path.empty()) { - m->conf.set_init_model_path(FLAGS_init_model_path); - } - if (FLAGS_start_pass != 0) { - m->conf.set_start_pass(FLAGS_start_pass); - } -} - -void TrainerConfigHelper::disableRemoteSparseUpdater() { - m->conf.mutable_opt_config()->set_use_sparse_remote_updater(false); -} - -void TrainerConfigHelper::disableRemoteSparseUpdaterForEachParams() { - this->disableRemoteSparseUpdater(); - for (int i = 0; i < m->conf.model_config().parameters_size(); ++i) { - m->conf.mutable_model_config() - ->mutable_parameters(i) - ->set_sparse_remote_update(false); - } -} - -OptimizationConfig &TrainerConfigHelper::getOptConfig() { - return *m->conf.mutable_opt_config(); -} - -void TrainerConfigHelper::setSaveDir(const std::string &saveDir) { - 
m->conf.set_save_dir(saveDir); -} - -const std::string &TrainerConfigHelper::getSaveDir() const { - return m->conf.save_dir(); -} - -std::string TrainerConfigHelper::getConfigNameFromPath( - const std::string &modelPath) { - std::ifstream s(path::join(modelPath, "path.txt")); - CHECK(s.is_open()) << " fail to open path.txt"; - std::string ss; - getline(s, ss); - VLOG(3) << "fileName " << path::join(modelPath, ss); - s.close(); - return path::join(modelPath, ss); -} - -std::string TrainerConfigHelper::getConfigNameFromPassId( - int passId, const std::string &modelPath) { - constexpr int kBufLen = 100; - char buf[kBufLen]; - snprintf(buf, kBufLen, "pass-%05d", passId); - return TrainerConfigHelper::getConfigNameFromPath(path::join(modelPath, buf)); -} - -std::string TrainerConfigHelper::getConfigName(bool *ok) const { - std::string retv = ""; - - if (!m->conf.config_file().empty()) { - retv = m->conf.config_file(); - } else if (!m->conf.init_model_path().empty()) { - retv = getConfigNameFromPath(m->conf.init_model_path()); - } else if (m->conf.start_pass() >= 1) { - retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir()); - } - - if (ok) { - *ok = !retv.empty(); - } - - return retv; -} - -std::shared_ptr TrainerConfigHelper::createFromFlags() { - std::string configPath; - if (!FLAGS_config.empty()) { - configPath = FLAGS_config; - } else if (!FLAGS_init_model_path.empty()) { - configPath = getConfigNameFromPath(FLAGS_init_model_path); - } else if (FLAGS_start_pass >= 1) { - configPath = - getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path); - } else { - return nullptr; - } - return std::make_shared(configPath); -} - -std::shared_ptr -TrainerConfigHelper::createFromFlagConfig() { - CHECK(!FLAGS_config.empty()); - return std::make_shared(FLAGS_config); -} - -} // namespace paddle diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h deleted file mode 100644 index b21dda964e70fce6e5e9672cc131595ad5af3bbc..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerConfigHelper.h +++ /dev/null @@ -1,205 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { - -class TrainerConfig; -class OptimizationConfig; -struct TrainerConfigHelperPrivate; -class ModelConfig; -class DataConfig; - -/** - * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object, - * simplize the usage for TrainerConfig. - * - * The all operation to TrainerConfig object should use this object. It remove - * many copy & paste code in trainer. - * - * @TODO(yuyang18): Make cmake check compiler support keyword 'final' or not. - * Define a macro to unify 'final' keyword - */ -class TrainerConfigHelper /*final*/ { - public: - DISABLE_COPY(TrainerConfigHelper); - - /** - * @brief Ctor, Create a TrainerConfig from config file - * @param configFilePath Config file path. 
- */ - explicit TrainerConfigHelper(const std::string& configFilePath); - explicit TrainerConfigHelper(const TrainerConfig& config); - - /** - * Dtor - * @warning this class is a final class. Should not be inherited. - */ - ~TrainerConfigHelper(); - - /** - * @brief Get Trainer Config itself. - */ - const TrainerConfig& getConfig() const; - - TrainerConfig& getMutableConfig(); - - /** - * @brief Get Optimizer Config. - */ - const OptimizationConfig& getOptConfig() const; - - /** - * @brief Get Model Config. - */ - const ModelConfig& getModelConfig() const; - - /** - * @brief Get Train Data Config Pointer. - * @return nullptr if there is no train data. Else will return pointer - */ - const DataConfig* getDataConfigPtr() const; - - /** - * @brief Get Tain Data Config. - * @warning Core when there is no train data. - */ - const DataConfig& getDataConfig() const { - CHECK(this->hasDataConfig()); - auto conf = this->getDataConfigPtr(); - return *conf; - } - - /** - * @brief Get test data config - * @warning Core when there is no test data. - */ - const DataConfig& getTestDataConfig() const; - - /** - * @brief Has train data config or not. - * @return true if has train data. - */ - bool hasDataConfig() const; - - /** - * @brief Has test data config or not. - * @return true if has test data. - */ - bool hasTestDataConfig() const; - - /** - * @brief Update trainer config from command line flags. - * Override config's (save_dir, init_model_path, start_pass) if command - * flags is existed. - */ - void updateConfigFromFlags(); - - /** - * @brief Disable optimization's sparse remote update. - */ - void disableRemoteSparseUpdater(); - - /** - * @brief Disable optimization and each parameter's sparse remote update. - */ - void disableRemoteSparseUpdaterForEachParams(); - - /** - * @brief implicit conversion. - */ - inline operator const TrainerConfig&() const { return this->getConfig(); } - - /** - * @brief implicit conversion. - */ - inline operator const OptimizationConfig&() const { - return this->getOptConfig(); - } - - /** - * @brief implicit conversion. - */ - inline operator const DataConfig&() const { return this->getDataConfig(); } - - /** - * @brief implicit conversion. - */ - inline operator const ModelConfig&() const { return this->getModelConfig(); } - - /** - * @brief Get mutable optimization config. - */ - OptimizationConfig& getOptConfig(); - - /** - * @brief set model save directory. - * @param saveDir Directory path. - */ - void setSaveDir(const std::string& saveDir); - - /** - * @brief get model save directory. - * @return save directory path. - */ - const std::string& getSaveDir() const; - - /** - * @brief Get config file name from model path. - * - * Paddle save model to a directory, and write a file 'path.txt' which save - * config filename. - * - * @param modelPath model saved directory. - * @return config file name. - */ - static std::string getConfigNameFromPath(const std::string& modelPath); - - /** - * @brief Get config file name from this config instance. - * @param[out] ok true if no error. - * @return config file name. - */ - std::string getConfigName(bool* ok = nullptr) const; - - /** - * @brief Try to create TrainerConfigHelper from all command line flags. - * Try to load from --config, --init_model_path, --start_pass one by - * one. Return nullptr if cannot load TrainerConfigHelper from all - * these place. - * @return nullptr if cannot load, otherwise return a TrainerConfigHelper. 
- */ - static std::shared_ptr createFromFlags(); - - /** - * @brief Try to create TrainerConfigHelper only from '--config' flag. - * @return nullptr if cannot load, otherwise return a TrainerConfigHelper. - */ - static std::shared_ptr createFromFlagConfig(); - - private: - static std::string getConfigNameFromPassId(int passId, - const std::string& modelPath); - - TrainerConfigHelperPrivate* m; -}; - -typedef std::shared_ptr TrainerConfigHelperPtr; - -} // namespace paddle diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp deleted file mode 100644 index 4c5d4a0913aaf3a9932b3d67806378ece4245304..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerInternal.cpp +++ /dev/null @@ -1,303 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "TrainerInternal.h" - -#include -#include - -#include -#include -#include -#include - -#include - -#include "paddle/gserver/gradientmachines/NeuralNetwork.h" -#include "paddle/gserver/layers/ValidationLayer.h" -#include "paddle/utils/GlobalConstants.h" -#include "paddle/utils/PythonUtil.h" -#include "paddle/utils/Stat.h" -#include "paddle/utils/Util.h" - -#include "RemoteParameterUpdater.h" -#include "ThreadParameterUpdater.h" - -namespace paddle { - -void TrainerInternal::init(const std::shared_ptr& config, - const GradientMachinePtr& gradientMachine, - std::unique_ptr&& intconfig, - const std::shared_ptr& stats, - bool testing) { - config_ = config; - intconfig_ = std::move(intconfig); - stats_ = stats; - - //! in training will use parameter updater definitly. - //! But only use parameter in testing mode when some parameter in pserver. 
- if (!testing || (config_->getOptConfig().use_sparse_remote_updater() && - intconfig_->loadsave_parameters_in_pserver)) { - createParameterUpdater(testing); - } - - gradientMachine_ = gradientMachine; - if (!gradientMachine) { - CHECK(config_->getConfig().has_model_config()) - << "Missing model_config in trainer_config"; - gradientMachine_.reset( - GradientMachine::create(config_->getConfig().model_config(), - intconfig_->mode, - parameterUpdater_->getParameterTypes())); - } -} - -void TrainerInternal::trainOneBatch(int64_t batchId, - const DataBatch& dataBatch, - std::vector* outArgs) { - // true means updating parameter whenever gradient is ready during backward() - bool doPipelineUpdate = - (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) && - (intconfig_->local || intconfig_->use_gpu || - intconfig_->trainer_count <= 1); - - int64_t actualBatchSize = dataBatch.getSize(); - if (actualBatchSize == 0) { - return; - } - - bool showStats = intconfig_->show_param_stats_period > 0 && - (batchId + 1) % intconfig_->show_param_stats_period == 0 && - intconfig_->trainer_id == 0; - - std::vector paraStats; - if (showStats) { - paraStats.resize(gradientMachine_->getParameters().size()); - } - - const std::vector& inArgs = dataBatch.getStreams(); - - PassType passType = parameterUpdater_->startBatch(actualBatchSize); - - if (config_->getOptConfig().use_sparse_remote_updater()) { - REGISTER_TIMER("prefetch"); - gradientMachine_->prefetch(inArgs); - parameterUpdater_->getParametersRemote(); - } - - UpdateCallback updateCallback = [this, showStats, ¶Stats]( - Parameter* para) { - if (showStats) { - //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor - // it - //! to ParameterHook. - auto& grad = para->getBuf(PARAMETER_GRADIENT); - SetDevice device(para->getDeviceId()); - paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize(); - paraStats[para->getID()].maxAbsGrad = grad->getAbsMax(); - } - parameterUpdater_->update(para); - }; - - { -#ifndef PADDLE_DISABLE_TIMER - Timer timer; - timer.start(); -#endif - REGISTER_TIMER("forwardBackward"); - forwardBackwardBatch( - inArgs, *outArgs, passType, updateCallback, doPipelineUpdate); -#ifndef PADDLE_DISABLE_TIMER - timer.stop(); - parameterUpdater_->setForwardbackwardTime(timer.get()); -#endif - } - - if (!doPipelineUpdate) { - auto& parameters = gradientMachine_->getNonStaticParameters(); - for (auto& para : parameters) { - updateCallback(para.get()); - } - } - - real cost = 0; - { - REGISTER_TIMER("sumCost"); - cost = Argument::sum(*outArgs); - } - - if (batchId % intconfig_->log_period == 0) { - currentEvaluator_->start(); - stats_->resetCurrentStat(); - } - { - REGISTER_TIMER("eval"); - gradientMachine_->eval(currentEvaluator_); - gradientMachine_->eval(evaluator_); - } - - *stats_ += {actualBatchSize, cost}; - { - REGISTER_TIMER("finishBatch"); - parameterUpdater_->finishBatch(cost); - } - - if (showStats) { - showParameterStats(paraStats); - } - if ((batchId + 1) % intconfig_->log_period == 0) { - currentEvaluator_->finish(); - - if (intconfig_->dot_period > 0) { - std::cerr << std::endl; - } - LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_ - << " Eval: " << *evaluator_ - << " CurrentEval: " << *currentEvaluator_; - } else if (intconfig_->dot_period > 0 && - (batchId + 1) % intconfig_->dot_period == 0) { - std::cerr << "."; - } -} - -/** - * finish train pass - */ -void TrainerInternal::finishTrainPass(int passId, int batchId) { - gradientMachine_->onPassEnd(); - parameterUpdater_->finishPass(); - 
evaluator_->finish(); - LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " " - << stats_->getStats(false /*without current cost*/) - << " Eval: " << *evaluator_; -} - -void TrainerInternal::showParameterStats( - const std::vector& paraStats) { - std::vector& parameters = gradientMachine_->getParameters(); - for (auto& parameter : parameters) { - SetDevice device(parameter->getDeviceId()); - real sum = parameter->getBuf(PARAMETER_VALUE)->getAbsSum(); - const auto& lr = parameter->getBuf(PARAMETER_LEARNING_RATE); - std::ostringstream osLrHistogram; - if (lr) { - if (VLOG_IS_ON(2)) { - osLrHistogram << " lr_histogram: "; - lr->histogram(osLrHistogram); - } else { - osLrHistogram << " max_lr=" << std::setw(11) << lr->getMax() - << " min_lr=" << std::setw(11) << lr->getMin() - << " avg_lr=" << std::setw(11) - << lr->getSum() / parameter->getSize(); - } - } - int pid = parameter->getID(); - LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') - << std::setw(20) << parameter->getName() - << " avg_abs_val=" << std::setw(11) << sum / parameter->getSize() - << " max_val=" << std::setw(11) - << parameter->getBuf(PARAMETER_VALUE)->getAbsMax() - << " avg_abs_grad=" << std::setw(11) << paraStats[pid].avgAbsGrad - << " max_grad=" << std::setw(11) << paraStats[pid].maxAbsGrad - << osLrHistogram.str(); - } -} - -void TrainerInternal::createParameterUpdater(bool testing) { - const std::string& alg = config_->getOptConfig().algorithm(); - parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater( - alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes)); - if (parameterUpdater_) { - return; - } - - if (!intconfig_->local) { - if (testing && config_->getOptConfig().use_sparse_remote_updater()) { - std::unique_ptr localUpdater; - localUpdater.reset( - new SgdLocalUpdater(config_->getOptConfig())); // do nothing - parameterUpdater_.reset( - new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(), - intconfig_->num_passes, - testing, - std::move(localUpdater))); - } else { - if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode && - !intconfig_->use_old_updater) { - intconfig_->use_old_updater = true; - LOG(INFO) << "Sgd sparse training can not work with" - << " ConcurrentRemoteParameterUpdater," - << " automatically reset --use_old_updater=true"; - } - - std::unique_ptr localUpdater; - if (config_->getOptConfig().num_batches_per_send_parameter() > 1) { - CHECK(alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) - << "Unsupported algorithm in remote-local mode: " << alg; - if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) { - localUpdater.reset(new SgdThreadUpdater(*config_)); - } else { - localUpdater.reset(new SgdLocalUpdater(*config_)); - } - } - - localUpdater.reset( - intconfig_->use_old_updater - ? 
new RemoteParameterUpdater( - *config_, intconfig_->num_passes, std::move(localUpdater)) - : new ConcurrentRemoteParameterUpdater( - *config_, intconfig_->num_passes, std::move(localUpdater))); - - if (config_->getOptConfig().use_sparse_remote_updater()) { - localUpdater.reset( - new SparseRemoteParameterUpdaterComposite(*config_, - intconfig_->num_passes, - testing, - std::move(localUpdater))); - } - - this->parameterUpdater_ = std::move(localUpdater); - } - } else { - CHECK_EQ(config_->getOptConfig().num_batches_per_send_parameter(), 1) - << "num_batches_per_send_parameter should be one in local mode!"; - - if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) { - parameterUpdater_.reset(new SgdThreadUpdater(*config_)); - } else if (alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) { - if (config_->getModelConfig().type() == "recursive_nn") { - parameterUpdater_.reset(new SgdCpuUpdater(*config_)); - } else if (intconfig_->use_gpu && - config_->getOptConfig().do_average_in_cpu() && - config_->getOptConfig().average_window() > 0) { - parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_)); - } else { - parameterUpdater_.reset(new SgdLocalUpdater(*config_)); - } - } else { - LOG(FATAL) << "Unsupported algorithm in local mode: " << alg; - } - } -} - -void TrainerInternal::forwardBackwardBatch(const std::vector& inArgs, - std::vector& outArgs, - PassType& passType, - UpdateCallback updateCallback, - bool doPipelineUpdate) { - gradientMachine_->forwardBackward( - inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr); -} - -} // namespace paddle diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h deleted file mode 100644 index 48ee53a5e60f950bfc3cc299c754b0e72601c818..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerInternal.h +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/utils/Util.h" - -#include -#include -#include - -#include "ParameterUpdater.h" -#include "TrainerConfig.pb.h" -#include "TrainerConfigHelper.h" -#include "TrainerInternalConfig.h" -#include "hl_gpu.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" - -namespace paddle { - -/** - * TrainerInteral - * the core training class for driving training logic - */ -class TrainerInternal { - public: - struct ParaStat { - real maxAbsGrad; - real avgAbsGrad; - ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {} - }; - - TrainerInternal() {} - - /** - * Intializes trainer internal class - * @param config network config - * @param machine gradient machine - * @param intconfig training config - * @param stats training stats - * @param testing if it is in testing phase - */ - void init(const std::shared_ptr& config, - const GradientMachinePtr& machine, - std::unique_ptr&& intconfig, - const std::shared_ptr& stats, - bool testing); - - virtual ~TrainerInternal() {} - - /** - * CreateParameterUpdater - * @param testing if it is in testing phase - */ - void createParameterUpdater(bool testing); - - /** - * FinishTrainPass - * @param passId current pass id - * @param batchId current batch id, starts from 0 - */ - void finishTrainPass(int passId, int batchId); - - /** - * trainOneBatch - * @param batchId current batch id - * @param dataBatch data for the batch - */ - void trainOneBatch(int64_t batchId, - const DataBatch& dataBatch, - std::vector* outArgs); - - /** - * showParameterStats - * @param paraStats training stats - */ - void showParameterStats(const std::vector& paraStats); - - /** - * getGradientMachine - */ - inline const GradientMachinePtr& getGradientMachine() const { - return gradientMachine_; - } - - /** - * getParameterUpdater - */ - inline const std::shared_ptr& getParameterUpdater() { - return parameterUpdater_; - } - - /** - * setCurrentEvaluator - * @param eval evaluator to set - */ - inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; } - - /** - * setEvaluator - * @param eval evaluator to set - */ - inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; } - - /** - * forwardBackwardBatch - * @param inArgs input argument for data batch - * @param outArgs output argument from neural network - * @param updateCallback layerwise parameter gradient statistics - * @param doPipelineUpdate whether to do pipeline update - */ - virtual void forwardBackwardBatch(const std::vector& inArgs, - std::vector& outArgs, - PassType& passType, - UpdateCallback updateCallback, - bool doPipelineUpdate); - - protected: - std::shared_ptr parameterUpdater_; - GradientMachinePtr gradientMachine_; - std::shared_ptr config_; - std::unique_ptr intconfig_; - std::shared_ptr stats_; - Evaluator* currentEvaluator_; - Evaluator* evaluator_; -}; - -} // namespace paddle diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h deleted file mode 100644 index 43aae381029784278ad58c9398f64af24dffa1df..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerInternalConfig.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/utils/Util.h" - -#include - -#include "hl_gpu.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" - -#include "TrainerConfig.pb.h" - -#include -#include -#include -#include "ParameterUpdater.h" - -namespace paddle { -/** - * @brief TrainerStats object will statistics sample processed and total cost. - * - * There are two stats in it, the 'AvgCost' and 'CurrentAvgCost'. 'AvgCost' - * means cost through one pass(all mini-batches). 'CurrentAvgCost' means cost - * through one mini-batch. - */ -class TrainerStats { - public: - /** - * @brief reset all stats. - * - * often used before pass start. - */ - inline void reset() { - numProcessed_ = 0; - totalCost_ = .0; - this->resetCurrentStat(); - } - - /** - * @brief reset current stat. - * - * 'current' means the most recent --log_period mini-batches - */ - inline void resetCurrentStat() { - currentCost_ = .0; - currentSamples_ = 0; - } - - /** - * @brief add cost to stat. - * @param numProcessed current mini-batch size - * @param cost current mini-batch cost - */ - inline void addCost(int64_t numProcessed, real cost) { - this->numProcessed_ += numProcessed; - this->totalCost_ += cost; - this->currentSamples_ += numProcessed; - this->currentCost_ += cost; - } - - /** - * @brief get average cost through on pass(all processed mini-batches) - * @return pass average cost - */ - inline real getAvgCost() const { - CHECK_NE(this->numProcessed_, 0); - return this->totalCost_ / this->numProcessed_; - } - - /** - * @brief get current mini-batch's average cost. - * @return mini-batch average cost - */ - inline real getCurrentAvgCost() const { - CHECK_NE(this->currentSamples_, 0); - return this->currentCost_ / this->currentSamples_; - } - - /** - * @brief get all processed samples' number - * @return all processed samples' number - */ - inline int64_t getNumProcessed() const { return this->numProcessed_; } - - /** - * @brief same function as addCost. But it is simple to invoke. - * For example: - * - * @code{.cpp} - * TrainerStats stat; - * cost = neuralNetwork.forward(batchSize); - * stat += {batchSize, cost}; - * @endcode - * - * @param p a pair of parameter, first is numProcessed, second is cost. - * @return *this - */ - inline TrainerStats& operator+=(const std::pair& p) { - this->addCost(p.first, p.second); - return *this; - } - - /** - * @brief TrainerStats Constructor. - * - * reset stat when constructed. - */ - inline TrainerStats() { this->reset(); } - - /** - * @brief show stats to ostream. - * - * If there is no need to print current cost, set withCurrentCost to False. - * - * @param os output stream. - * @param withCurrentCost print current cost or not. 
- */ - void showStats(std::ostream& os, bool withCurrentCost = true) const { - os << "samples=" << this->getNumProcessed() - << " AvgCost=" << this->getAvgCost(); - if (withCurrentCost) { - os << " CurrentCost=" << this->getCurrentAvgCost(); - } - } - - /** - * @brief get stats to std::string - * @param withCurrentCost return current cost or not - * @return stats string - */ - std::string getStats(bool withCurrentCost = true) const { - std::ostringstream os; - this->showStats(os, withCurrentCost); - return os.str(); - } - - private: - int64_t numProcessed_; - real totalCost_; - real currentCost_; - int64_t currentSamples_; -}; - -inline std::ostream& operator<<(std::ostream& os, const TrainerStats& stats) { - stats.showStats(os); - return os; -} - -/** - * TrainerInternalConfig - * general configs for training - */ -struct TrainerInternalConfig { - /** - * @brief Create TrainerInternalConfig from GradientMachine::CreateMode and - * command line arguments. - * @param mode - * @return - */ - static std::unique_ptr createFromMode( - GradientMachine::CreateMode mode); - - /** - * indicate whether the training is local - * if local, no parameter server is used - */ - bool local; - - /** - * indicate whether training uses GPU - */ - bool use_gpu; - - /** - * indicate number of trainer - */ - int trainer_count; - - /** - * how frequently to show param stats - */ - int show_param_stats_period; - - /** - * current trainer id - */ - int trainer_id; - - /** - * frequency to dump log - */ - int log_period; - - /** - * dot period - */ - int dot_period; - - /** - * num passes for training - */ - int num_passes; - - /** - * use old updater - */ - bool use_old_updater; - - /** - * whether to load and save parameter in pserver - */ - bool loadsave_parameters_in_pserver; - - /** - * training mode - */ - GradientMachine::CreateMode mode; -}; - -} // namespace paddle diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp deleted file mode 100644 index c5c1d484e5f85c774fd4b8f1d4a8d46abfa2f547..0000000000000000000000000000000000000000 --- a/paddle/trainer/TrainerMain.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/pserver/ParameterServerController.h" -#include "paddle/utils/PythonUtil.h" - -#include "ParamUtil.h" -#include "Trainer.h" - -DEFINE_bool(start_pserver, false, "Whether to start pserver"); -DECLARE_int32(gpu_id); -DEFINE_string(job, "train", "one of (train, test, checkgrad)"); -DECLARE_int32(start_pass); -DECLARE_string(config); -DECLARE_string(init_model_path); -DECLARE_string(rdma_tcp); - -using namespace paddle; // NOLINT - -int main(int argc, char** argv) { - // write logs instantly (never buffer log messages) - FLAGS_logbuflevel = -1; - - initMain(argc, argv); - initPython(argc, argv); - - std::unique_ptr parameterServerPtr(nullptr); - if (FLAGS_start_pserver) { - parameterServerPtr.reset( - paddle::ParameterServerController::createFromGflags()); - parameterServerPtr->start(); - } - Trainer trainer; - auto config = TrainerConfigHelper::createFromFlags(); - CHECK(config != nullptr) << "no valid config"; - - feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); - trainer.init(config, FLAGS_job == "test"); - - if (FLAGS_job == "train") { - trainer.train(); - } else if (FLAGS_job == "checkgrad") { - trainer.checkGradient(); - } else if (FLAGS_job == "test") { - trainer.test(); - } else if (FLAGS_job == "time") { - trainer.time(); - } else { - LOG(FATAL) << "Unknown job type: " << FLAGS_job; - } - - return 0; -} diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt deleted file mode 100644 index 12c9ea8cef79a6bdbd6e26c35612d0abbe00257b..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 @@ -add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf - COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR} -) -add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf) - -set(PYTHON_PATH - ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/trainer/tests) -function(trainer_test TARGET) - add_unittest_without_exec(${TARGET} ${TARGET}.cpp) - add_test(NAME ${TARGET} - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) -endfunction() - -trainer_test(test_Compare) -trainer_test(test_PyDataProviderWrapper) -trainer_test(test_recurrent_machine_generation) -trainer_test(test_Trainer) - -############### test_TrainerOnePass ########################## -if(WITH_PYTHON) - # only run test_TrainerOnePass when PYTHON is enabled, because train one pass - # is using PyDataProvider2. - add_unittest_without_exec(test_TrainerOnePass - test_TrainerOnePass.cpp) - add_test(NAME test_TrainerOnePass - COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port - ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) -endif() - -#################### test_config_parser ######################### -add_test(NAME test_config_parser - COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} - ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py - WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/) diff --git a/paddle/trainer/tests/config_parser_test.py b/paddle/trainer/tests/config_parser_test.py deleted file mode 100644 index db66ebb5b7c13fe53df14a07918aad62ba895ffa..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/config_parser_test.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.config_parser import parse_config_and_serialize - -if __name__ == '__main__': - parse_config_and_serialize('trainer/tests/test_config.conf', '') - parse_config_and_serialize( - 'trainer/tests/sample_trainer_config.conf', - 'extension_module_name=paddle.trainer.config_parser_extension') - parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '') diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list deleted file mode 100644 index 0db50f34dd24b5e6fbc33a1e8dd3c16cb59eb56e..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list +++ /dev/null @@ -1 +0,0 @@ -trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data diff --git a/paddle/trainer/tests/sample_filelist.txt b/paddle/trainer/tests/sample_filelist.txt deleted file mode 100644 index 7db4c735359a380dc150e24368653d2a6a55a453..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_filelist.txt +++ /dev/null @@ -1 +0,0 @@ -trainer/tests/sample_data.txt diff --git a/paddle/trainer/tests/sample_trainer_config.conf b/paddle/trainer/tests/sample_trainer_config.conf deleted file mode 100644 index 2697832840f35a33c07f1664ef18a229d656d784..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config.conf +++ /dev/null @@ -1,87 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -TrainData(SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000)) - -TestData(SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000)) - -settings(batch_size = 100) - -data = data_layer(name='input', size=3) - -fc1 = fc_layer(input=data, size=5, - bias_attr=False, - act=SigmoidActivation()) - -fc2 = fc_layer(input=data, size=9, - bias_attr=False, - act=LinearActivation()) - -fc3 = fc_layer(input=data, size=3, - bias_attr=False, - act=TanhActivation()) - -fc4 = fc_layer(input=data, size=5, - bias_attr=False, - act=LinearActivation(), - param_attr=ParamAttr(name='sharew')) - -fc5 = fc_layer(input=data, size=5, - bias_attr=False, - act=BReluActivation()) - -fc6 = fc_layer(input=data, size=5, - bias_attr=False, - act=SoftReluActivation()) - -fc7 = fc_layer(input=data, size=3, - bias_attr=False, - act=SquareActivation()) - -fc8 = fc_layer(input=data, size=5, - bias_attr=True, - act=SquareActivation()) - -with mixed_layer(size=3, act=SoftmaxActivation()) as layer9: - layer9 += full_matrix_projection(input=fc1) - layer9 += full_matrix_projection(input=fc2) - layer9 += full_matrix_projection(input=fc3) - layer9 += trans_full_matrix_projection(input=fc4, - param_attr=ParamAttr(name='sharew')) - layer9 += full_matrix_projection(input=fc5) - layer9 += full_matrix_projection(input=fc6) - layer9 += full_matrix_projection(input=fc7) - layer9 += full_matrix_projection(input=fc8) - -if get_config_arg('with_cost', bool, True): - # This is for training the neural network. - # We need to have another data layer for label - # and a layer for calculating cost - lbl = data_layer(name='label', size=1) - outputs(classification_cost(input=layer9, label=lbl)) -else: - # This is for prediction where we don't have label - # and don't need to calculate cost - outputs(layer9) diff --git a/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf deleted file mode 100644 index e4abe31d480b69bc2ff4741649b336714818515b..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf +++ /dev/null @@ -1,53 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from paddle.trainer_config_helpers import * - -TrainData(SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000, -)) - -settings(batch_size = 100) - -data = data_layer(name='input', size=3) - -fc1 = fc_layer(input=data, size=12, - bias_attr=False, - act=SigmoidActivation()) - -fc2 = fc_layer(input=data, size=19, - bias_attr=False, - act=LinearActivation()) - -fc3 = fc_layer(input=data, size=5, - bias_attr=False, - act=TanhActivation()) - -fc4 = fc_layer(input=data, size=5, - bias_attr=False, - act=LinearActivation()) - -# This is for training the neural network. -# We need to have another data layer for label -# and a layer for calculating cost -lbl = data_layer(name='label', size=1) - -outputs(hsigmoid(input=[fc1, fc2, fc3, fc4], - label=lbl, - num_classes=3)) diff --git a/paddle/trainer/tests/sample_trainer_config_parallel.conf b/paddle/trainer/tests/sample_trainer_config_parallel.conf deleted file mode 100644 index e2b8b3ecdab83b4614dbe468c3a295c05867f7f9..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_config_parallel.conf +++ /dev/null @@ -1,86 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -TrainData(SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000)) - -TestData(SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000)) - -settings(batch_size = 100) - -# Output layer, label layer, cost layer, preferably set to the same environment. -output_device = 0 - -# Input Layer does not need to specify the device number. -data = data_layer(name='input', size=3) - -# Calculate in the CPU. -fc1 = fc_layer(input=data, size=5, - bias_attr=True, - layer_attr=ExtraAttr(device=-1), - act=SigmoidActivation()) - -# Calculate in the GPU 0. -fc2 = fc_layer(input=fc1, size=10, - bias_attr=True, - layer_attr=ExtraAttr(device=0), - act=SigmoidActivation()) - -# Calculate in the GPU 1. -fc3 = fc_layer(input=fc1, size=10, - bias_attr=True, - layer_attr=ExtraAttr(device=1), - act=SigmoidActivation()) - -# Calculate in the GPU 0. -fc4 = fc_layer(input=[fc2,fc3], size=10, - bias_attr=True, - layer_attr=ExtraAttr(device=0), - act=SigmoidActivation()) - -# Calculate in the GPU 1. -fc5 = fc_layer(input=[fc2,fc3], size=10, - bias_attr=True, - layer_attr=ExtraAttr(device=1), - act=SigmoidActivation()) - -output = fc_layer(input=[fc4,fc5], size=10, - bias_attr=True, - layer_attr=ExtraAttr(device=output_device), - act=SoftmaxActivation()) - -if get_config_arg('with_cost', bool, True): - # This is for training the neural network. 
- # We need to have another data layer for label - # and a layer for calculating cost - lbl = data_layer(name='label', size=1, - layer_attr=ExtraAttr(device=output_device)) - - outputs(classification_cost(input=output, - label=lbl, - layer_attr=ExtraAttr(device=output_device))) -else: - # This is for prediction where we don't have label - # and don't need to calculate cost - outputs(output) diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf deleted file mode 100644 index 741a0aa71df7866c180ab2513f28638117d0f1ca..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf +++ /dev/null @@ -1,73 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from paddle.trainer_config_helpers import * - -settings(batch_size=15, learning_rate=0) - -num_words = 5 -beam_flag = get_config_arg('beam_search', bool, False) - -sent_id = data_layer(name="sent_id", size=1) - -# This layer has no actual use, but only to decide batch_size in generation. -# When generating, at least one Memory in RecurrentLayer MUST have a boot layer. -dummy_data = data_layer(name="dummy_data_input", size=2) - -def outer_step(dummy_data): - - gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True), - GeneratedInput(size=num_words, - embedding_name="wordvec", - embedding_size=num_words)] - - def inner_step(dummy_memory, predict_word): - - # simplified RNN for testing - with mixed_layer(size=num_words) as layer: - layer += full_matrix_projection(input=predict_word, - param_attr=ParamAttr(name="transtable")) - - with mixed_layer(size=num_words, act=ExpActivation()) as out: - out += trans_full_matrix_projection(input=layer, - param_attr=ParamAttr(name="wordvec")) - - return out - - beam_gen = beam_search(name="rnn_gen", - step=inner_step, - input=gen_inputs, - bos_id=0, - eos_id=num_words-1, - beam_size=2 if beam_flag else 1, - num_results_per_sample=1, - max_length=10) - return beam_gen - -beam_gen_concat = recurrent_group(name="rnn_gen_concat", - step=outer_step, - input=[SubsequenceInput(dummy_data)]) - -seqtext_printer_evaluator(input=beam_gen_concat, - id_input=sent_id, - dict_file="./trainer/tests/test_gen_dict.txt", - result_file="./trainer/tests/dump_text.test") -#outputs(beam_gen_concat) -# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory -# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs -# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. 
-Inputs("sent_id","dummy_data_input") -Outputs("__beam_search_predict__") diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf deleted file mode 100644 index 58d27f15ae1c0a38885ee105a7963b6e7bd55906..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ /dev/null @@ -1,66 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from paddle.trainer_config_helpers import * - -settings(batch_size=15, learning_rate=0) - -num_words = 5 -beam_flag = get_config_arg('beam_search', bool, False) - -sent_id = data_layer(name="sent_id", size=1) - -# This layer has no actual use, but only to decide batch_size in generation. -# When generating, at least one Memory in RecurrentLayer MUST have a boot layer. -dummy_data = data_layer(name="dummy_data_input", size=2) - -gen_inputs = [StaticInput(input=dummy_data, size=2), - GeneratedInput(size=num_words, - embedding_name="wordvec", - embedding_size=num_words)] - -def step(dummy_memory, predict_word): - - # simplified RNN for testing - with mixed_layer(size=num_words) as layer: - layer += full_matrix_projection(input=predict_word, - param_attr=ParamAttr(name="transtable")) - - with mixed_layer(size=num_words, act=ExpActivation()) as out: - out += trans_full_matrix_projection(input=layer, - param_attr=ParamAttr(name="wordvec")) - - return out - -beam_gen = beam_search(name="rnn_gen", - step=step, - input=gen_inputs, - bos_id=0, - eos_id=num_words-1, - beam_size=2 if beam_flag else 1, - num_results_per_sample=2 if beam_flag else 1, - max_length=10) - -seqtext_printer_evaluator(input=beam_gen, - id_input=sent_id, - dict_file="./trainer/tests/test_gen_dict.txt", - result_file="./trainer/tests/dump_text.test") -#outputs(beam_gen) -# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory -# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs -# as follows. Note that "__beam_search_predict__" is the default output name of beam_search. -Inputs("sent_id","dummy_data_input") -Outputs("__beam_search_predict__") diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py deleted file mode 100644 index 970fb466dc5061713fe7815d5247cbbde93be821..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/simple_sparse_neural_network.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4) - -file_list = 'trainer/tests/fake_file_list.list' - -define_py_data_sources2( - train_list=file_list, - test_list=file_list, - module="simple_sparse_neural_network_dp", - obj="process") - -embedding = embedding_layer( - input=data_layer( - name="word_ids", size=8191), - size=128, - param_attr=ParamAttr(sparse_update=True)) -prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation()) - -outputs( - classification_cost( - input=prediction, label=data_layer( - name='label', size=10))) diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp deleted file mode 100644 index f3a964acb69be059a43470f7b68910a3b6cecaab..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_Compare.cpp +++ /dev/null @@ -1,157 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/trainer/Trainer.h" - -#include -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static const string& configFile = "trainer/tests/sample_trainer_config.conf"; - -DECLARE_int32(gpu_id); -DECLARE_bool(use_gpu); -DECLARE_string(config); -DECLARE_string(config_args); - -struct comData { - vector outArgs; - vector parameters; -}; - -void calcGradient(bool useGpu, comData& Data) { - FLAGS_use_gpu = useGpu; - FLAGS_config = configFile; - - *ThreadLocalRand::getSeed() = 0; - srand(0); - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig()); - - Data.parameters = trainer.getGradientMachine()->getParameters(); - DataBatch dataBatch; - int32_t batchSize = trainer.getConfig().opt_config().batch_size(); - trainer.getDataProvider()->setSkipShuffle(); - trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); - CHECK(dataBatch.getSize()) << "No data from data provider"; - vector& inArgs = dataBatch.getStreams(); - trainer.getGradientMachine()->start(); - for (int i = 0; i < 2; ++i) { - trainer.getGradientMachine()->forwardBackward( - inArgs, &Data.outArgs, PASS_TRAIN); - } - trainer.getGradientMachine()->finish(); -} - -void compareGradient(comData& comDataCpu, comData& comDataGpu); - -TEST(Trainer, create) { - int devCount = 0; - devCount = hl_get_device_count(); - FLAGS_config_args = "drop_rate=0"; - - comData comDataCpu; - calcGradient(false, comDataCpu); - LOG(INFO) << "Cpu is completed"; - - { - LOG(INFO) << "Test GPU"; - comData comData; - calcGradient(true, comData); - compareGradient(comDataCpu, comData); - LOG(INFO) << "Gpu is completed"; - } - - { - LOG(INFO) << "Test test multi gpu"; - comData comData; - FLAGS_trainer_count = devCount; - calcGradient(true, comData); - compareGradient(comDataCpu, comData); - LOG(INFO) << "Gpu4 is completed"; - } - - { - LOG(INFO) << "Test use_sparse_update=true"; - comData comData; - calcGradient(false, comData); - compareGradient(comDataCpu, comData); - LOG(INFO) << "Cpu4 is completed"; - } -} - -double checkBuffer(real* A, real* B, size_t len) { -#ifdef PADDLE_TYPE_DOUBLE - double precision = 1e-7; -#else - double precision = 2e-3; -#endif - int nNum = 0; - double maxE = 0; - for (size_t i = 0; i < len; ++i) { - double e = fabs(A[i] - B[i]); - maxE = std::max(e, maxE); - nNum += e > precision * fabs(A[i]); - } - EXPECT_EQ(0, nNum); - return maxE; -} - -void compareGradient(comData& comDataCpu, comData& comDataGpu) { - /*compare outArgs*/ - vector outArgs1 = comDataCpu.outArgs; - vector outArgs2 = comDataGpu.outArgs; - CpuMatrix out1(outArgs1[0].value->getHeight(), outArgs1[0].value->getWidth()); - CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth()); - out1.copyFrom(*outArgs1[0].value); - out2.copyFrom(*outArgs2[0].value); - checkBuffer(out1.getData(), out2.getData(), out1.getElementCnt()); - - /*compare parameters*/ - vector& parameters1 = comDataCpu.parameters; - vector& parameters2 = comDataGpu.parameters; - for (size_t i = 0; i < parameters1.size(); ++i) { - ParameterPtr parameter1, parameter2; - parameter1 = parameters1[i]; - parameter2 = parameters2[i]; - /*compare parameters value*/ - CpuVector para1(parameter1->getSize()); - CpuVector para2(parameter2->getSize()); - para1.copyFrom(*parameter1->getBuf(PARAMETER_VALUE)); - para2.copyFrom(*parameter2->getBuf(PARAMETER_VALUE)); - checkBuffer(para1.getData(), para2.getData(), para1.getSize()); - - /*compare parameters grad*/ - CpuVector cpuGrad1(*parameter1->getBuf(PARAMETER_GRADIENT)); 
- CpuVector cpuGrad2(*parameter2->getBuf(PARAMETER_GRADIENT)); - double e = - checkBuffer(cpuGrad1.getData(), cpuGrad2.getData(), cpuGrad1.getSize()); - LOG(INFO) << parameter1->getName() << " max error=" << e; - } -} - -int main(int argc, char** argv) { -#ifndef PADDLE_WITH_CUDA - exit(0); -#endif - paddle::initMain(argc, argv); - testing::InitGoogleTest(&argc, argv); - initPython(argc, argv); - int ret = RUN_ALL_TESTS(); - exit(ret); -} diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp deleted file mode 100644 index 92dc8aa9ec5ce281d1950d84260c1b9555e686a7..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_NO_PYTHON -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "picojson.h" - -void checkValue(std::vector& arguments, picojson::array& arr); -const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/"; - -TEST(PyDataProviderWrapper, SequenceData) { - paddle::DataConfig conf; - conf.set_type("py"); - conf.set_load_data_module("testPyDataWrapper"); - conf.set_load_data_object("processSeqAndGenerateData"); - conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json"); - conf.clear_files(); - conf.set_files(kDir + "test_pydata_provider_wrapper.list"); - paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batchFromPy; - provider->getNextBatch(100, &batchFromPy); - - picojson::value val; - std::fstream fin; - fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in); - EXPECT_TRUE(fin.is_open()); - if (fin.is_open()) { - std::string err = picojson::parse(val, fin); - EXPECT_TRUE(err.empty()); - EXPECT_TRUE(val.is()); - picojson::array& arr = val.get(); - std::vector& arguments = batchFromPy.getStreams(); - // CHECK Value - checkValue(arguments, arr); - // CHECK sequenceStartPositions - for (size_t i = 0; i < arr.size(); i++) { - int row_id = arr[i].get().size(); - EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]); - EXPECT_EQ((int)row_id, - arguments[i].sequenceStartPositions->getData(false)[1]); - } - fin.close(); - } -} - -TEST(PyDataProviderWrapper, HasSubSequenceData) { - paddle::DataConfig conf; - conf.set_type("py"); - conf.set_load_data_module("testPyDataWrapper"); - conf.set_load_data_object("processSubSeqAndGenerateData"); - conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json"); - conf.clear_files(); - conf.set_files(kDir + "test_pydata_provider_wrapper.list"); - paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false)); - provider->setSkipShuffle(); - provider->reset(); - paddle::DataBatch batchFromPy; - provider->getNextBatch(1, &batchFromPy); - - picojson::value val; - std::fstream fin; - 
fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in); - EXPECT_TRUE(fin.is_open()); - if (fin.is_open()) { - std::string err = picojson::parse(val, fin); - EXPECT_TRUE(err.empty()); - EXPECT_TRUE(val.is()); - picojson::array& arr = val.get(); - std::vector& arguments = batchFromPy.getStreams(); - // CHECK Value - checkValue(arguments, arr); - // CHECK sequenceStartPositions and subSequenceStartPositions - for (size_t i = 0; i < arr.size(); i++) { - int row_id = arr[i].get().size(); - EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]); - EXPECT_EQ((int)row_id, - arguments[i].sequenceStartPositions->getData(false)[1]); - EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]); - EXPECT_EQ((int)row_id, - arguments[i].subSequenceStartPositions->getData(false)[1]); - } - fin.close(); - } -} - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - paddle::initPython(argc, argv); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - -void checkValue(std::vector& arguments, - picojson::array& arr) { - // CHECK SLOT 0, Sparse Value. - paddle::Argument& sparse_values_seq = arguments[0]; - paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value; - EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr); - paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat = - dynamic_cast(sparse_values_seq_rawmatrix.get()); - EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr); - EXPECT_EQ(arr.size(), arguments.size()); - EXPECT_TRUE(arr[0].is()); - size_t row_id = 0; - for (picojson::value& sparse_val_seq : arr[0].get()) { - std::unordered_map cols; - for (picojson::value& kv : sparse_val_seq.get()) { - EXPECT_TRUE(kv.get(0).is()); - EXPECT_TRUE(kv.get(1).is()); - int col = (int)(kv.get(0).get()); - real val = (real)(kv.get(1).get()); - cols.insert({col, val}); - } - size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id); - EXPECT_EQ(cols.size(), colNum); - int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id); - real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id); - for (size_t i = 0; i < colNum; ++i) { - int id = rowIds[i]; - auto it = cols.find(id); - EXPECT_NE(cols.end(), it); - real expect = it->second; - EXPECT_NEAR(expect, *rowBuf, 1e-5); - ++rowBuf; - } - ++row_id; - } - - // CHECK SLOT 1, Dense Value. - paddle::Argument& dense_arg = arguments[1]; - paddle::MatrixPtr& dense_mat = dense_arg.value; - EXPECT_NE(nullptr, dense_mat); - EXPECT_TRUE(arr[1].is()); - row_id = 0; - for (picojson::value& dense_seq : arr[1].get()) { - EXPECT_TRUE(dense_seq.is()); - picojson::array& row = dense_seq.get(); - EXPECT_EQ(row.size(), dense_mat->getWidth()); - real* rowBuf = dense_mat->getRowBuf(row_id++); - - for (picojson::value& val : row) { - EXPECT_TRUE(val.is()); - real expect = val.get(); - EXPECT_NEAR(expect, *rowBuf++, 1e-5); - } - } - - // CHECK SLOT 2, Sparse Non Value. 
- paddle::Argument& sparse_non_val_arg = arguments[2]; - paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value; - EXPECT_NE(nullptr, sparse_non_val_rawm); - paddle::CpuSparseMatrix* sparse_non_val_m = - dynamic_cast(sparse_non_val_rawm.get()); - EXPECT_NE(nullptr, sparse_non_val_m); - row_id = 0; - for (picojson::value& row : arr[2].get()) { - EXPECT_TRUE(row.is()); - std::unordered_set ids; - for (picojson::value& id : row.get()) { - EXPECT_TRUE(id.is()); - ids.insert((int)(id.get())); - } - size_t colNum = sparse_non_val_m->getColNum(row_id); - EXPECT_EQ(ids.size(), colNum); - for (size_t i = 0; i < colNum; ++i) { - int col = sparse_non_val_m->getRowCols(row_id)[i]; - EXPECT_TRUE(ids.find(col) != ids.end()); - } - ++row_id; - } - - // CHECK SLOT 3, Index. - paddle::Argument& index_arg = arguments[3]; - paddle::IVectorPtr indices = index_arg.ids; - EXPECT_NE(nullptr, indices); - int* idPtr = indices->getData(); - for (picojson::value& id : arr[3].get()) { - EXPECT_TRUE(id.is()); - int _id = (int)(id.get()); - EXPECT_EQ(_id, *idPtr++); - } - - // CHECK SLOT 4, String. - paddle::Argument& strArg = arguments[4]; - std::vector* strPtr = strArg.strs.get(); - EXPECT_NE(nullptr, strPtr); - size_t vecIndex = 0; - for (picojson::value& str : arr[4].get()) { - EXPECT_TRUE(str.is()); - std::string _str = str.get(); - EXPECT_EQ(_str, (*strPtr)[vecIndex++]); - } -} - -#else -int main() { return 0; } - -#endif diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp deleted file mode 100644 index 394038cf730f13cb957fbbc5ae0e5719b8fe9db6..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_Trainer.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/trainer/Trainer.h" - -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static const string& configFile1 = "trainer/tests/sample_trainer_config.conf"; -static const string& configFile2 = - "trainer/tests/sample_trainer_config_hsigmoid.conf"; -static const string& configFile4 = - "trainer/tests/sample_trainer_config_parallel.conf"; - -DECLARE_bool(use_gpu); -DECLARE_string(config); -DECLARE_int32(gpu_id); -DECLARE_bool(allow_only_one_model_on_one_gpu); - -void checkGradientTest(const string& configFile, - bool useGpu, - bool parallel, - int trainerCount = 1) { - FLAGS_use_gpu = useGpu; - FLAGS_parallel_nn = parallel; - FLAGS_config = configFile; - FLAGS_trainer_count = trainerCount; - LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount - << " configFile=" << configFile; - - Trainer trainer; - trainer.init(TrainerConfigHelper::createFromFlagConfig()); - EXPECT_LE(fabs(trainer.checkGradient()), 0.02); -} - -TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); } - -#ifdef PADDLE_WITH_CUDA -TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); } - -TEST(checkGradient, multiGpu) { - int numGpu; - numGpu = hl_get_device_count(); - for (auto count : {2, 4}) { - if (count <= numGpu) { - checkGradientTest(configFile1, true, false, count); - } - } -} - -TEST(checkGradient, parallel) { - if (hl_get_device_count() >= 2) { - checkGradientTest(configFile4, true, true); - } -} - -TEST(checkGradient, multiParallel) { - FLAGS_allow_only_one_model_on_one_gpu = false; - checkGradientTest(configFile4, true, true, 2); - FLAGS_allow_only_one_model_on_one_gpu = true; -} - -#endif - -TEST(checkGradient, multi) { - int numGpu; - if (version::isWithGpu()) { - numGpu = hl_get_device_count(); - } else { - numGpu = 0; - } - for (bool useGpu : {false, true}) { - for (auto count : {2, 4}) { - if (useGpu && count > numGpu) continue; - checkGradientTest(configFile1, useGpu, false, count); - } - } -} - -TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } - -TEST(checkGradient, non_parallel) { - checkGradientTest(configFile4, false, false); -} - -int main(int argc, char** argv) { - initMain(argc, argv); - initPython(argc, argv); - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp deleted file mode 100644 index de12c4d649c6041f497c0eeac0904ebfc0d5bf97..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/trainer/Trainer.h" -#include "paddle/trainer/TrainerInternal.h" - -#include -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static const string& configFile1 = "trainer/tests/sample_trainer_config.conf"; -static const string& configFile2 = - "trainer/tests/sample_trainer_config_parallel.conf"; - -static const string& configFileSimpleSparse = - "trainer/tests/simple_sparse_neural_network.py"; - -DECLARE_bool(use_gpu); -DECLARE_string(config); -DECLARE_int32(gpu_id); -DECLARE_int32(seed); -DECLARE_int32(num_passes); -DECLARE_int32(saving_period); - -class TrainerForTest : public paddle::Trainer { - public: - inline const std::shared_ptr& getParameterUpdaterForTest() { - return this->trainerInternal_.getParameterUpdater(); - } -}; - -int gNumDevices = 0; - -void trainerOnePassTest(const string& configFile, - bool useGpu, - bool parallel, - int trainerCount = 1, - double averageWindow = 0.0f, - bool doAverageInCpu = false) { - FLAGS_use_gpu = useGpu; - FLAGS_parallel_nn = parallel; - FLAGS_config = configFile; - FLAGS_trainer_count = trainerCount; - LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount - << " configFile=" << configFile; - srand(FLAGS_seed); - - if (useGpu) { - if (gNumDevices < trainerCount) { - return; - } - } - - Trainer trainer; - auto config = TrainerConfigHelper::createFromFlagConfig(); - if (averageWindow > 0) { - config->getOptConfig().set_average_window(averageWindow); - config->getOptConfig().set_do_average_in_cpu(doAverageInCpu); - } - trainer.init(config); - trainer.train(); -} - -// 1. test trainer (cpu, gpu). -TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); } - -#ifdef PADDLE_WITH_CUDA -TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); } - -TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); } - -TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); } - -TEST(trainerOnePass, parallel) { - if (hl_get_device_count() >= 2) { - trainerOnePassTest(configFile2, true, true); - } -} -#endif - -// 2. test average_window. -#ifdef PADDLE_WITH_CUDA -TEST(average_window, gpu) { - trainerOnePassTest(configFile1, true, false, 4, 0.01); -} - -TEST(average_window, gpu2) { - FLAGS_num_passes = 20; - trainerOnePassTest(configFile1, true, false, 2, 0.01); - FLAGS_num_passes = 1; -} - -TEST(average_window, gpu4) { - FLAGS_num_passes = 20; - trainerOnePassTest(configFile1, true, false, 4, 0.01); - FLAGS_num_passes = 1; -} - -TEST(average_window_cpu, gpu2) { - FLAGS_num_passes = 20; - trainerOnePassTest(configFile1, true, false, 2, 0.01, true); - FLAGS_num_passes = 1; -} - -TEST(average_window_cpu, gpu4) { - FLAGS_num_passes = 20; - trainerOnePassTest(configFile1, true, false, 4, 0.01, true); - FLAGS_num_passes = 1; -} -#endif - -// 3. test trainer + pserver. 
-DECLARE_int32(num_gradient_servers); -DECLARE_int32(port); -DECLARE_bool(local); -DECLARE_bool(use_old_updater); - -double checkRemoteParameterUpdater(TrainerForTest& trainer) { - auto gradientMachine = trainer.getGradientMachine(); - auto parameterUpdater = trainer.getParameterUpdaterForTest(); - auto dataProvider = trainer.getDataProvider(); - auto& parameters = gradientMachine->getParameters(); - const TrainerConfig& config = trainer.getConfig(); - const string& alg = config.opt_config().algorithm(); - - vector parameterCheck; - for (auto& parameter : parameters) { - parameterCheck.emplace_back( - new Parameter(parameter->getConfig(), /* useGpu= */ false)); - parameterCheck.back() - ->getBuf(PARAMETER_VALUE) - ->copyFrom(*parameter->getBuf(PARAMETER_VALUE)); - parameterCheck.back() - ->getBuf(PARAMETER_GRADIENT) - ->copyFrom(*parameter->getBuf(PARAMETER_GRADIENT)); - } - - std::unique_ptr parameterUpdaterCheck; - if (alg == TrainAlgorithm::SGD) { - parameterUpdaterCheck.reset(new SgdLocalUpdater(config.opt_config())); - } else { - LOG(INFO) << "unsupported algorithm in remote parameter check: " << alg; - return -1.0; - } - parameterUpdaterCheck->init(parameterCheck); - - // gradientMachine->start(config, *dataProvider); - DataBatch dataBatch; - int32_t batchSize = config.opt_config().batch_size(); - dataProvider->getNextBatch(batchSize, &dataBatch); - CHECK(dataBatch.getSize()) << "No data from data provider"; - int64_t actualBatchSize = dataBatch.getSize(); - const vector& inArgs = dataBatch.getStreams(); - vector outArgs; - - UpdateCallback updateCallback = [parameterUpdater, - parameterCheck](Parameter* para) { - parameterCheck[para->getID()] - ->getBuf(PARAMETER_GRADIENT) - ->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); - parameterUpdater->update(para); - }; - - parameterUpdater->startPass(); - parameterUpdaterCheck->startPass(); - - for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2; - ++i) { - PassType passType = parameterUpdater->startBatch(actualBatchSize); - gradientMachine->forwardBackward( - inArgs, &outArgs, passType, updateCallback); - parameterUpdater->finishBatch(0); - - parameterUpdaterCheck->startBatch(actualBatchSize); - for (auto& para : parameterCheck) { - parameterUpdaterCheck->update(para.get()); - } - parameterUpdaterCheck->finishBatch(0); - } - - double sum = 0.0f; - for (size_t i = 0; i != parameters.size(); ++i) { - real *v1, *v2; - CpuVector trainerPara(parameters[i]->getSize()); - trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); - if (!FLAGS_use_gpu) { - v1 = parameters[i]->getBuf(PARAMETER_VALUE)->getData(); - } else { - v1 = trainerPara.getData(); - } - v2 = parameterCheck[i]->getBuf(PARAMETER_VALUE)->getData(); - - size_t size = parameters[i]->getSize(); - double diff = 0; - for (size_t j = 0; j < size; ++j) { - diff += fabs(v1[j] - v2[j]); - } - sum += diff; - LOG(INFO) << setiosflags(ios::left) << setfill(' ') << setw(20) - << parameters[i]->getName() << "diff=" << setw(15) << diff; - } - - parameterUpdater->finishPass(); - parameterUpdaterCheck->finishPass(); - gradientMachine->finish(); - return sum; -} - -void checkRemoteParameterUpdaterTest(const string& configFile, - bool useGpu, - bool parallel, - int trainerCount = 1, - bool useOldUpdater = false, - int num_batches_per_get_parameter = 1) { - FLAGS_use_gpu = useGpu; - FLAGS_parallel_nn = parallel; - FLAGS_config = configFile; - FLAGS_trainer_count = trainerCount; - FLAGS_use_old_updater = useOldUpdater; - LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" 
<< trainerCount - << " configFile=" << configFile; - srand(FLAGS_seed); - - if (useGpu) { - if (gNumDevices < trainerCount) { - return; - } - } - - FLAGS_local = 0; - std::shared_ptr pserver; - pserver.reset(new ParameterServer2(std::string(), FLAGS_port)); - pserver->init(); - pserver->start(); - - TrainerForTest trainer; - auto config = TrainerConfigHelper::createFromFlagConfig(); - config->getOptConfig().set_num_batches_per_get_parameter( - num_batches_per_get_parameter); - trainer.init(config); - EXPECT_EQ(checkRemoteParameterUpdater(trainer), 0); - - FLAGS_local = 1; -} - -TEST(checkRemoteUpdater, cpuTrainer) { - checkRemoteParameterUpdaterTest(configFile1, false, false); -} - -TEST(checkRemoteUpdater, cpuTrainerOldUpdater) { - checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true); -} - -#ifdef PADDLE_WITH_CUDA -TEST(checkRemoteUpdater, gpuTrainer) { - checkRemoteParameterUpdaterTest(configFile1, true, false); -} - -TEST(checkRemoteUpdater, gpu2Trainer) { - checkRemoteParameterUpdaterTest(configFile1, true, false, 2); -} - -TEST(checkRemoteUpdater, gpu4Trainer) { - checkRemoteParameterUpdaterTest(configFile1, true, false, 4); -} - -TEST(checkRemoteUpdater, gpuTrainerOldUpdater) { - checkRemoteParameterUpdaterTest(configFile1, true, false, 1, true); -} - -TEST(checkRemoteUpdater, gpu2TrainerOldUpdater) { - checkRemoteParameterUpdaterTest(configFile1, true, false, 2, true); -} - -TEST(checkRemoteUpdater, gpu4TrainerOldUpdater) { - checkRemoteParameterUpdaterTest(configFile1, true, false, 4, true); -} - -#endif - -TEST(checkRemoteUpdater, cpuDeltaTrainer) { - checkRemoteParameterUpdaterTest(configFile1, false, false, 1, false, 10); -} - -TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) { - checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10); -} - -TEST(SgdThreadUpdater, simpleSparseNN) { - trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - initPython(argc, argv); - gNumDevices = hl_get_device_count(); - - FLAGS_num_passes = 1; // train one pass - FLAGS_saving_period = 100000; // do not save parameteres - return RUN_ALL_TESTS(); -} diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf deleted file mode 100644 index 2f86aaa75316fa2a5a28edfef31c01e15a44b3d0..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_config.conf +++ /dev/null @@ -1,77 +0,0 @@ -#edit-mode: -*- python -*- -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -TrainData(SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000, - async_load_data = False)) - -settings(batch_size = 100) - -data = data_layer(name='input', size=3) - -wt = data_layer(name='weight', size=1) - -fc1 = fc_layer(input=data, size=5, - bias_attr=True, - act=SigmoidActivation()) - -fc2 = fc_layer(input=data, size=12, - bias_attr=True, - param_attr=ParamAttr(name='sharew'), - act=LinearActivation()) - -fc3 = fc_layer(input=data, size=3, - bias_attr=True, - act=TanhActivation()) - -fc4 = fc_layer(input=data, size=5, - bias_attr=True, - layer_attr=ExtraAttr(drop_rate=0.5), - act=SquareActivation()) - -pool = img_pool_layer(input=fc2, - pool_size=2, - pool_size_y=3, - num_channels=1, - padding=1, - padding_y=2, - stride=2, - stride_y=3, - pool_type=CudnnAvgPooling()) - -concat = concat_layer(input=[fc3, fc4]) - -with mixed_layer(size=3, act=SoftmaxActivation()) as output: - output += full_matrix_projection(input=fc1) - output += trans_full_matrix_projection(input=fc2, - param_attr=ParamAttr(name='sharew')) - output += full_matrix_projection(input=concat) - output += identity_projection(input=fc3) - -lbl = data_layer(name='label', size=1) - -cost = classification_cost(input=output, label=lbl, weight=wt, - layer_attr=ExtraAttr(device=-1)) - -nce = nce_layer(input=fc2, label=lbl, weight=wt, - num_classes=3, - neg_distribution=[0.1, 0.3, 0.6]) - -outputs(cost, nce) diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp deleted file mode 100644 index a8fbe31c2b1e228107dfc19483444409bfcbf788..0000000000000000000000000000000000000000 --- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include -#include - -#include - -using namespace paddle; // NOLINT -using namespace std; // NOLINT - -static const string& CONFIG_FILE = "trainer/tests/sample_trainer_rnn_gen.conf"; -static const string& NEST_CONFIG_FILE = - "trainer/tests/sample_trainer_nest_rnn_gen.conf"; -static const string& OUTPUT_DIR = "trainer/tests/dump_text.test"; -static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1"; // NOLINT -static string expectFile = // NOLINT - "trainer/tests/rnn_gen_test_model_dir/r1.test"; // NOLINT - -DECLARE_string(config_args); - -vector readRetFile(const string& fname) { - ifstream inFile(fname); - float ret; - vector nums; - while (inFile >> ret) { - nums.push_back(ret); - } - return nums; -} - -void checkOutput(const string& expRetFile) { - vector rets = readRetFile(OUTPUT_DIR); - vector expRets = readRetFile(expRetFile); - EXPECT_EQ(rets.size(), expRets.size()); - for (size_t i = 0; i < rets.size(); i++) { - EXPECT_FLOAT_EQ(rets[i], expRets[i]); - } -} - -void prepareInArgs(vector& inArgs, - const size_t batchSize, - bool useGpu, - bool hasSubseq) { - inArgs.clear(); - // sentence id - Argument sentId; - sentId.value = nullptr; - if (hasSubseq) { - // as there is only one sequence, there is only one label. - IVector::resizeOrCreate(sentId.ids, 1, useGpu); - sentId.ids->setElement(0, 0); - } else { - // as there is batchSize word, there is batchSize label. - IVector::resizeOrCreate(sentId.ids, batchSize, useGpu); - for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i); - } - inArgs.emplace_back(sentId); - - // a dummy layer to decide batch size - Argument dummyInput; - dummyInput.value = Matrix::create(batchSize, 2, false, useGpu); - dummyInput.value->randomizeUniform(); - if (hasSubseq) { - // generate one sequence with batchSize subsequence, - // and each subsequence has only one word. - dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false); - int* buf = dummyInput.sequenceStartPositions->getMutableData(false); - dummyInput.subSequenceStartPositions = - ICpuGpuVector::create(batchSize + 1, false); - int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false); - buf[0] = 0; - buf[1] = batchSize; - for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i; - } - inArgs.emplace_back(dummyInput); -} - -void testGeneration(const string& configFile, - bool useGpu, - bool hasSubseq, - const string& expRetFile) { - FLAGS_use_gpu = useGpu; - auto config = std::make_shared(configFile); - unique_ptr gradientMachine(GradientMachine::create(*config)); - gradientMachine->loadParameters(modelDir); - vector inArgs(2); - - const size_t batchSize = 15; - prepareInArgs(inArgs, batchSize, useGpu, hasSubseq); - vector outArgs; - unique_ptr testEvaluator(gradientMachine->makeEvaluator()); - testEvaluator->start(); - gradientMachine->forward(inArgs, &outArgs, PASS_TEST); - gradientMachine->eval(testEvaluator.get()); - testEvaluator->finish(); - checkOutput(expRetFile); -} - -#ifndef PADDLE_TYPE_DOUBLE - -TEST(RecurrentGradientMachine, test_generation) { -#ifndef PADDLE_WITH_CUDA - const auto useGpuConfs = {false}; -#else - const auto useGpuConfs = {true, false}; -#endif - auto testGen = [&](const string& configFile, - bool hasSubseq, - const string& expRetFile, - bool beam_search) { - FLAGS_config_args = beam_search ? 
"beam_search=1" : "beam_search=0"; - for (auto useGpu : useGpuConfs) { - LOG(INFO) << configFile << " useGpu=" << useGpu - << " beam_search=" << beam_search; - testGeneration(configFile, useGpu, hasSubseq, expRetFile); - } - }; - testGen(CONFIG_FILE, false, expectFile + ".nobeam", false); // no beam search - testGen(CONFIG_FILE, false, expectFile + ".beam", true); // beam search - // In hierarchical RNN, beam search and one way search are only in inner-RNN, - // outer-RNN will concat the generated inner-results (first for beam search) - // from inner-RNN. Thus, they have the same outer-results. - testGen(NEST_CONFIG_FILE, - true, - expectFile + ".nest", - false); // no beam search - testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true); // beam search -} -#endif - -int main(int argc, char** argv) { - initMain(argc, argv); - initPython(argc, argv); - CHECK(argc == 1 || argc == 3); - if (argc == 3) { - modelDir = argv[1]; - expectFile = argv[2]; - } - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp deleted file mode 100644 index 7186feef041eb3b1be459a506294f83f9a00ad94..0000000000000000000000000000000000000000 --- a/paddle/utils/CpuId.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/CpuId.h" -#include "paddle/utils/Util.h" - -#ifdef _WIN32 - -#include - -/// for MSVC -#define CPUID(info, x) __cpuidex(info, x, 0) - -#else - -#if !defined(__arm__) && !defined(__aarch64__) -#include -/// for GCC/Clang -#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3]) -#endif - -#endif - -namespace paddle { - -SIMDFlags::SIMDFlags() { -#if defined(__arm__) || defined(__aarch64__) - simd_flags_ = SIMD_NEON; -#else - unsigned int cpuInfo[4]; - // CPUID: https://en.wikipedia.org/wiki/CPUID - // clang-format off - CPUID(cpuInfo, 0x00000001); - simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE : SIMD_NONE; - simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2 : SIMD_NONE; - simd_flags_ |= cpuInfo[2] & (1 << 0) ? SIMD_SSE3 : SIMD_NONE; - simd_flags_ |= cpuInfo[2] & (1 << 9) ? SIMD_SSSE3 : SIMD_NONE; - simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE; - simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE; - simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3 : SIMD_NONE; - simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX : SIMD_NONE; - - CPUID(cpuInfo, 0x00000007); - simd_flags_ |= cpuInfo[1] & (1 << 5) ? SIMD_AVX2 : SIMD_NONE; - simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE; - - CPUID(cpuInfo, 0x80000001); - simd_flags_ |= cpuInfo[2] & (1 << 16) ? 
SIMD_FMA4 : SIMD_NONE; - // clang-fotmat on -#endif -} - -SIMDFlags const* SIMDFlags::instance() { - static SIMDFlags instance; - return &instance; -} - -} // namespace paddle diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h deleted file mode 100644 index 6f8d7e09309503e47aca7ae2d20774c748703b21..0000000000000000000000000000000000000000 --- a/paddle/utils/PythonUtil.h +++ /dev/null @@ -1,353 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -// clang-format off -#include "paddle/utils/Util.h" - -#ifndef PADDLE_NO_PYTHON -// must include the following two blocks, otherwise, -// gcc compiler may produce warning -#ifdef __APPLE__ -#define _POSIX_SOURCE -#define _POSIX_C_SOURCE 200809L -#define _XOPEN_SOURCE 700 -#endif - -#ifdef _POSIX_C_SOURCE -#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE -#undef _POSIX_C_SOURCE -#endif -#ifdef _XOPEN_SOURCE -#define __TEMP_XOPEN_SOURCE _XOPEN_SOURCE -#undef _XOPEN_SOURCE -#endif -#include -#include -#endif - -#include -#include -#include -// clang-format on - -namespace paddle { - -std::string callPythonFunc(const std::string& moduleName, - const std::string& funcName, - const std::vector& args); - -#ifndef PADDLE_NO_PYTHON - -/** - * Global lock guard of python C-api invokes. - * NOTE: the lock of this guard is reentrant or recursive. - */ -class PyGuard { - public: - PyGuard(); - PyGuard(const PyGuard& other) = delete; - PyGuard& operator=(const PyGuard& other) = delete; - - private: - std::lock_guard guard_; -}; - -struct PyObjectDeleter { - void operator()(PyObject* obj) { - if (obj) { - Py_DECREF(obj); - } - } -}; - -typedef std::unique_ptr PyObjectPtr; - -PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName, - const std::string& funcName, - const std::vector& args); - -PyObjectPtr createPythonClass(const std::string& moduleName, - const std::string& className, - const std::vector& args, - const std::map& kwargs); - -#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() - -namespace py { -PyObjectPtr import(const std::string& moduleName); - -/** - * Cast a PyLong or PyInt to int type T. - * @tparam T return type. - * @param [in] obj PyLong or PyInt object. - * @param [out] ok status for casting. False if error occured. nullptr if user - * don't care is ok or not. - * @return The value of python object, or 0 if not ok. - */ -template -T castInt(PyObject* obj, bool* ok = nullptr) { - if (PyLong_Check(obj)) { - if (ok) *ok = true; - return (T)PyLong_AsUnsignedLong(obj); - } else if (PyInt_Check(obj)) { - if (ok) *ok = true; - return (T)PyInt_AsLong(obj); - } else { - if (ok) *ok = false; - return (T)0; - } -} - -/** - * Invoke repr of python object. - * - * Just like toString method in java. - */ -char* repr(PyObject* obj); - -/** - * Invoke repr of python object. - */ -inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); } - -/** - * Get Python Error Stack String. 
- */ -std::string getPyCallStack(); - -/** - * Object Helper for PyObjectPtr. - * - * Implements getAttr method for object. - */ -class ObjectHelper { - public: - explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {} - - /** - * get attribute - */ - inline PyObject* getAttr(const std::string& field) const { - auto obj = PyObject_GetAttrString(obj_.get(), field.c_str()); - CHECK_PY(obj) << "Cannot get attribute on python object " << obj_.get(); - return obj; - } - - /** - * Get Int attribute - * @param [in] field attribute name. - * @param [out] ok true if this attribute is int. - * @tparam T int type. - * @return int value. - */ - template - T getIntAttr(const std::string& field, bool* ok = nullptr) const { - PyObjectPtr tmp(getAttr(field)); - return castInt(tmp.get(), ok); - } - - /** - * Get int attribute. Log(Fatal) when not ok - * @param field attribute name. - * @return int value. - */ - template - T getIntAttrWithError(const std::string& field) const { - bool ok; - T tmp = getIntAttr(field, &ok); - CHECK(ok) << "Cannot get integer attribute on object " << obj_.get(); - return tmp; - } - - /** - * Get bool attribute. - * @param field - * @param [out] isBoolType return true if attribute is bool type. If the - * attribute is not bool type, then an implicit - * conversion will happens, and will return the - * conversion result. - * - * Such as, if the attribute is 1, then the return - * value of function will be true, but the isBoolType - * will return false. - * @return - */ - bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const { - PyObjectPtr tmp(getAttr(field)); - if (isBoolType) { - *isBoolType = PyBool_Check(tmp.get()); - } - return PyObject_IsTrue(tmp.get()); - } - - private: - const PyObjectPtr& obj_; -}; - -/** - * Python Sequence Helper - * - * The python sequence means list or tuple. - */ -class SequenceHelper { - public: - explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) { - CHECK(PySequence_Check(seq_)); - } - - explicit SequenceHelper(PyObject* seq) : seq_(seq) { - CHECK(PySequence_Check(seq_)); - } - - inline size_t size() const { return (size_t)PySequence_Size(seq_); } - - inline PyObject* operator[](size_t i) const { - return PySequence_Fast_GET_ITEM(seq_, i); - } - - inline double getDouble(size_t i) const { - auto* ptr = (*this)[i]; - return PyFloat_AsDouble(ptr); - } - - /** - * Set a sequence item o[i] = obj; - * @param i index - * @param obj setted item. - * @param steal if steal = true, sequence will move object in iteself, - * just like std::move. Otherwise, it will increase reference - * count. Default is false. 
- */ - inline void set(size_t i, const PyObjectPtr& obj, bool steal = false) { - this->set(i, obj.get(), steal); - } - - /** - * Set a sequence item o[i] = obj; - */ - inline void set(size_t i, PyObject* obj, bool steal = false) { - if (!steal) { - Py_XINCREF(obj); - } - if (PyTuple_Check(seq_)) { - CHECK_NE(PyTuple_SetItem(seq_, i, obj), -1) << getPyCallStack(); - } else { - CHECK_NE(PySequence_SetItem(seq_, i, obj), -1) << getPyCallStack(); - } - } - - private: - PyObject* seq_; -}; - -class DictHelper { - public: - explicit DictHelper(PyObject* d) : dict_(d) {} - - explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {} - - void set(const std::string& key, PyObject* item) { - PyDict_SetItemString(dict_, key.c_str(), item); - } - - void setBool(const std::string& key, bool b) { - this->set(key, PyBool_FromLong(b)); - } - - void setStringList(const std::string& key, - const std::vector& items) { - auto* list = PyList_New(items.size()); - for (size_t i = 0; i < items.size(); ++i) { - PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); - } - this->set(key, list); - } - - private: - inline void checkDict() { CHECK(PyDict_Check(this->dict_)); } - - PyObject* dict_; -}; - -inline static bool isCallable(const PyObjectPtr& obj) { - return PyCallable_Check(obj.get()); -} - -/** - * Wrap a callable object. - */ -class CallableHelper { - public: - explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) { - CHECK(py::isCallable(obj_)); - } - - ~CallableHelper() {} - - /** - * reset args, and create new tuple. - * @param sz args size. - */ - void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); } - - /** - * Get args sequence. User can set/get by SequenceHelper. - */ - SequenceHelper getArgs() { return SequenceHelper(args); } - - /** - * Call python method, return an object. - */ - PyObject* operator()() { - PyGuard guard; - return PyObject_Call(obj_.get(), args.get(), kwargs.get()); - } - - private: - const PyObjectPtr& obj_; - PyObjectPtr args; - PyObjectPtr kwargs; -}; - -inline static PyObject* iterNext(const PyObjectPtr& context, bool* atEnd) { - PyGuard g; - PyObject* data = PyIter_Next(context.get()); - if (data == nullptr) { - if (PyErr_ExceptionMatches(PyExc_StopIteration)) { - PyErr_Clear(); - *atEnd = true; - return nullptr; - } else if (PyErr_Occurred()) { - CHECK_PY(data) << "Calling iterator next error"; - return nullptr; - } else { - *atEnd = false; - return data; // just return none in iterator. - } - } else { - *atEnd = false; - return data; - } -} -} // namespace py - -#endif - -/** - * Initialize python. - */ -void initPython(int argc, char** argv); - -} // namespace paddle diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp deleted file mode 100644 index 409af8bce3621c51bfd7a69c6b4ec1f9cc6be8e4..0000000000000000000000000000000000000000 --- a/paddle/utils/arch/linux/Locks.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/utils/Locks.h" -#include -#include -#include "paddle/utils/Logging.h" - -namespace paddle { -class SemaphorePrivate { - public: - sem_t sem; -}; - -Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { - sem_init(&m->sem, 0, initValue); -} - -Semaphore::~Semaphore() { - sem_destroy(&m->sem); - delete m; -} - -bool Semaphore::timeWait(struct timespec* ts) { - return (0 == sem_timedwait(&m->sem, ts)); -} - -void Semaphore::wait() { sem_wait(&m->sem); } - -void Semaphore::post() { sem_post(&m->sem); } - -/// SpinLockPrivate - -#ifdef PADDLE_USE_PTHREAD_SPINLOCK - -class SpinLockPrivate { - public: - inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } - inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } - - inline void lock() { pthread_spin_lock(&lock_); } - inline void unlock() { pthread_spin_unlock(&lock_); } - - pthread_spinlock_t lock_; - char padding_[64 - sizeof(pthread_spinlock_t)]; -}; - -#else -// clang-format off -#include -#include -// clang-format on - -class SpinLockPrivate { - public: - inline void lock() { - while (lock_.test_and_set(std::memory_order_acquire)) { - } - } - inline void unlock() { lock_.clear(std::memory_order_release); } - - std::atomic_flag lock_ = ATOMIC_FLAG_INIT; - char padding_[64 - sizeof(lock_)]; // Padding to cache line size -}; - -#endif - -SpinLock::SpinLock() : m(new SpinLockPrivate()) {} -SpinLock::~SpinLock() { delete m; } -void SpinLock::lock() { m->lock(); } -void SpinLock::unlock() { m->unlock(); } - -/// ThreadBarrierPrivate - -#ifdef PADDLE_USE_PTHREAD_BARRIER - -class ThreadBarrierPrivate { - public: - pthread_barrier_t barrier_; - - inline explicit ThreadBarrierPrivate(int count) { - pthread_barrier_init(&barrier_, nullptr, count); - } - - inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); } - - inline void wait() { pthread_barrier_wait(&barrier_); } -}; - -#else - -class ThreadBarrierPrivate { - public: - pthread_mutex_t mutex_; - pthread_cond_t cond_; - int count_; - int tripCount_; - - inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { - CHECK_NE(cnt, 0); - CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); - CHECK_GE(pthread_cond_init(&cond_, 0), 0); - } - - inline ~ThreadBarrierPrivate() { - pthread_cond_destroy(&cond_); - pthread_mutex_destroy(&mutex_); - } - - /** - * @brief wait - * @return true if the last wait - */ - inline bool wait() { - pthread_mutex_lock(&mutex_); - ++count_; - if (count_ >= tripCount_) { - count_ = 0; - pthread_cond_broadcast(&cond_); - pthread_mutex_unlock(&mutex_); - return true; - } else { - pthread_cond_wait(&cond_, &mutex_); - pthread_mutex_unlock(&mutex_); - return false; - } - } -}; - -#endif - -/// ThreadBarrier - -ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} -ThreadBarrier::~ThreadBarrier() { delete m; } -void ThreadBarrier::wait() { m->wait(); } - -} // namespace paddle diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp deleted file mode 100644 index ac444615786fa9f89f96504a31b2289eae7bb643..0000000000000000000000000000000000000000 --- a/paddle/utils/arch/osx/Excepts.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Excepts.h" - -#if defined(__APPLE__) || defined(__OSX__) -#if defined(__arm__) || defined(__arm64__) -// TODO(liuyiqun): implement the arm version -int fegetexcept(void) { return -1; } -int feenableexcept(unsigned int excepts) { return -1; } -int fedisableexcept(unsigned int excepts) { return -1; } -#else -int fegetexcept(void) { - static fenv_t fenv; - return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT); -} - -int feenableexcept(unsigned int excepts) { - static fenv_t fenv; - unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; - - if (fegetenv(&fenv)) return -1; - old_excepts = fenv.__control & FE_ALL_EXCEPT; - - // unmask - fenv.__control &= ~new_excepts; - fenv.__mxcsr &= ~(new_excepts << 7); - - return (fesetenv(&fenv) ? -1 : old_excepts); -} - -int fedisableexcept(unsigned int excepts) { - static fenv_t fenv; - unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; - - if (fegetenv(&fenv)) return -1; - old_excepts = fenv.__control & FE_ALL_EXCEPT; - - // mask - fenv.__control |= new_excepts; - fenv.__mxcsr |= new_excepts << 7; - - return (fesetenv(&fenv) ? -1 : old_excepts); -} -#endif -#endif diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp deleted file mode 100644 index f3905091bd024ab02c3f5d39cfed6dbc38fabbbc..0000000000000000000000000000000000000000 --- a/paddle/utils/arch/osx/Locks.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/utils/Locks.h" -#include -#include -#include -#include "paddle/utils/Logging.h" - -namespace paddle { - -class SemaphorePrivate { - public: - ~SemaphorePrivate() { dispatch_release(sem); } - - dispatch_semaphore_t sem; -}; - -Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { - m->sem = dispatch_semaphore_create(initValue); -} - -Semaphore::~Semaphore() { delete m; } - -bool Semaphore::timeWait(timespec *ts) { - dispatch_time_t tm = dispatch_walltime(ts, 0); - return (0 == dispatch_semaphore_wait(m->sem, tm)); -} - -void Semaphore::wait() { - dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER); -} - -void Semaphore::post() { dispatch_semaphore_signal(m->sem); } - -class SpinLockPrivate { - public: - std::atomic_flag lock_ = ATOMIC_FLAG_INIT; - char padding_[64 - sizeof(lock_)]; // Padding to cache line size -}; - -SpinLock::SpinLock() : m(new SpinLockPrivate()) {} -SpinLock::~SpinLock() { delete m; } - -void SpinLock::lock() { - while (m->lock_.test_and_set(std::memory_order_acquire)) { - } -} - -void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); } - -class ThreadBarrierPrivate { - public: - pthread_mutex_t mutex_; - pthread_cond_t cond_; - int count_; - int tripCount_; - - inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { - CHECK_NE(cnt, 0); - CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); - CHECK_GE(pthread_cond_init(&cond_, 0), 0); - } - - inline ~ThreadBarrierPrivate() { - pthread_cond_destroy(&cond_); - pthread_mutex_destroy(&mutex_); - } - - /** - * @brief wait - * @return true if the last wait - */ - inline bool wait() { - pthread_mutex_lock(&mutex_); - ++count_; - if (count_ >= tripCount_) { - count_ = 0; - pthread_cond_broadcast(&cond_); - pthread_mutex_unlock(&mutex_); - return true; - } else { - pthread_cond_wait(&cond_, &mutex_); - pthread_mutex_unlock(&mutex_); - return false; - } - } -}; - -ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} -ThreadBarrier::~ThreadBarrier() { delete m; } -void ThreadBarrier::wait() { m->wait(); } - -} // namespace paddle diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt deleted file mode 100644 index c770ce169878d9998e559b1d417fc1acc88cde97..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -add_simple_unittest(test_Thread) -add_simple_unittest(test_StringUtils) -add_simple_unittest(test_CustomStackTrace) -add_simple_unittest(test_ThreadBarrier) -add_simple_unittest(test_SpinLock) -add_simple_unittest(test_SIMDFlags) -add_simple_unittest(test_Error) - -add_executable( - test_CustomStackTracePrint - test_CustomStackTracePrint.cpp -) -link_paddle_exe(test_CustomStackTracePrint) -if(NOT APPLE) - add_test(NAME test_CustomStackTracePrint - COMMAND ${PADDLE_SOURCE_DIR}/paddle/utils/tests/test_CustomStackTracePrint.sh - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -endif() diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp deleted file mode 100644 index 4d5540b24cb9d52482cfa5a77dfa956b8bf4ef38..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT -#include // NOLINT - -#include "paddle/utils/CustomStackTrace.h" -#include "paddle/utils/Locks.h" -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -DEFINE_int32(test_thread_num, 10, "testing thread number"); - -void testNormalImpl( - const std::function&, - size_t, - size_t, - paddle::ThreadBarrier&, - paddle::ThreadBarrier&)>& callback) { - paddle::CustomStackTrace tracer; - paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1); - paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1); - constexpr size_t countDown = 10; - constexpr size_t layerSize = 1000; - std::vector> threads; - threads.reserve(FLAGS_test_thread_num); - - for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) { - threads.emplace_back( - new std::thread([&tracer, &startBarrier, &doneBarrier, &callback] { - callback(tracer, countDown, layerSize, startBarrier, doneBarrier); - })); - } - size_t cntDown = countDown; - while (cntDown-- > 0) { - startBarrier.wait(); - sleep(1); - doneBarrier.wait(); - ASSERT_TRUE(tracer.empty()); - } - - for (auto& thread : threads) { - thread->join(); - } -} - -TEST(CustomStackTrace, normalTrain) { - testNormalImpl([](paddle::CustomStackTrace& tracer, - size_t countDown, - size_t layerSize, - paddle::ThreadBarrier& start, - paddle::ThreadBarrier& finish) { - while (countDown-- > 0) { - start.wait(); - for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + paddle::str::to_string(i)); - } - for (size_t i = 0; i < layerSize; ++i) { - tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); - } - finish.wait(); - } - }); -} - -TEST(CustomStackTrace, normalTest) { - testNormalImpl([](paddle::CustomStackTrace& tracer, - size_t countDown, - size_t layerSize, - paddle::ThreadBarrier& start, - paddle::ThreadBarrier& finish) { - while (countDown-- > 0) { - start.wait(); - for (size_t i = 0; i < layerSize; ++i) { - tracer.push("layer_" + paddle::str::to_string(i)); - } - tracer.clear(); // in forward test, tracer will clear after forward. - finish.wait(); - } - }); -} diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp deleted file mode 100644 index 360c61c88a757da708b01d2bb54068b948b235cc..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/utils/CustomStackTrace.h" -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -int main(int argc, char** argv) { - paddle::initMain(argc, argv); - - for (size_t i = 0; i < 1000; ++i) { - paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i)); - if (i == 998) { - throw "Unhandle exception"; - } - } - - return 0; -} diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp deleted file mode 100644 index 6f311fa6b80191de1e11ce1f63c31b64fe2eeb80..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_Error.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/Error.h" - -#include - -TEST(Error, testAll) { - paddle::Error error; - ASSERT_TRUE(error.isOK()); - error = paddle::Error("I'm the error"); - ASSERT_FALSE(error.isOK()); - ASSERT_STREQ("I'm the error", error.msg()); - - error = paddle::Error("error2"); - ASSERT_FALSE(error.isOK()); - ASSERT_STREQ("error2", error.msg()); - - int i = 3; - auto error3 = paddle::Error("error%d", i); - ASSERT_FALSE(error3.isOK()); - ASSERT_STREQ("error3", error3.msg()); -} diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp deleted file mode 100644 index a808d456a69866f72502bcf1ae244cec14738e22..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_SIMDFlags.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/utils/CpuId.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" - -using namespace paddle; // NOLINT - -TEST(SIMDFlags, gccTest) { -#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \ - !defined(__arm__) && !defined(__aarch64__) - // clang-format off - CHECK(!__builtin_cpu_supports("sse") != HAS_SSE); - CHECK(!__builtin_cpu_supports("sse2") != HAS_SSE2); - CHECK(!__builtin_cpu_supports("sse3") != HAS_SSE3); - CHECK(!__builtin_cpu_supports("ssse3") != HAS_SSSE3); - CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41); - CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42); - CHECK(!__builtin_cpu_supports("avx") != HAS_AVX); - CHECK(!__builtin_cpu_supports("avx2") != HAS_AVX2); -// clang-format on -#endif -} - -TEST(SIMDFlags, normalPrint) { - LOG(INFO) << "Has SSE: " << std::boolalpha << HAS_SSE; - LOG(INFO) << "Has SSE2: " << std::boolalpha << HAS_SSE2; - LOG(INFO) << "Has SSE3: " << std::boolalpha << HAS_SSE3; - LOG(INFO) << "Has SSSE3: " << std::boolalpha << HAS_SSSE3; - LOG(INFO) << "Has SSE4: " << std::boolalpha << HAS_SSE41 || HAS_SSE42; - LOG(INFO) << "Has FMA3: " << std::boolalpha << HAS_FMA3; - LOG(INFO) << "Has FMA4: " << std::boolalpha << HAS_FMA4; - LOG(INFO) << "Has AVX: " << std::boolalpha << HAS_AVX; - LOG(INFO) << "Has AVX2: " << std::boolalpha << HAS_AVX2; - LOG(INFO) << "Has AVX512: " << std::boolalpha << HAS_AVX512; - LOG(INFO) << "Has NEON: " << std::boolalpha << HAS_NEON; -} diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp deleted file mode 100644 index cc34eb1f868003d3db9221578c0c20c44be285eb..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_SpinLock.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include -#include - -#include "paddle/utils/Locks.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" - -DEFINE_int32(test_thread_num, 100, "testing thread number"); - -void testNormalImpl( - size_t thread_num, - const std::function& callback) { - paddle::SpinLock mutex; - std::vector threads; - threads.reserve(thread_num); - - size_t count = 0; - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back([&thread_num, &count, &mutex, &callback] { - callback(thread_num, count, mutex); - }); - } - for (auto& thread : threads) { - thread.join(); - } - // Check whether all threads reach this point or not - CHECK_EQ(count, thread_num); -} - -TEST(ThreadSpinLock, normalTest) { - for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { - testNormalImpl( - thread_num, - [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) { - std::lock_guard lock(mutex); - ++count; - }); - } -} diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp deleted file mode 100644 index 248f58a7f26e26e82b55110930964cee04fb558b..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_StringUtils.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/utils/StringUtil.h" - -#include - -TEST(StringUtil, to) { - ASSERT_NEAR(paddle::str::to("12.45"), 12.45, 1e-5); - ASSERT_DEATH_IF_SUPPORTED(paddle::str::to("12.45x23"), ".*"); - ASSERT_DEATH_IF_SUPPORTED(paddle::str::to(""), ".*"); -} diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp deleted file mode 100644 index 6e2580c4913f0adc7ba1e63c9cebce308775aac6..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_Thread.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -using paddle::AsyncThreadPool; // NOLINT - -TEST(AsyncThreadPool, addJob) { - AsyncThreadPool pool(8); - auto a = pool.addJob([] { return 1; }); - auto b = pool.addJob([] { return true; }); - auto c = pool.addJob([] { return false; }); - - ASSERT_EQ(a.get(), 1); - ASSERT_TRUE(b.get()); - ASSERT_FALSE(c.get()); -} - -TEST(AsyncThreadPool, addBatchJob) { - AsyncThreadPool pool(8); - std::atomic counter{0}; - - std::vector jobs; - - for (int i = 0; i < 10000; i++) { - jobs.emplace_back([&] { counter++; }); - } - - pool.addBatchJobs(jobs); - - ASSERT_EQ(counter, 10000); -} - -TEST(AsyncThreadPool, multiThreadAddBatchJob) { - AsyncThreadPool levelOnePool(200); - AsyncThreadPool levelTwoPool(200); - - std::shared_ptr mut = std::make_shared(); - int counter = 0; - const int numMonitors = 300; - const int numSlaves = 300; - std::vector moniterJobs(numMonitors, [&] { - std::vector slaveJobs(numSlaves, [mut, &counter] { - std::lock_guard lk(*mut); - counter++; - }); - levelTwoPool.addBatchJobs(slaveJobs); - }); - levelOnePool.addBatchJobs(moniterJobs); - ASSERT_EQ(counter, numMonitors * numSlaves); -} - -TEST(AsyncThreadPool, addBatchJobWithResults) { - AsyncThreadPool pool(100); - - std::vector> jobs; - const int numJobs = 100; - for (int i = 0; i < numJobs; i++) { - jobs.emplace_back([i] { return i; }); - } - - std::vector res; - pool.addBatchJobs(jobs, res); - - for (int i = 0; i < numJobs; i++) { - ASSERT_EQ(res[i], i); - } -} diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp deleted file mode 100644 index 554b1c1d4adce7a0196b304281dcf878a0b6426e..0000000000000000000000000000000000000000 --- a/paddle/utils/tests/test_ThreadBarrier.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include -#include - -#include "paddle/utils/Locks.h" -#include "paddle/utils/Logging.h" -#include "paddle/utils/Util.h" - -DEFINE_int32(test_thread_num, 100, "testing thread number"); - -void testNormalImpl( - size_t thread_num, - const std::function&, - paddle::ThreadBarrier&)>& callback) { - std::mutex mutex; - std::set tids; - paddle::ThreadBarrier barrier(thread_num); - - std::vector threads; - threads.reserve(thread_num); - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] { - callback(thread_num, mutex, tids, barrier); - }); - } - - for (auto& thread : threads) { - thread.join(); - } -} - -TEST(ThreadBarrier, normalTest) { - for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { - testNormalImpl(thread_num, - [](size_t thread_num, - std::mutex& mutex, - std::set& tids, - paddle::ThreadBarrier& barrier) { - { - std::lock_guard guard(mutex); - tids.insert(std::this_thread::get_id()); - } - barrier.wait(); - // Check whether all threads reach this point or not - CHECK_EQ(tids.size(), thread_num); - }); - } -} diff --git a/proto/README.md b/proto/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dda7ed7b3c8ea4b541eaafbd0fd239eea789b40e --- /dev/null +++ b/proto/README.md @@ -0,0 +1,3 @@ +## protos in this folder are legacy v2 protos. + +## Please refer to paddle/fluid for latest version. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ea25f3ab351ca1feb085a8fbbfe53d8cee397bbf..797c0fbcc4a2d61f5cbbf691db19b4cba5d38630 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,4 +1,4 @@ -file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) +file(GLOB UTILS_PY_FILES . ./paddle/legacy/utils/*.py) file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py) set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 6a1b8b5fac223c0d134cae69a61a0c2c00bc1feb..9d05aeeb95c4f936cb773ece20407ecb32cbbf21 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -111,7 +111,7 @@ def fetch(): paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) def convert(path): diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index bd985ad733aa8eece2f8374d033f452a0175a011..3034c1a0875a71421bcba172c16ee32d809df152 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -44,7 +44,7 @@ import metrics import transpiler from param_attr import ParamAttr, WeightNormParamAttr from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace +from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from transpiler import DistributeTranspiler, InferenceTranspiler, \ memory_optimize, release_memory from concurrency import (Go, make_channel, channel_send, channel_recv, @@ -83,6 +83,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ 'profiler', 'unique_name', 'recordio_writer', + 'Scope', ] @@ -117,7 +118,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn' + 
'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'init_allocated_mem' ] if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index 6abe8233b07c484494848c566e9898600a7d8f5c..358e24df31bb517604481bb48b9180e579f8460d 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -36,6 +36,25 @@ def _is_number_or_matrix_(var): class WeightedAverage(object): + """ + Calculate weighted average. + + The average calculating is accomplished via Python totally. + They do not change Paddle's Program, nor do anything to + modify NN model's configuration. They are completely + wrappers of Python functions. + + Examples: + .. code-block:: python + avg = fluid.average.WeightedAverage() + avg.add(value=2.0, weight=1) + avg.add(value=4.0, weight=2) + avg.eval() + + # The result is 3.333333333. + # For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333 + """ + def __init__(self): warnings.warn( "The %s is deprecated, please use fluid.metrics.Accuracy instead." % diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 4f9622d04dc98f41b503ceb780802d2a4e4c58a0..4faa06303170488d0de2fda4c1461cfe2d623d35 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs): for idx, op_desc in enumerate(op_descs): for var_name in op_desc.input_arg_names(): if len(renamed_vars[var_name]) > 1: - pending_sum_ops.append( - (_create_op_desc_("sum", {"X": renamed_vars[var_name]}, - {"Out": [var_name]}, {}), idx)) + pending_sum_ops.append((_create_op_desc_( + "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, + {"use_mkldnn": False}), idx)) renamed_vars[var_name] = [var_name] for var_name in op_desc.output_arg_names(): if var_name == core.empty_var_name( @@ -147,7 +147,7 @@ def _addup_repetitive_outputs_(op_descs): else: if len(renamed_vars[var_name]) == 1: new_name = var_name + "@RENAME@" + \ - str(var_rename_count[var_name]) + str(var_rename_count[var_name]) var_rename_count[var_name] += 1 # rename original var_name renamed_vars[var_name][0] = new_name @@ -155,14 +155,15 @@ def _addup_repetitive_outputs_(op_descs): _rename_arg_(pending_sum_ops, var_name, new_name) new_name = var_name + "@RENAME@" + \ - str(var_rename_count[var_name]) + str(var_rename_count[var_name]) var_rename_count[var_name] += 1 op_desc.rename_output(var_name, new_name) renamed_vars[var_name].append(new_name) for var_name, inputs in renamed_vars.iteritems(): if len(inputs) > 1: - pending_sum_ops.append((_create_op_desc_( - "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs))) + pending_sum_ops.append( + (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, + {"use_mkldnn": False}), len(op_descs))) # sum_op descs are sorted according to their insert position for p in reversed(pending_sum_ops): op_descs.insert(p[1], p[0]) @@ -434,18 +435,65 @@ def _get_stop_gradients_(program): def append_backward(loss, parameter_list=None, no_grad_set=None, callbacks=None): """ - Append backward part to main_program + Append backward part to main_program. - Args: - loss(Variable): The variable generated by cost function. - parameter_list(list[string]): Parameters that need to be updated by - optimizer. If None, it means all parameters need to be updated. - no_grad_set(set): Variables that have no gradients in Block 0. - All variables with `step_gradient=True` from all blocks will be - automatically added. 
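# Illustrative sketch (not part of this patch): the flags listed in
# read_env_flags above are picked up from FLAGS_-prefixed environment
# variables when paddle.fluid is imported, so they have to be exported
# before the import. The values below are made-up examples.
import os
os.environ['FLAGS_initial_cpu_memory_in_mb'] = '500'   # assumed example value
os.environ['FLAGS_init_allocated_mem'] = 'true'        # assumed example value
import paddle.fluid as fluid  # __bootstrap__() reads the flags at import time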
+ A complete neural network training is made up of forward and backward + propagation. However, when we configure a network, we only need to + specify its forward part. The backward part is generated automatically + according to the forward part by this function. - Return: - (list[(Variable,Variable)]): list of (parameter, gradient) pair. + In most cases, users do not need to invoke this function manually. It + will be automatically invoked by the optimizer's `minimize` function. + + Args: + loss(Variable): The loss variable of the network. + parameter_list(list[string]|None): Names of parameters that need + to be updated by optimizers. + If it is None, all parameters + will be updated. + Default: None + no_grad_set(set|None): Variables in the Block 0 whose gradients + should be ignored. All variables with + `step_gradient=True` from all blocks will + be automatically added into this set. + Default: None + callbacks(list[callable object]|None): The callbacks are used for + doing some custom jobs during + backward part building. All + callable objects in it will + be invoked once each time a + new gradient operator is added + into the program. The callable + object must have two input + parameters: 'block' and 'context'. + The 'block' is the block which + the new gradient operator will + be added to. The 'context' is a + map, whose keys are gradient + variable names and values are + corresponding original variables. + In addition to this, the 'context' + has another special key-value pair: + the key is string '__current_op_desc__' + and the value is the op_desc of the + gradient operator that has just + triggered the callable object. + + Returns: + list[(Variable,Variable)]: Pairs of parameter and its + corresponding gradient. The first element of each pair is the + parameter and the second element is its gradient variable. + + Raises: + AssertionError: If `loss` is not an instance of Variable. + + Examples: + .. code-block:: python + + # network configuration code + # ... + avg_loss = fluid.layers.mean(loss) + param_grad_list = fluid.backward.append_backward(loss=avg_loss) """ assert isinstance(loss, framework.Variable) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 66c3fc6b66d61bc9578f84594409ad0f24c99910..18e2f3045e272fb4712391f87bffd3f367c1c744 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -24,8 +24,6 @@ __all__ = [ 'GradientClipByValue', 'GradientClipByNorm', 'GradientClipByGlobalNorm', - 'append_gradient_clip_ops', - 'error_clip_callback', ] @@ -38,6 +36,25 @@ class BaseErrorClipAttr(object): class ErrorClipByValue(BaseErrorClipAttr): + """ + Clips tensor values to the range [min, max]. + + Given a tensor t, this operation clips its value to min and max inplace. + + - Any values less than min are set to min. + - Any values greater than max are set to max. + + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, \ + will be set to -max by framework. + + Examples: + .. code-block:: python + + var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...) + """ + def __init__(self, max, min=None): max = float(max) if min is None: @@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr): class GradientClipByValue(BaseGradientClipAttr): + """ + Clips gradient values to the range [min, max]. + + Given a tensor t, this operation clips its value to min and max inplace. + + - Any values less than min are set to min. + - Any values greater than max are set to max.
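# A minimal sketch of the append_backward() usage documented above, assuming a
# toy regression network; the layer names and sizes are made up.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(cost)

# Explicit call; optimizer.minimize() would normally invoke this for the user.
param_grads = fluid.backward.append_backward(loss=avg_loss)
for param, grad in param_grads:
    print("%s -> %s" % (param.name, grad.name))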
+ + Args: + max (float): The maximum value to clip by. + min (float, optional): The minimum value to clip by. if not set by user, \ + will be set to -max by framework. + + Examples: + .. code-block:: python + + w_param_attrs = ParamAttr(name=None, + initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + learning_rate=1.0, + regularizer=L1Decay(1.0), + trainable=True, + clip=GradientClipByValue(-1.0, 1.0)) + y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) + """ + def __init__(self, max, min=None): max = float(max) if min is None: @@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr): class GradientClipByNorm(BaseGradientClipAttr): + """ + Clips tensor values to a maximum L2-norm. + + This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`. + If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out` + will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than + :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of + :math:`Out` equal to :math:`max\_norm`, as shown in the following formula: + + .. math:: + + Out = \\frac{max\_norm * X}{norm(X)}, + + where :math:`norm(X)` represents the L2 norm of :math:`X`. + + Args: + clip_norm (float): The maximum norm value + + Examples: + .. code-block:: python + + w_param_attrs = ParamAttr(name=None, + initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + learning_rate=1.0, + regularizer=L1Decay(1.0), + trainable=True, + clip=GradientClipByNorm(clip_norm=2.0)) + y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) + + """ + def __init__(self, clip_norm): self.clip_norm = clip_norm @@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr): class GradientClipByGlobalNorm(BaseGradientClipAttr): + """ + Clips values of multiple tensors by the ratio of the sum of their norms. + + Given a list of tensors t_list, and a clipping ratio clip_norm, this + operation returns a list of clipped tensors list_clipped and the global + norm (global_norm) of all tensors in t_list. + + To perform the clipping, the values :math:`t\_list[i]` are set to: + + .. math:: + + t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)} + + where: + + .. math:: + + global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + + If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are, + otherwise they're all shrunk by the global ratio. + + Args: + clip_norm (float): The maximum norm value + group_name (str, optional): The group name for this clip. + + Examples: + .. code-block:: python + + p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) + + with fluid.program_guard(main_program=prog_clip): + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0)) + p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) + + """ + def __init__(self, clip_norm, group_name="default_group"): if not isinstance(group_name, basestring): raise TypeError("'group_name' must be a basestring.") @@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): def set_gradient_clip(clip, param_list=None, program=None): """ - To specify parameters that require gradient clip. - Args: - clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, - which describes the type and detailed attributes of required gradient clip. - param_list(list, None by default): Parameters that require gradient clip. 
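# A hedged sketch of the two ways to attach the clip attributes documented
# above: per parameter through ParamAttr, or program-wide through
# set_gradient_clip(). Names and sizes below are made up.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')

# 1) per-parameter clipping
w_attr = fluid.ParamAttr(
    name='fc_w',
    clip=fluid.clip.GradientClipByNorm(clip_norm=2.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_attr)

# 2) global-norm clipping applied to all trainable parameters
fluid.clip.set_gradient_clip(
    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))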
- It can be a list of parameter or a list of parameter's name. - When it's None, all parameters in the program will be included. - program(Program, None by default): The program where parameters are. - Will be the default main program when assigned with None. + To specify parameters that require gradient clip. + + Args: + clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, + which describes the type and detailed attributes of required gradient clip. + param_list(list(Variable)): Parameters that require gradient clip. + It can be a list of parameter or a list of parameter's name. + When it's None, all parameters in the program will be included. + program(Program): The program where parameters are. + Will be the default main program when assigned with None. """ if not isinstance(clip, BaseGradientClipAttr): raise TypeError( diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index e2013137b14f73bb0fcfb57b4bdc35fcc043bdc0..c859778b3757f638ac531620f241e684522add57 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -29,6 +29,13 @@ class DataToLoDTensorConverter(object): self.place = place self.lod_level = lod_level self.shape = shape + negtive_count = 0 + for s in self.shape: + if s < 0: + negtive_count += 1 + if negtive_count > 1: + self.shape = None + break if dtype == core.VarDesc.VarType.FP32: self.dtype = 'float32' elif dtype == core.VarDesc.VarType.INT64: @@ -47,7 +54,7 @@ class DataToLoDTensorConverter(object): self.lod = [] for i in six.range(lod_level): - self.lod.append([0]) + self.lod.append([]) def feed(self, data): self._feed_impl_(data, self.lod, self.lod_level) @@ -56,21 +63,77 @@ class DataToLoDTensorConverter(object): if lod_level == 0: self.data.append(data) else: - cur_lod_len = len(data) - lod[0].append(lod[0][-1] + cur_lod_len) + lod[0].append(len(data)) for each_data in data: self._feed_impl_(each_data, lod[1:], lod_level - 1) def done(self): - arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) + arr = numpy.array(self.data, dtype=self.dtype) + if self.shape: + arr = arr.reshape(self.shape) t = core.LoDTensor() t.set(arr, self.place) if self.lod_level > 0: - t.set_lod(self.lod) + t.set_recursive_sequence_lengths(self.lod) return t class DataFeeder(object): + """ + DataFeeder converts the data that returned by a reader into a data + structure that can feed into Executor and ParallelExecutor. The reader + usually returns a list of mini-batch data entries. Each data entry in + the list is one sample. Each sample is a list or a tuple with one + feature or multiple features. + + The simple usage shows below: + + .. code-block:: python + + place = fluid.CPUPlace() + img = fluid.layers.data(name='image', shape=[1, 28, 28]) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) + result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])]) + + + If you want to feed data into GPU side separately in advance when you + use multi-GPU to train a model, you can use `decorate_reader` function. + + .. code-block:: python + + place=fluid.CUDAPlace(0) + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + reader = feeder.decorate_reader( + paddle.batch(flowers.train(), batch_size=16)) + + Args: + feed_list(list): The Variables or Variables'name that will + feed into model. 
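# A small sketch of the LoD change made in DataToLoDTensorConverter above: the
# converter now records per-sequence lengths and calls
# set_recursive_sequence_lengths() instead of building offset-based LoD.
# The data below is a made-up example.
import numpy
import paddle.fluid.core as core

t = core.LoDTensor()
t.set(numpy.arange(5).reshape([5, 1]).astype('int64'), core.CPUPlace())
# Two sequences, of lengths 2 and 3; equivalent to the old offset LoD [[0, 2, 5]].
t.set_recursive_sequence_lengths([[2, 3]])
print(t.recursive_sequence_lengths())   # [[2, 3]]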
+ place(Place): place indicates feed data into CPU or GPU, if you want to + feed data into GPU, please use `fluid.CUDAPlace(i)` (`i` represents + the GPU id), or if you want to feed data into CPU, please use + `fluid.CPUPlace()`. + program(Program): The Program that the data will be fed into, if it + is None, it will use default_main_program(). Default None. + + Raises: + ValueError: If some Variable is not in this Program. + + Examples: + .. code-block:: python + + # ... + place = fluid.CPUPlace() + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_vars_name + ] # feed_vars_name is a list of variables' names. + feeder = fluid.DataFeeder(feed_list, place) + for data in reader(): + outs = exe.run(program=main_program, + feed=feeder.feed(data)) + """ + def __init__(self, feed_list, place, program=None): self.feed_dtypes = [] self.feed_names = [] @@ -100,6 +163,16 @@ class DataFeeder(object): self.place = place def feed(self, iterable): + """ + According to feed_list and iterable, converts the input into + a data structure that can be fed into Executor and ParallelExecutor. + + Args: + iterable(list|tuple): the input data. + + Returns: + dict: the result of conversion. + """ converter = [] for lod_level, shape, dtype in six.zip( self.feed_lod_level, self.feed_shapes, self.feed_dtypes): @@ -122,6 +195,20 @@ class DataFeeder(object): return ret_dict def feed_parallel(self, iterable, num_places=None): + """ + Takes multiple mini-batches. Each mini-batch will be fed to one + device in advance. + + Args: + iterable(list|tuple): the input data. + num_places(int): the number of devices. Default None. + + Returns: + dict: the result of conversion. + + Notes: + The number of devices and the number of mini-batches must be the same. + """ if isinstance(self.place, core.CUDAPlace): places = [ core.CUDAPlace(i) @@ -160,6 +247,24 @@ class DataFeeder(object): multi_devices, num_places=None, drop_last=True): + """ + Convert the data returned by reader into multiple mini-batches, so + that each mini-batch can be fed to one device. + + Args: + reader(fun): the reader that yields the input data. + multi_devices(bool): whether to feed the data into multiple devices. + num_places(int): the number of places. Default None. + drop_last(bool): whether to drop the last incomplete batch that + cannot fill all devices. Default True. + + Returns: + the decorated reader which yields data for multiple devices. + + Raises: + ValueError: If drop_last is False and the last data batch cannot + fit all the devices. + """ + def __reader_creator__(): if not multi_devices: for item in reader(): diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 7c6ad6f27dcfd7040f79c72c01413c8cc84a28ba..00ba1a0457583d1cc1fa7136ebd51e9ced167832 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -41,7 +41,12 @@ def _clone_var_(block, var): class Evaluator(object): """ - Base Class for all evaluators + Warning: it is better to use fluid.metrics.* instead, which offers + more flexible support via pure Python and operators and is decoupled + from the executor. This short doc is intended to urge new users to + start from Metrics. + + Base Class for all evaluators. Args: name(str): The name of evaluator. such as, "accuracy".
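# A hedged sketch of the DataFeeder.decorate_reader() usage documented above,
# assuming the flowers dataset and two visible GPUs.
import paddle
import paddle.fluid as fluid
import paddle.dataset.flowers as flowers

place = fluid.CUDAPlace(0)
data = fluid.layers.data(name='pixel', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
reader = feeder.decorate_reader(
    paddle.batch(flowers.train(), batch_size=16),
    multi_devices=True,
    num_places=2,      # assumed number of devices
    drop_last=True)    # drop the last batch if it cannot fill both devices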
Used for generate @@ -69,6 +74,10 @@ class Evaluator(object): def reset(self, executor, reset_program=None): """ reset metric states at the begin of each pass/user specified batch + + Args: + executor(Executor|ParallelExecutor): a executor for executing the reset_program + reset_program(Program): a single Program for reset process """ if reset_program is None: reset_program = Program() @@ -85,15 +94,16 @@ class Evaluator(object): def eval(self, executor, eval_program=None): """ Evaluate the statistics merged by multiple mini-batches. + Args: + executor(Executor|ParallelExecutor): a executor for executing the eval_program + eval_program(Program): a single Program for eval process """ raise NotImplementedError() - def create_state(self, suffix, dtype, shape): + def _create_state(self, suffix, dtype, shape): """ Create state variable. - NOTE: It is not a public API. - Args: suffix(str): the state suffix. dtype(str|core.VarDesc.VarType): the state data type @@ -113,9 +123,35 @@ class Evaluator(object): class ChunkEvaluator(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator + instead. + Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + + Args: + input (Variable): prediction output of the network. + label (Variable): label of the test data set. + chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details. + num_chunk_types (int): the number of chunk type. + excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted. + + Returns: + tuple: tuple containing: precision, recall, f1_score + + Examples: + .. code-block:: python + + exe = fluid.executor(place) + evaluator = fluid.Evaluator.ChunkEvaluator(input, label) + for epoch in PASS_NUM: + evaluator.reset(exe) + for data in batches: + loss = exe.run(fetch_list=[cost]) + distance, instance_error = distance_evaluator.eval(exe) """ def __init__( @@ -130,11 +166,11 @@ class ChunkEvaluator(Evaluator): if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") - self.num_infer_chunks = self.create_state( + self.num_infer_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_infer_chunks') - self.num_label_chunks = self.create_state( + self.num_label_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_label_chunks') - self.num_correct_chunks = self.create_state( + self.num_correct_chunks = self._create_state( dtype='int64', shape=[1], suffix='num_correct_chunks') precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( input=input, @@ -178,6 +214,8 @@ class ChunkEvaluator(Evaluator): class EditDistance(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance + instead. Accumulate edit distance sum and sequence number from mini-batches and compute the average edit_distance and instance error of all batches. @@ -188,15 +226,16 @@ class EditDistance(Evaluator): ignored_tokens(list of int): Tokens that should be removed before calculating edit distance. - Example: + Examples: + .. 
code-block:: python - exe = fluid.executor(place) - distance_evaluator = fluid.Evaluator.EditDistance(input, label) - for epoch in PASS_NUM: - distance_evaluator.reset(exe) - for data in batches: - loss = exe.run(fetch_list=[cost]) - distance, instance_error = distance_evaluator.eval(exe) + exe = fluid.executor(place) + distance_evaluator = fluid.Evaluator.EditDistance(input, label) + for epoch in PASS_NUM: + distance_evaluator.reset(exe) + for data in batches: + loss = exe.run(fetch_list=[cost]) + distance, instance_error = distance_evaluator.eval(exe) In the above example: 'distance' is the average of the edit distance in a pass. @@ -210,11 +249,11 @@ class EditDistance(Evaluator): if main_program.current_block().idx != 0: raise ValueError("You can only invoke Evaluator in root block") - self.total_distance = self.create_state( + self.total_distance = self._create_state( dtype='float32', shape=[1], suffix='total_distance') - self.seq_num = self.create_state( + self.seq_num = self._create_state( dtype='int64', shape=[1], suffix='seq_num') - self.instance_error = self.create_state( + self.instance_error = self._create_state( dtype='int64', shape=[1], suffix='instance_error') distances, seq_num = layers.edit_distance( input=input, label=label, ignored_tokens=ignored_tokens) @@ -256,9 +295,10 @@ class EditDistance(Evaluator): class DetectionMAP(Evaluator): """ + Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP + instead. Calculate the detection mean average precision (mAP). - TODO (Dang Qingqing): update the following doc. The general steps are as follows: 1. calculate the true positive and false positive according to the input of detection and labels. @@ -293,17 +333,18 @@ class DetectionMAP(Evaluator): - 11point: the 11-point interpolated average precision. - integral: the natural integral of the precision-recall curve. - Example: + Examples: + .. 
code-block:: python - exe = fluid.executor(place) - map_evaluator = fluid.Evaluator.DetectionMAP(input, - gt_label, gt_box, gt_difficult) - cur_map, accum_map = map_evaluator.get_map_var() - fetch = [cost, cur_map, accum_map] - for epoch in PASS_NUM: - map_evaluator.reset(exe) - for data in batches: - loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) + exe = fluid.executor(place) + map_evaluator = fluid.Evaluator.DetectionMAP(input, + gt_label, gt_box, gt_difficult) + cur_map, accum_map = map_evaluator.get_map_var() + fetch = [cost, cur_map, accum_map] + for epoch in PASS_NUM: + map_evaluator.reset(exe) + for data in batches: + loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) In the above example: @@ -340,9 +381,10 @@ class DetectionMAP(Evaluator): evaluate_difficult=evaluate_difficult, ap_version=ap_version) - self.create_state(dtype='int32', shape=None, suffix='accum_pos_count') - self.create_state(dtype='float32', shape=None, suffix='accum_true_pos') - self.create_state(dtype='float32', shape=None, suffix='accum_false_pos') + self._create_state(dtype='int32', shape=None, suffix='accum_pos_count') + self._create_state(dtype='float32', shape=None, suffix='accum_true_pos') + self._create_state( + dtype='float32', shape=None, suffix='accum_false_pos') self.has_state = None var = self.helper.create_variable( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 33d8f709412b25d29c6618272500dd7b953d6645..b436dfe70afdb52299222f8ba3f5bdff2842d103 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -18,17 +18,24 @@ from framework import Program, default_main_program, Variable from . import core __all__ = [ - 'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var' + 'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var' ] g_scope = core.Scope() def global_scope(): + """ + Get the global/default scope instance. There are a lot of APIs use + :code:`global_scope` as its default value, e.g., :code:`Executor.run` + + Returns: + Scope: The global/default scope instance. + """ return g_scope -def switch_scope(scope): +def _switch_scope(scope): global g_scope ex = g_scope g_scope = scope @@ -37,12 +44,42 @@ def switch_scope(scope): @contextlib.contextmanager def scope_guard(scope): - ex = switch_scope(scope) + """ + Change the global/default scope instance by Python `with` statement. All + variable in runtime will assigned to the new scope. + + Examples: + >>> import paddle.fluid as fluid + >>> new_scope = fluid.Scope() + >>> with fluid.scope_guard(new_scope): + >>> ... + + Args: + scope: The new global/default scope. + """ + ex = _switch_scope(scope) yield - switch_scope(ex) + _switch_scope(ex) def as_numpy(tensor): + """ + Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + Examples: + >>> import paddle.fluid as fluid + >>> outs = executor.run(...) + >>> np_outs = map(lambda x: as_numpy(x), outs) + >>> ... 
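# A small sketch of the scope_guard()/global_scope() behaviour documented
# above: variables created while the guard is active live in the new scope,
# not in the default global scope. The variable name is made up.
import paddle.fluid as fluid

new_scope = fluid.Scope()
with fluid.scope_guard(new_scope):
    # Anything run here, e.g. exe.run(...), uses new_scope by default.
    fluid.global_scope().var("tmp_var")
assert new_scope.find_var("tmp_var") is not None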
+ + Args: + tensor(Variable): a instance of Tensor + + Returns: + numpy.ndarray + """ + if isinstance(tensor, core.LoDTensorArray): + return [as_numpy(t) for t in tensor] if isinstance(tensor, list): return [as_numpy(t) for t in tensor] assert isinstance(tensor, core.LoDTensor) @@ -135,14 +172,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name): def fetch_var(name, scope=None, return_numpy=True): """ - Fetch the value of the variable with the given name from the given scope + Fetch the value of the variable with the given name from the + given scope. + Args: name(str): name of the variable. Typically, only persistable variables can be found in the scope used for running the program. scope(core.Scope|None): scope object. It should be the scope where you pass to Executor.run() when running your program. - If None, global_scope() will be used. - return_numpy(bool): whether convert the tensor to numpy.ndarray + If None, global_scope() will be used. Default None. + return_numpy(bool): whether convert the tensor to numpy.ndarray. + Default True. + Returns: LodTensor|numpy.ndarray """ @@ -162,7 +203,7 @@ def fetch_var(name, scope=None, return_numpy=True): return tensor -def get_program_cache_key(feed, fetch_list): +def _get_program_cache_key(feed, fetch_list): feed_var_names = feed.keys() def to_name_str(var): @@ -181,6 +222,25 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): + """ + An Executor in Python, only support the single-GPU running. For multi-cards, please refer to + ParallelExecutor. + Python executor takes a program, add feed operators and fetch operators to this program according + to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides + the variables(or names) that user want to get after program run. Note: the executor will run all + operators in the program but not only the operators dependent by the fetch_list. + It store the global variables into the global scope, and create a local scope for the temporary + variables. The local scope contents will be discarded after every minibatch forward/backward finished. + But the global scope variables will be persistent through different runs. + All of ops in program will be running in sequence. + + Args: + place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device + + Note: For debugging complicated network in parallel-GPUs, you can test it on the executor. + They has the exactly same arguments, and expected the same results. + """ + def __init__(self, place): self.place = place p = core.Place() @@ -189,6 +249,23 @@ class Executor(object): self.program_caches = dict() def as_lodtensor(self, data): + """ + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. + + Examples: + >>> import paddle.fluid as fluid + >>> exe = fluid.executor(fluid.CPUPlace()) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: exe.as_lodtensor(x), data) + >>> ... + + Args: + data(numpy.ndarray): a instance of array + + Returns: + LoDTensor + """ if isinstance(data, list): raise RuntimeError("Some of your feed data hold LoD information. 
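# A hedged sketch of fetch_var() described above: a persistable parameter can
# be read back by name after the startup program has run. The parameter name
# 'fc_w' is set explicitly here so it is known in advance.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
y = fluid.layers.fc(input=x, size=2, param_attr=fluid.ParamAttr(name='fc_w'))
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
w = fluid.executor.fetch_var('fc_w')   # a numpy.ndarray by default
print(w.shape)                         # (4, 2)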
\ They can not be completely cast from a list of Python \ @@ -271,6 +348,12 @@ class Executor(object): ] return outs + def begin_pass(self): + self.executor.begin_pass() + + def end_pass(self): + self.executor.end_pass() + def run(self, program=None, feed=None, @@ -280,23 +363,47 @@ class Executor(object): scope=None, return_numpy=True, use_program_cache=False): - """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list. - + """ + Run program by this Executor. Feed data by feed map, fetch result by fetch_list. Python executor takes a program, add feed operators and fetch operators to this program according to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides - the variables(or names) that user want to get after program run. Note: the executor will run all + the variables(or names) that user want to get after program run. + + Note: the executor will run all operators in the program but not only the operators dependent by the fetch_list - :param program: the program that need to run, if not provied, then default_main_program will be used. - :param feed: feed variable map, e.g. {"image": ImageData, "label": LableData} - :param fetch_list: a list of variable or variable names that user want to get, run will return them according - to this list. - :param feed_var_name: the name for the input variable of feed Operator. - :param fetch_var_name: the name for the output variable of feed Operator. - :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope - :param return_numpy: if convert the fetched tensor to numpy - :param use_program_cache: set use_program_cache to true if program not changed compare to the last step. - :return: result according to fetch_list. + Args: + program(Program): the program that need to run, if not provied, then default_main_program will be used. + feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData} + fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list. + feed_var_name(str): the name for the input variable of feed Operator. + fetch_var_name(str): the name for the output variable of fetch Operator. + scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope + return_numpy(bool): if convert the fetched tensor to numpy + use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step. + + Returns: + + list(numpy.array): fetch result according to fetch_list. 
+ + + Examples: + + >>> data = layers.data(name='X', shape=[1], dtype='float32') + >>> hidden = layers.fc(input=data, size=10) + >>> layers.assign(hidden, out) + >>> loss = layers.mean(out) + >>> adam = fluid.optimizer.Adam() + >>> adam.minimize(loss) + + >>> cpu = core.CPUPlace() + >>> exe = Executor(cpu) + >>> exe.run(default_startup_program()) + + >>> x = numpy.random.random(size=(10, 1)).astype('float32') + >>> outs = exe.run( + >>> feed={'X': x}, + >>> fetch_list=[loss.name]) """ if feed is None: feed = {} @@ -317,7 +424,7 @@ class Executor(object): if scope is None: scope = global_scope() - cache_key = get_program_cache_key(feed, fetch_list) + cache_key = _get_program_cache_key(feed, fetch_list) if use_program_cache: cached_program = self._get_program_cache(cache_key) if cached_program is None: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index f6438c82ac207d0e38d8be5e9d6252b28e72826e..ea3117e02bd993b06de39725b2c3296031065e3c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -19,7 +19,16 @@ import re import numpy as np import proto.framework_pb2 as framework_pb2 -from . import core +try: + from . import core +except ImportError, e: + raise ImportError( + """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" + if you encounters \"libmkldnn.so not found\" errors. If you have python + installed in other directory, replace \"/usr/local/lib\" with your own + directory. The original error is: \n""" + e.message) +except Exception, e: + raise e import unique_name __all__ = [ @@ -27,11 +36,10 @@ __all__ = [ 'Variable', 'Program', 'Operator', + 'Parameter', 'default_startup_program', 'default_main_program', 'program_guard', - 'switch_startup_program', - 'switch_main_program', 'get_var', ] @@ -43,7 +51,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix() def grad_var_name(var_name): """ - return gradient name for a certain var name + Returns: + str: gradient name for a certain var name """ return var_name + GRAD_VAR_SUFFIX @@ -51,10 +60,12 @@ def grad_var_name(var_name): def convert_np_dtype_to_dtype_(np_dtype): """ Convert the data type in numpy to the data type in Paddle + Args: - np_dtype(np.dtype): the data type in numpy + np_dtype(np.dtype): the data type in numpy. - Returns(core.VarDesc.VarType): the data type in Paddle + Returns: + core.VarDesc.VarType: the data type in Paddle. """ dtype = np.dtype(np_dtype) @@ -120,37 +131,53 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): """ - Python variable. Every input and output of an operator is a variable. Every - variable belongs to a block. The variable has a name and two variables in - different blocks could have the same name. + In Fluid, every input and output of an operator is a variable. In most + cases, variables are used for holding different kinds of data or training + labels. A variable belongs to a block. All variable has its own name and + two variables in different blocks could have the same name. - There are many kinds of variables. Please reference the framework.proto for - details. + There are many kinds of variables. Each kind of them has its own attributes + and usages. Please reference the framework.proto for details. - Notes: The constructor of Variable should not be invoked directly. Please - use `Block.create_var` to create a variable. 
- - >>> cur_program = Program() - >>> cur_block = cur_program.current_block() - >>> new_variable = cur_block.create_var( - >>> name="X", shape=[-1, 23, 48], dtype='float32') + Most of a Variable's member variables can be setted to be None. It mean + it is not available or will be specified later. Args: - block(Block): The associated block. It will be passed by - `Block.create_var` automatically. + block(Block): The block that the variable belongs to. type(core.VarDesc.VarType): Variable type. Please reference the framework.proto for details. - shape(tuple|list|None): The shape of variable. -1 means the batch size. + name(str|None): The name of the variable. If setted None, it will be + generated automatically. Default: None + shape(tuple|list|None): The shape of the variable. -1 means the batch size. Some kinds of variable do not contain shape, just set it to None. - dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable. - lod_level(int): The level of lod tensor. 0 means it is not a time + Default: None + dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable. + Default: None + lod_level (int|None): The level of lod tensor. 0 means it is not a time series data. - capacity(int): The capacity of Channel variable. Ignored - for other types. - persistable(bool): True if the variable should be saved as check point. - Defaults to False. - stop_gradient(bool): True if the variable will stop to calculate - gradients when backward. Defaults to False. + Default: None + capacity (int|None): The capacity of Channel variable. Ignored for other + types. Default: None + persistable (bool|None): True if the variable is persistable. A persistable + variable will not be deleted after an iteration ending. Defaults: None. + error_clip (BaseErrorClipAttr|None): The error clip attributes of the + corresponding gradient variable. Default: None + stop_gradient (bool): True if the variable will stop to calculate its + gradients when backward. Default: False. + is_data (bool): True if the variable is an input data. Default: False + + Notes: + The constructor of Variable should not be invoked directly. Please + use `Block.create_var` to create a variable. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + new_variable = cur_block.create_var(name="X", + shape=[-1, 23, 48], + dtype='float32') """ def __init__(self, @@ -253,13 +280,14 @@ class Variable(object): Get debug string. Args: - throw_on_error(bool): True if raise an exception when self is not - intialized. + throw_on_error(bool): True if raise an exception when self is + not initialized. with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True - - Returns(str): The debug string. + (e.g. trainable, optimize_attr, ...) will be printed when + with_details is True. Default False; + Returns: + str: The debug string. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -276,6 +304,15 @@ class Variable(object): __repr__ = __str__ def set_desc(self, input): + """ + Set the variable description. + + Args: + input(core.VarDesc): The new VarDesc. + + Returns: + None + """ self.desc = input @property @@ -312,6 +349,15 @@ class Variable(object): return self.desc.type() def set_error_clip(self, error_clip): + """ + Set the error_clip. + + Args: + error_clip(BaseErrorClipAttr) : The new error_clip. 
+ + Returns: + None + """ self.error_clip = error_clip @@ -319,8 +365,8 @@ def get_all_op_protos(): """ Get all registered op proto from PaddlePaddle C++ end. - Returns(list): list of OpProto - + Returns: + list: list of OpProto. """ protostrs = core.get_all_op_protos() ret_values = [] @@ -373,16 +419,52 @@ class OpProtoHolder(object): class Operator(object): """ - Python Operator class. The operator represents the build in instructions in a - Block. Users can use the build in instructions to describe their neural - network. + In Fluid, all the operation are represented by Operator, and Operator + is regarded as a build in an instruction of a Block. Users can use the + build in instructions to describe their neural network. + + Args: + block(Block): The block has the current operator. + desc(core.OpDesc): The protobuf description of Operator. + type(str): The type of operator. Default None. + inputs(dict): The input of this Operator. it is a dictionary, for every + element, key is the input parameter name, and value is a list of + variables. Default None. + outputs(dict): The output of this Operator. it is a dictionary, for + every element, key is the input parameter name, and value is a list + of variables. Default None. + attrs(dict): The attributes of this Operator. it is a dictionary, for + every element, key is attribute name, and value is the attribute value. + The attribute type should be as same as the type registered in C++ side. + Default None. + + Returns: + Operator: The initialized Operator. + + Raises: + ValueError: If the passed input, output and attrs doesn't match the + initializing Operator's that registered in C++ side. + + Notes: + The constructor of operator should not be invoked directly. Use + Block.append_op or Block.prepend_op instead. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + # var1 += var2 + var3 + cur_block.append_op(type="sum", + inputs={"X": [var1, var2, var3]}, + outputs={"Out": [var1]}) """ OP_WITHOUT_KERNEL_SET = { 'feed', 'fetch', 'save', 'load', 'recurrent', 'go', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', 'channel_send', - 'channel_recv', 'select', 'gen_nccl_id' + 'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id' } def __init__(self, @@ -392,31 +474,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - """ - Constructor. - - Notes: The constructor of operator should not be invoked directly. Use - Block.append_op or Block.prepend_op instead. - >>> cur_program = Program() - >>> cur_block = cur_program.current_block() - >>> # var1 += var2 + var3 - >>> cur_block.append_op(type="sum", - >>> inputs={"X": [var1, var2, var3]}, - >>> outputs={"Out": [var1]}) - - Args: - block(Block): The block has the current operator. - desc(core.OpDesc): The protobuf description. - type(str): The type of operator. - inputs(dict): The input dictionary. Key is the input parameter name. - Value is a list of variables. - outputs(dict): The output dictionary which has the same format with - inputs. - attrs(dict): The attributes dictionary. Key is attribute name. Value - is the attribute value. 
The attribute type should be as same as - the type registered in C++ - """ self.block = block self.desc = desc self.attrs = attrs @@ -510,15 +568,9 @@ class Operator(object): if (attr_name not in self.attrs) or ( self.attrs[attr_name] is None): continue - if isinstance(self.attrs[attr_name], Block): - self.desc.set_block_attr(attr_name, - self.attrs[attr_name].desc) - elif isinstance(self.attrs[attr_name], core.BlockDesc) or \ - isinstance(self.attrs[attr_name], core.ProgramDesc): - self.desc.set_serialized_attr( - attr_name, self.attrs[attr_name].serialize_to_string()) - else: - self.desc.set_attr(attr_name, self.attrs[attr_name]) + attr_val = self.attrs[attr_name] + self._update_desc_attr(attr_name, attr_val) + self.desc.check_attrs() if self.has_kernel(type): self.desc.infer_var_type(self.block.desc) @@ -529,12 +581,14 @@ class Operator(object): def to_string(self, throw_on_error): """ - To debug string. + Get debug string. + Args: - throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True + throw_on_error(bool): Whether to raise exception if self is not + initialized. - Returns(str): The debug string. + Returns: + str: The debug string. """ protostr = self.desc.serialize_to_string() @@ -552,29 +606,45 @@ class Operator(object): def input(self, name): """ - Get input arguments by the input parameter name - Args: - name(str): The input parameter name + Get the input arguments according to the input parameter name. - Returns(list): return the list of argument names associated with the - specific parameter name. + Args: + name(str): The input parameter name. + Returns: + list: return the list of argument names that associated with \ + the specific parameter name. """ return self.desc.input(name) def rename_input(self, old_name, new_name): + """ + Rename the `old_name` to `new_name`. + + Args: + old_name(str): The old name of the Operator's input. + new_name(str): The new name of the Operator's input. + + Returns: + None + """ self.desc.rename_input(old_name, new_name) def rename_output(self, old_name, new_name): + """ + Rename the `old_name` to `new_name`. + + Args: + old_name(str): The old name of the Operator's output. + new_name(str): The new name of the Operator's output. + + Returns: + None + """ self.desc.rename_output(old_name, new_name) @property def input_names(self): - """ - Get all input parameter names - Returns(list): return a list of input parameter names - - """ return self.desc.input_names() @property @@ -587,33 +657,23 @@ class Operator(object): def output(self, name): """ - Get output arguments by the output parameter name - Args: - name(str): The output parameter name + Get output arguments by the output parameter name. - Returns(list): return the list of argument names associated with the - specific parameter name. + Args: + name(str): The output parameter name. + Returns: + list: return the list of argument names associated with \ + the specific parameter name. """ return self.desc.output(name) @property def output_names(self): - """ - Get all output parameter names - Returns(list): return a list of output parameter names - - """ return self.desc.output_names() @property def idx(self): - """ - Return the array index of current operator. - Returns(int): The array index in block.ops array - Raises: - ValueError: when the operator is not found. 
- """ for i, op in enumerate(self.block.ops): if op == self: return i @@ -622,66 +682,100 @@ class Operator(object): def has_attr(self, name): """ - operator has the attribute with name or not. + Whether this Operator has the attribute with name or not. + Args: - name(str): the attribute name + name(str): the attribute name. - Returns(bool): True if has this attribute. + Returns: + bool: True if has this attribute. """ return self.desc.has_attr(name) def attr_type(self, name): """ - Get the type of attribute by attribute name - Args: - name(str): the attribute name + Get the type of attribute by attribute's name. - Returns(core.AttrType): the attribute type + Args: + name(str): the attribute name. + Returns: + core.AttrType: the attribute type. """ return self.desc.attr_type(name) def set_attr(self, name, val): + """ + Set the value of attribute by attribute's name. + + Args: + name(str): the attribute name. + val(bool|int|str|float|list): the value of the attribute. + + Raises: + ValueError: If the type of value doesn't match with desc.attr_type(name). + """ self.attrs[name] = val - self.desc.set_attr(name, val) + self._update_desc_attr(name, val) - @property - def attr_names(self): + def _update_desc_attr(self, name, val): """ - Get all attribute names - Returns(list): The list of attribute name + Update the value of desc's attribute by attribute's name. + + Args: + name(str): the attribute name. + val(bool|int|str|float|list): the value of the attribute. + Raises: + ValueError: If the type of value doesn't match with desc.attr_type(name). """ + if isinstance(val, Block): + self.desc.set_block_attr(name, val.desc) + elif isinstance(val, list) and val and all( + isinstance(v, Block) for v in val): + self.desc.set_blocks_attr(name, [v.desc for v in val]) + elif isinstance(val, core.BlockDesc) or \ + isinstance(val, core.ProgramDesc): + self.desc.set_serialized_attr(name, val.serialize_to_string()) + else: + self.desc.set_attr(name, val) + + @property + def attr_names(self): return self.desc.attr_names() def attr(self, name): """ - Get attribute by name + Get the attribute by name. + Args: - name(str): the attribute name + name(str): the attribute name. - Returns(bool|int|str|float|list): The attribute value. The return value + Returns: + bool|int|str|float|list: The attribute value. The return value can be any valid attribute type. - """ return self.desc.attr(name) def block_attr(self, name): """ - Get the block attribute by name - Args: - name(str): the attribute name + Get the block attribute by name. - Returns(int): the block index + Args: + name(str): the attribute name. + Returns: + int: the block index. """ return self.desc.block_attr(name) def all_attrs(self): """ - Get the attribute dict - Returns(dict): The Operator's attribute dict + Get the attribute dict. + + Returns: + dict: The Operator's attribute dict. """ attr_names = self.attr_names attr_map = {} @@ -694,6 +788,35 @@ class Operator(object): class Block(object): + """ + In Fluid, a Program is consistence of multi-Block, and Block stores + VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name. + One block could have some child blocks, and child block's name scopes + should inherit the parent's so that OpDesc in child block can reference + a VarDesc that is stored in the parent block. + Please reference the framework.proto for details. + + Args: + program(Program): The Program that the Block belongs to. + idx(int): The block's id in the Program. 
+ + Notes: + The constructor of Block should not be invoked directly. Please + use `Program.create_block()` to create a block. + + Examples: + .. code-block:: python + + cur_program = Program() + cur_block = cur_program.current_block() + var = cur_block.create_var(name="X", + shape=[-1, 23, 48], + dtype='float32') + cur_block.append_op(type="abs", + inputs={"X": [var]}, + outputs={"Out": [var]}) + """ + def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = collections.OrderedDict() # var_name --> var @@ -706,15 +829,17 @@ class Block(object): def to_string(self, throw_on_error, with_details=False): """ - To debug string. + Get debug string. + Args: throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True + when throw_on_error is True. with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True - - Returns(str): The debug string. + (e.g. trainable, optimize_attr, ...) will be printed when + with_details is True. Default False. + Returns: + str: The debug string. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -746,6 +871,15 @@ class Block(object): return self.desc.get_forward_block_idx() def set_forward_block_idx(self, idx): + """ + Set the forward block Idx. + + Args: + idx(int): the block index. + + Returns: + None + """ self.desc.set_forward_block_idx(idx) @property @@ -753,6 +887,19 @@ class Block(object): return self.desc.id def var(self, name): + """ + Get a Variable by name from this block. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: The If input's type is not str, or this block + doesn't have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ if not isinstance(name, basestring): raise TypeError( "var require string as parameter, but get %s instead." % @@ -763,6 +910,19 @@ class Block(object): return v def var_recursive(self, name): + """ + Get a Variable by name from this block recursively. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: this block and this parent block doesn't + have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ frontier = list() visited = set() @@ -809,6 +969,18 @@ class Block(object): def rename_var(self, name, new_name): """ Rename variable in vars and ops' inputs and outputs + + Args: + name(str): the name that need to be renamed. + new_name(str): the name that need to rename to. + + Raises: + ValueError: If this block doesn't have this the giving name, + or the type of the var with the giving name is not Parameter + or Variable. + + Returns: + Variable: the Variable with the giving name. """ if not self.has_var(name): raise ValueError("var %s is not in current block" % name) @@ -872,12 +1044,27 @@ class Block(object): return param def append_op(self, *args, **kwargs): + """ + Appends a new Operator according to the giving arguments. + + Returns: + Operator: the append Operator. + """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) self.ops.append(op) return op def insert_op(self, index, *args, **kwargs): + """ + Insert a Operator according to the giving arguments. + + Args: + index(int): the place that the operator to insert. + + Returns: + Operator: the insert Operator. 
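A minimal sketch of the block-level variable lookup described above. It assumes a child block created with `Program.create_block()` and closed with `Program.rollback()`, both of which are documented further down in this change; it is illustrative only.

.. code-block:: python

    import paddle.fluid as fluid

    prog = fluid.Program()
    root = prog.global_block()
    root.create_var(name="X", shape=[-1, 23, 48], dtype='float32')

    sub = prog.create_block()      # child block; its parent is the root block
    # var() searches only this block; var_recursive() also walks the parent blocks.
    x = sub.var_recursive("X")
    print(x.name)                  # 'X'
    prog.rollback()                # leave the child block again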
+ """ self.sync_with_cpp() op_desc = self.desc.insert_op(index) op = Operator(block=self, desc=op_desc, *args, **kwargs) @@ -885,11 +1072,30 @@ class Block(object): return op def remove_op(self, index): + """ + Remove the specific position operator. + + Args: + index(int): the position that the operator to insert. + + Returns: + None + """ self.sync_with_cpp() self.desc.remove_op(index, index + 1) del self.ops[index] def slice_ops(self, start, end): + """ + Return the Operator between start and end. + + Args: + start(int): the start position. + end(int): the end position. + + Returns: + list: the Operators between start and end. + """ return self.ops[start:end] def prepend_op(self, *args, **kwargs): @@ -900,9 +1106,8 @@ class Block(object): def sync_with_cpp(self): """ - Sync from the desc on the c++ end. - - This method is used to synchronize the c++ desc instance generated by backward. + Sync from the desc on the c++ end. This method is used to synchronize + the c++ desc instance generated by backward. """ # sync variables from cpp for var in self.desc.all_vars(): @@ -967,9 +1172,14 @@ class Block(object): def copy_param_info_from(self, other): """ - Copy the information of parameters from the other block + Copy the information of parameters from the other block. + Args: - other(Block): the other block + other(Block): the other block. + + Raises: + ValueError: If type of input is not Block, or the `other` and this + block is not in the same topology. Returns: None @@ -1001,11 +1211,12 @@ class Block(object): def clone_variable(self, var): """ Clone a variable into current block. + Args: var: the variable to be cloned. Returns: - The new variable cloned from 'var' in current block. + Variable: the new variable cloned from 'var' in current block. """ assert isinstance(var, Variable) ret_var = None @@ -1013,6 +1224,9 @@ class Block(object): if var.type == core.VarDesc.VarType.STEP_SCOPES: ret_var = self.create_var( name=var.name, persistable=var.persistable, type=var.type) + elif var.type == core.VarDesc.VarType.RAW: + ret_var = self.create_var( + name=var.name, persistable=var.persistable, type=var.type) elif var.type == core.VarDesc.VarType.SELECTED_ROWS: ret_var = self.create_var( name=var.name, @@ -1034,6 +1248,32 @@ class Block(object): class Program(object): + """ + Python Program. Beneath it is a ProgramDesc, which is used for + create c++ Program. A program is a self-contained programing + language like container. It has at least one Block, when the + control flow op like conditional_block, while_op is included, + it will contains nested block. + Please reference the framework.proto for details. + + Notes: we have default_startup_program and default_main_program + by default, a pair of them will shared the parameters. + The default_startup_program only run once to initialize parameters, + default_main_program run in every mini batch and adjust the weights. + + Returns: + A empty program. + + Examples: + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program): + >>> fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') + >>> fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') + >>> fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu") + + """ + def __init__(self): self.desc = core.ProgramDesc() self.blocks = [Block(self, 0)] @@ -1044,6 +1284,19 @@ class Program(object): @property def op_role(self): + """ + The operator role. 
In a enum {Forward, Backward, Optimize}. + + Notes: this is a low level API. It is used only for ParallelExecutor to + duplicate or schedule operator to devices. + + For example, the forward operator should be executed on every device. + The backward operator should be executed on every device and the + parameter gradient of backward (use :code:`op_role_var` to get this + variable) operator should be merged to one device. The optimization + operators should be executed on only one device and broadcast the + optimization result, i.e., the new parameter, to every other device. + """ return self._current_role @op_role.setter @@ -1052,6 +1305,13 @@ class Program(object): @property def op_role_var(self): + """ + The auxiliary variables for :code:`op_role` property. + + See Also: :code:`Program.op_role`'s documentation for details. + + Notes: This is a very low-level API. Users should not use it directly. + """ return self._op_role_var @op_role_var.setter @@ -1060,6 +1320,21 @@ class Program(object): @contextlib.contextmanager def optimized_guard(self, var): + """ + A with guard to set :code:`Optimization` :code:`OpRole` and + :code:`OpRoleVar` automatically. + + Notes: This is a very low level API. Users should not use it directly. + + Args: + var(Variable|str): The variable (name) to be optimized. + + Examples: + + >>> p, g = backward(...) + >>> with program.optimized_guard(p): + >>> p = p - 0.001 * g + """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize self._op_role_var = [var.name if isinstance(var, Variable) else var] @@ -1068,18 +1343,35 @@ class Program(object): self._current_role = OpRole.Forward def __str__(self): + """ + Get the protobuf debug string of this Program. + + Returns: + (str): The protobuf debug string. + + Raises: + ValueError: If any of required fields is not set. + """ return self.to_string(True) def to_string(self, throw_on_error, with_details=False): """ To debug string. + Args: - throw_on_error(bool): raise exception when self is not initialized - when throw_on_error is True - with_details(bool): more details about variables and parameters - (e.g. trainable, optimize_attr, ...) will be printed when with_details is True + throw_on_error(bool): raise Value error when any of required fields + is not set. - Returns(str): The debug string. + with_details(bool): True if more details about variables and + parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need + to print. + + Returns + (str): The debug string. + + Raises: + ValueError: If any of required fields is not set and throw_on_error is + True. """ assert isinstance(throw_on_error, bool) and isinstance(with_details, @@ -1095,22 +1387,93 @@ class Program(object): return res_str def get_desc(self): + """ + Get the C++ side of `ProgramDesc` object pointer. The C++ object is + exposed by :code:`pybind`. + + Notes: This is a very low level API. Users should not use this API + directly. + """ return self.desc def clone(self, for_test=False): - """Clone the Program object + """ + Create a new, duplicated program. + + + Some operators, e.g., :code:`batch_norm`, behave differently between + training and testing. They have an attribute, :code:`is_test`, to + control this behaviour. This method will change the :code:`is_test` + attribute of them to :code:`True` when :code:`for_test=True`. - Set for_test to False when we want to clone the program for training. - Set for_test to True when we want to clone the program for testing. 
+ * Set for_test to False when we want to clone the program for training. + * Set for_test to True when we want to clone the program for testing. + + Notes: This API DOES NOT prune any operator. Use + :code:`clone(for_test=True)` before backward and optimization please. e.g. + + >>> test_program = fluid.default_main_program().clone(for_test=True) + >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + >>> optimizer.minimize() Args: - for_test(bool): Some operators, such as batch_norm and drop_out ops, - behave differently in training and testing. If for_test is True, - the is_test attributes in these operators will be set to True for - testing purposes, otherwise, they remain unchanged. + for_test(bool): True if change the :code:`is_test` attribute of + operators to :code:`True`. - Returns(Program): - The cloned Program object. + Returns: + Program: The new, duplicated Program object. + + Examples: + + 1. To clone a test program, the sample code is: + + >>> import paddle.fluid as fluid + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(train_program, startup_program): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> + >>> test_program = train_program.clone(for_test=True) + >>> + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> with fluid.program_guard(train_program, startup_program): + >>> sgd.minimize(loss) + + 2. The :code:`clone` method can be avoid if you create program for + training and program for testing individually. + + >>> import paddle.fluid as fluid + >>> + >>> def network(is_test): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> return loss + >>> + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> test_program = fluid.Program() + >>> + >>> with fluid.program_guard(train_program, startup_program): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=False) + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> sgd.minimize(loss) + >>> + >>> # the test startup program is not used. + >>> with fluid.program_guard(test_program, fluid.Program()): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=True) + + The two code snippets above will generate same programs. """ if for_test: p = self.inference_optimize() @@ -1125,6 +1488,21 @@ class Program(object): return p def prune(self, targets): + """ + Prune operators and variables which are not needed to generate + :code:`targets`. + + Notes: This is a very low level API. Users should not use this API + directly. This API is in flux and not stable. + + Args: + targets(list|Variable|Operator): A list of variables or operators + need to be pruned + + Returns: + Program: A new, pruned program. 
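For quick inspection while debugging, the cloning and debug-string helpers described above can be combined; a small hedged sketch (the layers are made up for illustration):

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)

    # Clone before appending backward/optimization ops, then inspect the clone.
    test_program = fluid.default_main_program().clone(for_test=True)
    print(test_program.to_string(throw_on_error=True, with_details=False))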
+ + """ if not isinstance(targets, list): targets = [targets] targets_idx = [] @@ -1159,6 +1537,17 @@ class Program(object): return res def inference_optimize(self): + """ + This method will create a new program and change the :code:`is_test` + attribute of operators to :code:`True`. All the :code:`Parameter` + information will be lost. + + Notes: This API is a very low level API. Use + :code:`Program.clone(for_test=True)` instead. + + Returns: + Program: The new program. + """ # this is an alternative implement before # core.inference_optimize being fixed. res = Program() @@ -1175,6 +1564,18 @@ class Program(object): @staticmethod def parse_from_string(binary_str): + """ + Deserialize a program desc from protobuf binary string. + + Notes: All information about parameters will be lost after serialization + and deserialization. + + Args: + binary_str(str): The binary prootbuf string. + + Returns: + Program: A deserialized program desc. + """ p = Program() p.desc = core.ProgramDesc(binary_str) p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())] @@ -1183,10 +1584,19 @@ class Program(object): @property def random_seed(self): + """ + The default random seed for random operators in Program. Zero means get + the random seed from random device. + + Notes: It must be set before the operators have been added. + """ return self._seed @property def num_blocks(self): + """ + The number of blocks in this program. + """ return self.desc.num_blocks() @random_seed.setter @@ -1199,15 +1609,40 @@ class Program(object): return str(self) def global_block(self): + """ + Get the first block of this program. + """ return self.blocks[0] def block(self, index): + """ + Get the :code:`index` block of this program + Args: + index(int): The index of block to get + + Returns: + Block: The :code:`index` block + """ return self.blocks[index] def current_block(self): + """ + Get the current block. The :code:`current` block is the block to append + operators. + """ return self.blocks[self.current_block_idx] def create_block(self, parent_idx=None): + """ + Create a new block with the :code:`parent_idx` and change the current block + to new block. + + Args: + parent_idx(int): The parent block index. + + Returns: + Block: The new block. + """ new_block_idx = len(self.blocks) parent = self.current_block() if parent_idx is None else self.block( parent_idx) @@ -1217,9 +1652,24 @@ class Program(object): return self.current_block() def rollback(self): + """ + Exit a code block, i.e., roll back to the parent block. + Returns: + None + """ self.current_block_idx = self.current_block().parent_idx def sync_with_cpp(self): + """ + Synchronize Python instance to its binding C++ object instance. + If the program is modified in C++ space, this method should be invoked. + + Notes: This is a very low level API. Users should not invoke it + directly. + + Returns: + None + """ for block_idx in range(len(self.blocks), self.desc.num_blocks()): self.blocks.append(Block(self, block_idx)) for block in self.blocks: @@ -1228,6 +1678,10 @@ class Program(object): def copy_param_info_from(self, other): """ Copy the information of parameters from other program. + + Notes: This is a very low level API. Users should not invoke it + directly. + Args: other(Program): Other program @@ -1246,6 +1700,10 @@ class Program(object): def copy_data_info_from(self, other): """ Copy the information of data variables from other program. + + Notes: This is a very low level API. Users should not invoke it + directly. 
+ Args: other(Program): Other program @@ -1264,12 +1722,41 @@ class Program(object): self.global_block().var(var.name).is_data = True def list_vars(self): + """ + Get all variables from this Program. A iterable object is returned. + + Returns: + iterable: The generator will yield every variable in this program. + """ for each_block in self.blocks: for each_var in each_block.vars.itervalues(): yield each_var class Parameter(Variable): + """ + Parameter is derived from Variable. A parameter is a persistable + Variable, and will be updated by optimizers after each iteration. + The training of a neural network is essentially the updating of + its parameters. + + Relative to a general Variable, a Parameter has several its own + member variables: + + Args: + trainable(bool): True if the parameter need to be updated after + iterations. + optimize_attr(map): Parameter attributes related with optimizing. + Currently, it only contains 'learning_rate'. + Default: {'learning_rate': 1.0} + regularizer(WeightDecayRegularizer): The Regularizer which will + be applied on the parameter. Default: None + gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy + which will be applied on the parameter. Default: None + do_model_average(bool): True if the model average strategy will + be applied on this parameter. + """ + def __init__(self, block, shape, dtype, **kwargs): if shape is None or dtype is None: raise ValueError("Parameter must set shape and dtype") @@ -1299,6 +1786,7 @@ class Parameter(Variable): def to_string(self, throw_on_error, with_details=False): """ To debug string. + Args: throw_on_error(bool): raise exception when self is not initialized when throw_on_error is True @@ -1331,8 +1819,15 @@ _startup_program_ = Program() def default_startup_program(): """ - Get default startup program. In startup program, Paddle will initialize - parameters, initialize nccl handle, etc. + Get default/global startup program. + + The layer function in :code:`fluid.layers` will create parameters, readers, + NCCL handles as global variables. The :code:`startup_program` will + initialize them by the operators in startup program. The layer function will + append these initialization operators into startup program. + + This method will return the :code:`default` or the :code:`current` startup + program. Users can use :code:`fluid.program_guard` to switch program. Returns: Program: startup program @@ -1342,7 +1837,15 @@ def default_startup_program(): def default_main_program(): """ - Get default main program. The main program is used for training or testing. + Get default/global main program. The main program is used for training or + testing. + + All layer function in :code:`fluid.layers` will append operators and + variables to the :code:`default_main_program`. + + The :code:`default_main_program` is the default program in a lot of APIs. + For example, the :code:`Executor.run()` will execute the + :code:`default_main_program` when the program is not specified. Returns: Program: main program @@ -1384,20 +1887,34 @@ def switch_startup_program(program): @contextlib.contextmanager def program_guard(main_program, startup_program=None): """ - Switch program with `with` statement + Change the global main program and startup program with `with` statement. + Layer functions in the Python `with` block will append operators and + variables to the new main programs. 
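A small sketch tying `list_vars()` to the `Parameter` class described above; `fluid.io.is_parameter` is documented later in this same change, and the layers are assumed only to create something to iterate over.

.. code-block:: python

    import paddle.fluid as fluid

    prog = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(prog, startup):
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
        y = fluid.layers.fc(input=x, size=1)

    # Parameters (the fc weight and bias) are persistable Variables.
    for var in prog.list_vars():
        if fluid.io.is_parameter(var):
            print(var.name, var.shape)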
+ + Examples: + + >>> import paddle.fluid as fluid + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program, startup_program): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Notes: The temporary :code:`Program` can be used if the user does not need + to construct either of startup program or main program. Examples: - >>> with program_guard(Program()): - >>> data = fluid.layers.data(...) - >>> hidden = fluid.layers.fc(...) + + >>> import paddle.fluid as fluid + >>> main_program = fluid.Program() + >>> # does not care about startup program. Just pass a temporary value. + >>> with fluid.program_guard(main_program, fluid.Program()): + >>> data = ... Args: - main_program(Program): New main program inside `with` statement + main_program(Program): New main program inside `with` statement. startup_program(Program): New startup program inside `with` statement. None means do not change startup program. - - Returns: - None """ if not isinstance(main_program, Program): raise TypeError("main_program should be Program") @@ -1414,11 +1931,12 @@ def program_guard(main_program, startup_program=None): def get_var(name, program=None): """ - Get a variable by name from the global block of a program + Get a variable by name from the global block of a program. + Args: name(str): name of the variable program(Program|None): program object. - If None, default_global_program() will be used. + If None, default_global_program() will be used. Returns: Variable diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index 6baac00905713594acd59bb3819038576fab0674..a81e39695b78f235d6ae896d90117dd392692634 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -27,13 +27,30 @@ __all__ = ['Inferencer', ] class Inferencer(object): + """ + Inferencer High Level API. + + Args: + infer_func (Python func): Infer function that will return predict Variable + param_path (str): The path where the inference model is saved by fluid.io.save_params + place (Place): place to do the inference + parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU. + + Examples: + .. code-block:: python + + def inference_program(): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + return y_predict + + place = fluid.CPUPlace() + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path="/tmp/model", place=place) + + """ + def __init__(self, infer_func, param_path, place=None, parallel=False): - """ - :param infer_func: a function that will return predict Variable - :param param_path: the path where the inference model is saved by fluid.io.save_params - :param place: place to do the inference - :param parallel: use parallel_executor to run the inference, it will use multi CPU/GPU. 
- """ self.param_path = param_path self.scope = core.Scope() self.parallel = parallel @@ -60,9 +77,20 @@ class Inferencer(object): def infer(self, inputs, return_numpy=True): """ - :param inputs: a map of {"input_name": input_var} that will be feed into the inference program - to get the predict value - :return: the predict value of the inference model + Do Inference for Inputs + + Args: + inputs (map): a map of {"input_name": input_var} that will be feed into the inference program + return_numpy (bool): transform return value into numpy or not + + Returns: + Tensor or Numpy: the predict value of the inference model for the inputs + + Examples: + .. code-block:: python + + tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") + results = inferencer.infer({'x': tensor_x}) """ if not isinstance(inputs, dict): raise ValueError( diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 4e132ed26183eaa5e572128e679cdbffd42e5a42..373e9c060de1ee27c165ccd2380cd8c38612c4d9 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -15,28 +15,43 @@ import framework import numpy as np import contextlib +from framework import convert_np_dtype_to_dtype_ +from core import VarDesc __all__ = [ - 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', - 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer', - 'NormalInitializer', 'XavierInitializer' + 'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA', + 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer', + 'UniformInitializer', 'NormalInitializer', 'XavierInitializer', + 'BilinearInitializer', 'MSRAInitializer' ] _force_init_on_cpu_ = False def force_init_on_cpu(): + """ + The flag of whether force to init variables on CPU. + + Examples: + .. code-block:: python + + if force_init_on_cpu(): + pass + + """ return _force_init_on_cpu_ @contextlib.contextmanager def init_on_cpu(): """ - Switch program with `with` statement + Force the variable to be inited on CPU. Examples: - >>> with init_on_cpu(): - >>> step = layers.create_global_var() + .. code-block:: python + + with init_on_cpu(): + step = layers.create_global_var() """ global _force_init_on_cpu_ @@ -102,14 +117,18 @@ class Initializer(object): class ConstantInitializer(Initializer): """Implements the constant initializer + + Args: + value (float): constant value to initialize the variable + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Constant(value=2.0)) """ def __init__(self, value=0.0, force_cpu=False): - """Constructor for ConstantInitializer - - Args: - value: constant value to initialize the variable - """ assert value is not None super(ConstantInitializer, self).__init__() self._value = value @@ -144,16 +163,20 @@ class ConstantInitializer(Initializer): class UniformInitializer(Initializer): """Implements the random uniform distribution initializer + + Args: + low (float): lower boundary of the uniform distribution + high (float): upper boundary of the uniform distribution + seed (int): random seed + + Examples: + .. 
code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) """ def __init__(self, low=-1.0, high=1.0, seed=0): - """Constructor for UniformInitializer - - Args: - low: lower boundary of the uniform distribution - high: upper boundary of the uniform distribution - seed: random seed - """ assert low is not None assert high is not None assert high >= low @@ -194,17 +217,21 @@ class UniformInitializer(Initializer): class NormalInitializer(Initializer): - """Implements the random Normal(Gaussian) distribution initializer + """Implements the Random Normal(Gaussian) distribution initializer + + Args: + loc (float): mean of the normal distribution + scale (float): standard deviation of the normal distribution + seed (int): random seed + + Examples: + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) """ def __init__(self, loc=0.0, scale=1.0, seed=0): - """Constructor for NormalInitializer - - Args: - loc: mean of the normal distribution - scale: standard deviation of the normal distribution - seed: random seed - """ assert loc is not None assert scale is not None assert seed is not None @@ -244,39 +271,49 @@ class NormalInitializer(Initializer): class XavierInitializer(Initializer): - """Implements the Xavier initializer - + """ This class implements the Xavier weight initializer from the paper - Understanding the difficulty of training deep feedforward neural - networks[1] by Xavier Glorot and Yoshua Bengio. + `Understanding the difficulty of training deep feedforward neural + networks `_ + by Xavier Glorot and Yoshua Bengio. This initializer is designed to keep the scale of the gradients approximately same in all the layers. In case of Uniform distribution, - the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)). + the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + In case of Normal distribution, the mean is 0 and the standard deviation - is sqrt(2/ (fan_in + fan_out)). + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + + + Args: + uniform (bool): whether to use uniform or normal distribution + fan_in (float): fan_in for Xavier initialization. If None, it is + inferred from the variable. + fan_out (float): fan_out for Xavier initialization. If None, it is + inferred from the variable. + seed (int): random seed + + Note: + It is recommended to set fan_in and fan_out to None for most cases. + + Examples: + .. code-block:: python + + fc = fluid.layers.fc( + input=queries, size=10, + param_attr=fluid.initializer.Xavier(uniform=False)) - References: - [1] Understanding the difficulty of training deep feedforward neural - networks. International conference on artificial intelligence and - statistics. - (http://proceedings.mlr.press/v9/glorot10a.html) """ def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0): - """Constructor for XavierInitializer - - Args: - uniform: whether to use uniform or normal distribution - fan_in: fan_in for Xavier initialization. If None, it is - inferred from the variable. - fan_out: fan_out for Xavier initialization. If None, it is - inferred from the variable. - seed: random seed - - Note: It is recommended to set fan_in and fan_out to None for - most cases. - """ assert uniform is not None assert seed is not None super(XavierInitializer, self).__init__() @@ -340,30 +377,42 @@ class MSRAInitializer(Initializer): """Implements the MSRA initializer a.k.a. 
Kaiming Initializer This class implements the weight initialization from the paper - Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren - and Jian Sun. This is a robust initialization method that particularly - considers the rectifier nonlinearities. In case of Uniform distribution, - the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal - distribution, the mean is 0 and the standard deviation - is sqrt(2/ fan_in). - - References: - [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance - on ImageNet Classification - (https://arxiv.org/abs/1502.01852) + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a + robust initialization method that particularly considers the rectifier + nonlinearities. In case of Uniform distribution, the range is [-x, x], where + + .. math:: + + x = \sqrt{\\frac{6.0}{fan\_in}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in}} + + Args: + uniform (bool): whether to use uniform or normal distribution + fan_in (float): fan_in for MSRAInitializer. If None, it is\ + inferred from the variable. + seed (int): random seed + + Note: + It is recommended to set fan_in to None for most cases. + + Examples: + .. code-block:: python + + fc = fluid.layers.fc( + input=queries, size=10, + param_attr=fluid.initializer.MSRA(uniform=False)) """ def __init__(self, uniform=True, fan_in=None, seed=0): """Constructor for MSRAInitializer - - Args: - uniform: whether to use uniform or normal distribution - fan_in: fan_in for MSRAInitializer. If None, it is - inferred from the variable. - seed: random seed - - Note: It is recommended to set fan_in to None for most cases. """ assert uniform is not None assert seed is not None @@ -422,6 +471,104 @@ class MSRAInitializer(Initializer): return op +class BilinearInitializer(Initializer): + """ + This initializer can be used in transposed convolution operator to + act as upsampling. Users can upsample a feature map with shape of + (B, C, H, W) by any integer factor. The usage is: + + Examples: + + .. code-block:: python + + factor = 2 + w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.), + initializer=Bilinear()) + conv_up = fluid.layers.conv2d_transpose( + input, + num_filters=C, + output_size=None, + filter_size=2 * factor - factor % 2, + padding=ceil((factor - 1) / 2.), + stride=factor, + groups=C, + param_attr=w_attr, + bias_attr=False) + + Where, `num_filters=C` and `groups=C` means this is channel-wise transposed + convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`, + This initializer will set a (K, K) interpolation kernel for every channel + of the filter identically. The resulting shape of the output feature map + will be (B, C, factor * H, factor * W). Note that the learning rate and the + weight decay are set to 0 in order to keep coefficient values of bilinear + interpolation unchanged during training. + + """ + + def __init__(self): + """Constructor for BilinearInitializer. + """ + super(BilinearInitializer, self).__init__() + + def __call__(self, var, block): + """Add biliear initialization ops for a variable + + Args: + var (Variable): Variable that needs to be initialized. + block (Block): The block in which initialization ops should + be added. 
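To make the two closed-form bounds above concrete, here is a tiny numeric check; the fan-in and fan-out values are made up for illustration.

.. code-block:: python

    import math

    fan_in, fan_out = 200, 100

    # Xavier, uniform case: weights are drawn from Uniform(-x, x)
    xavier_x = math.sqrt(6.0 / (fan_in + fan_out))   # ~0.1414

    # MSRA, uniform case: weights are drawn from Uniform(-x, x)
    msra_x = math.sqrt(6.0 / fan_in)                 # ~0.1732

    print(xavier_x, msra_x)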
+ + Returns: + Operator: the initialization op + + Raises: + ValueError: If type of `var` and `block` is not right. + If the shape of `var` size is not 4 and + var.shape[2] != var.shape[3]. + """ + if not isinstance(var, framework.Variable): + raise ValueError("var must be framework.Variable.") + + if not isinstance(block, framework.Block): + raise ValueError("block must be framework.Block.") + + shape = var.shape + if len(shape) != 4: + raise ValueError("the length of shape must be 4.") + if shape[2] != shape[3]: + raise ValueError("shape[2] must be equal to shape[3].") + + weight = np.zeros(np.prod(var.shape), dtype='float32') + size = shape[3] + # factor + f = np.ceil(size / 2.) + # center + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(np.prod(shape)): + x = i % size + y = (i / size) % size + weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) + weight = np.reshape(weight, shape) + + if var.dtype == VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in weight.flat] + else: + raise ValueError("Unsupported dtype %s", input.dtype) + if np.prod(shape) > 1024 * 1024: + raise ValueError("The size of input is too big. ") + op = block.append_op( + type='assign_value', + outputs={'Out': [var]}, + attrs={ + 'dtype': var.dtype, + 'shape': list(shape), + value_name: values + }) + var.op = op + return op + + # We short the class name, since users will use the initializer with the package # name. The sample code: # @@ -436,3 +583,4 @@ Uniform = UniformInitializer Normal = NormalInitializer Xavier = XavierInitializer MSRA = MSRAInitializer +Bilinear = BilinearInitializer diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 6323c9899e0080b436a52f852c647466b8f94bc1..5c8f4f6507c7dd9b3d005639d962ce1e55b2c704 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -13,11 +13,12 @@ # limitations under the License. import os +import errno import time import shutil from paddle.fluid.evaluator import Evaluator -from paddle.fluid.framework import Program, Parameter, default_main_program, Variable +from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable from . import core __all__ = [ @@ -25,25 +26,48 @@ __all__ = [ 'load_persistables', 'save_inference_model', 'load_inference_model', 'get_inference_program', 'save_checkpoint', 'load_checkpoint', 'clean_checkpoint', 'load_persist_vars_without_grad', - 'save_persist_vars_without_grad', 'get_latest_checkpoint_serial' + 'load_lookup_table_vars', 'save_persist_vars_without_grad', + 'get_latest_checkpoint_serial' ] def is_parameter(var): - """Check whether the variable is a Parameter. - - This function checks whether the input variable is a Parameter. + """ + Check whether the given variable is an instance of Parameter. Args: - var : The input variable. + var(Variable): The variable to be checked. Returns: - boolean result whether the variable is a Parameter. + bool: True if the given `var` is an instance of Parameter, + False if not. + + Examples: + .. code-block:: python + + param = fluid.default_main_program().global_block().var('fc.w') + res = fluid.io.is_parameter(param) """ return isinstance(var, Parameter) def is_persistable(var): + """ + Check whether the given variable is persistable. + + Args: + var(Variable): The variable to be checked. + + Returns: + bool: True if the given `var` is persistable + False if not. + + Examples: + .. 
code-block:: python + + param = fluid.default_main_program().global_block().var('fc.w') + res = fluid.io.is_persistable(param) + """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST: return False @@ -68,20 +92,69 @@ def save_vars(executor, predicate=None, filename=None): """ - Save variables to directory by executor. + Save variables to the given directory by executor. + + There are two ways to specify variables to be saved: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be saved. The first way has a higher priority. In other words, if `vars` + are assigned, the `main_program` and the `predicate` will be ignored. + + The `dirname` are used to specify the folder where to save variables. + If you prefer to save variables in separate files in the folder `dirname`, + set `filename` None; if you prefer to save all variables in a single file, + use `filename` to specify it. + + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + main_program(Program|None): The program whose variables will be saved. + If it is None, the default main program will + be used automatically. + Default: None + vars(list[Variable]|None): The list that contains all variables to save. + It has a higher priority than the `main_program`. + Default: None + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be saved. It only works when we are using the + `main_program` to specify variables (In other words + `vars` is None). + Default: None + filename(str|None): The file which to save all variables. If you prefer to save + variables separately, set it to None. + Default: None + + Returns: + None + + Raises: + TypeError: If `main_program` is not an instance of Program nor None. + + Examples: + .. code-block:: python - :param executor: executor that save variable - :param dirname: directory path - :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default default_main_program. - :param predicate: The Predicate describes a callable that returns a variable - as a bool. If it returns true, the corresponding input variable will be saved. - :param vars: variables need to be saved. If vars is specified, program & predicate - will be ignored - :param filename: The name of a single file that all vars are saved to. - If it is None, save variables to separate files. + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" - :return: None + # The first usage: using `main_program` to specify variables + def name_has_fc(var): + res = "fc" in var.name + return res + + prog = fluid.default_main_program() + fluid.io.save_vars(executor=exe, dirname=path, main_program=prog, + vars=None) + # All variables in `main_program` whose name includes "fc" will be saved. + # And variables are going to be saved separately. + + + # The second usage: using `vars` to specify variables + var_list = [var_a, var_b, var_c] + fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, + filename="vars_file") + # var_a, var_b and var_c will be saved. And they are going to be + # saved in the same file named 'var_file' in the path "./my_paddle_model". 
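The `predicate` form of `save_vars` pairs naturally with the `is_persistable` helper above; this is essentially what `save_persistables()` does internally. A hedged sketch, assuming a network has already been built into the default main program and the startup program has been run:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # Keep every variable for which the predicate returns True,
    # saved as one file per variable under ./my_paddle_model.
    fluid.io.save_vars(executor=exe,
                       dirname="./my_paddle_model",
                       main_program=fluid.default_main_program(),
                       predicate=fluid.io.is_persistable)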
""" if vars is None: if main_program is None: @@ -129,7 +202,42 @@ def save_vars(executor, def save_params(executor, dirname, main_program=None, filename=None): """ - Save all parameters to directory with executor. + This function filters out all parameters from the give `main_program` + and then save them to the folder `dirname` or the file `filename`. + + Use the `dirname` to specify the saving folder. If you would like to + save parameters in separate files, set `filename` None; if you would + like to save all parameters in a single file, use `filename` to specify + the file name. + + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` + and `load_persistables()` instead. + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The saving directory path. + main_program(Program|None): The program whose parameters will be + saved. If it is None, the default + main program will be used automatically. + Default: None + filename(str|None): The file to save all parameters. If you prefer + to save parameters in differnet files, set it + to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_params(executor=exe, dirname=param_path, + main_program=None) """ save_vars( executor, @@ -142,7 +250,37 @@ def save_params(executor, dirname, main_program=None, filename=None): def save_persistables(executor, dirname, main_program=None, filename=None): """ - Save all persistables to directory with executor. + This function filters out all variables with `persistable==True` from the + give `main_program` and then saves these variables to the folder `dirname` + or file `filename`. + + The `dirname` is used to specify the folder where persistable variables + are going to be saved. If you would like to save variables in separate + files, set `filename` None; if you would like to save all variables in a + single file, use `filename` to specify the file name. + + Args: + executor(Executor): The executor to run for saving persistable variables. + dirname(str): The directory path. + main_program(Program|None): The program whose persistbale variables will + be saved. If it is None, the default main + program will be used automatically. + Default: None + filename(str|None): The file to saved all variables. If you prefer to + save variables in differnet files, set it to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_persistables(executor=exe, dirname=param_path, + main_program=None) """ save_vars( executor, @@ -160,20 +298,69 @@ def load_vars(executor, predicate=None, filename=None): """ - Load variables from directory by executor. + Load variables from the given directory by executor. + + There are two ways to specify variables to be loaded: The first way, list + variables in a list and assign it to the `vars`. The second way, assign the + `main_program` with an existing program, then all variables in the program + will be loaded. The first way has a higher priority. In other words if `vars` + are assigned, the `main_program` and the `predicate` will be ignored. 
- :param executor: executor that load variable - :param dirname: directory path - :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default default_main_program(). - :param predicate: The Predicate describes a callable that returns a variable - as a bool. If it returns true, the corresponding input variable will be loaded. - :param vars: variables need to be loaded. If vars is specified, program & - predicate will be ignored - :param filename: The name of the single file that all vars are loaded from. - If it is None, load variables from separate files. + The `dirname` are used to specify the folder where to load variables. + If variables were saved in separate files in the folder `dirname`, + set `filename` None; if all variables were saved in a single file, + use `filename` to specify it. - :return: None + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + main_program(Program|None): The program whose variables will be loaded. + If it is None, the default main program will + be used automatically. + Default: None + vars(list[Variable]|None): The list that contains all variables to load. + It has a higher priority than the `main_program`. + Default: None + predicate(function|None): If it is not None, only variables in the + `main_program` that makes predicate(variable)==True + will be loaded. It only works when we are using the + `main_program` to specify variables (In other words + `vars` is None). + Default: None + filename(str|None): The file which saved all required variables. If variables + were saved in differnet files, set it to None. + Default: None + + Returns: + None + + Raises: + TypeError: If `main_program` is not an instance of Program nor None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + + # The first usage: using `main_program` to specify variables + def name_has_fc(var): + res = "fc" in var.name + return res + + prog = fluid.default_main_program() + fluid.io.load_vars(executor=exe, dirname=path, main_program=prog, + vars=None) + # All variables in `main_program` whose name includes "fc" will be loaded. + # And all the variables are supposed to have been saved in differnet files. + + + # The second usage: using `vars` to specify variables + var_list = [var_a, var_b, var_c] + fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, + filename="vars_file") + # var_a, var_b and var_c will be loaded. And they are supposed to haven + # been saved in the same file named 'var_file' in the path "./my_paddle_model". """ if vars is None: if main_program is None: @@ -221,7 +408,42 @@ def load_vars(executor, def load_params(executor, dirname, main_program=None, filename=None): """ - load all parameters from directory by executor. + This function filters out all parameters from the give `main_program` + and then trys to load these parameters from the folder `dirname` or + the file `filename`. + + Use the `dirname` to specify the folder where parameters were saved. If + parameters were saved in separate files in the folder `dirname`, set + `filename` None; if all parameters were saved in a single file, use + `filename` to specify the file name. + + NOTICE: Some variables are not Parameter while they are necessary for + training. So you can NOT save and continue your training just by + `save_params()` and `load_params()`. Please use `save_persistables()` + and `load_persistables()` instead. 
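Following the notice above, a hedged round-trip sketch with `save_persistables()` and `load_persistables()`; the network and directory are made up for illustration.

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.fc(input=x, size=1)

    exe = fluid.Executor(fluid.CPUPlace())
    prog = fluid.default_main_program()
    exe.run(fluid.default_startup_program())

    param_path = "./my_paddle_model"
    fluid.io.save_persistables(executor=exe, dirname=param_path, main_program=prog)

    # Later, e.g. to resume training in a fresh process, restore everything saved above.
    fluid.io.load_persistables(executor=exe, dirname=param_path, main_program=prog)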
+ + Args: + executor(Executor): The executor to run for loading parameters. + dirname(str): The directory path. + main_program(Program|None): The program whose parameters will be + loaded. If it is None, the default + main program will be used automatically. + Default: None + filename(str|None): The file which saved all parameters. If parameters + were saved in differnet files, set it to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_params(executor=exe, dirname=param_path, + main_program=None) """ load_vars( executor, @@ -233,7 +455,37 @@ def load_params(executor, dirname, main_program=None, filename=None): def load_persistables(executor, dirname, main_program=None, filename=None): """ - load all persistables from directory by executor. + This function filters out all variables with `persistable==True` from the + give `main_program` and then trys to load these variables from the folder + `dirname` or the file `filename`. + + Use the `dirname` to specify the folder where persistable variables were + saved. If variables were saved in separate files, set `filename` None; + if all variables were saved in a single file, use `filename` to specify + the file name. + + Args: + executor(Executor): The executor to run for loading persistable variables. + dirname(str): The directory path. + main_program(Program|None): The program whose persistbale variables will + be loaded. If it is None, the default main + program will be used automatically. + Default: None + filename(str|None): The file which saved all variables. If variables were + saved in differnet files, set it to None. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_persistables(executor=exe, dirname=param_path, + main_program=None) """ load_vars( executor, @@ -306,22 +558,48 @@ def save_inference_model(dirname, model_filename=None, params_filename=None): """ - Build a model especially for inference, - and save it to directory by the executor. + Prune the given `main_program` to build a new program especially for inference, + and then save it and all related parameters to given `dirname` by the `executor`. + + Args: + dirname(str): The directory path to save the inference model. + feeded_var_names(list[str]): Names of variables that need to be feeded data + during inference. + target_vars(list[Variable]): Variables from which we can get inference + results. + executor(Executor): The executor that saves the inference model. + main_program(Program|None): The original program, which will be pruned to + build the inference model. If is setted None, + the default main program will be used. + Default: None. + model_filename(str|None): The name of file to save the inference program + itself. If is setted None, a default filename + `__model__` will be used. + params_filename(str|None): The name of file to save all related parameters. + If it is setted None, parameters will be saved + in separate files . + + Returns: + None - :param dirname: directory path - :param feeded_var_names: Names of variables that need to be feeded data during inference - :param target_vars: Variables from which we can get inference results. 
- :param executor: executor that save inference model - :param main_program: original program, which will be pruned to build the inference model. - Default default_main_program(). - :param model_filename: The name of file to save inference program. - If not specified, default filename `__model__` will be used. - :param params_filename: The name of file to save parameters. - It is used for the case that all parameters are saved in a single binary file. - If not specified, parameters are considered saved in separate files. + Raises: + ValueError: If `feed_var_names` is not a list of basestring. + ValueError: If `target_vars` is not a list of Variable. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./infer_model" + fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], + target_vars=[predict_var], executor=exe) + + # In this exsample, the function will prune the default main program + # to make it suitable for infering the `predict_var`. The pruned + # inference program is going to be saved in the "./infer_model/__model__" + # and parameters are going to be saved in separate files under folder + # "./infer_model". - :return: None """ if isinstance(feeded_var_names, basestring): feeded_var_names = [feeded_var_names] @@ -382,18 +660,49 @@ def load_inference_model(dirname, """ Load inference model from a directory - :param dirname: directory path - :param executor: executor that load inference model - :param model_filename: The name of file to load inference program. - If not specified, default filename `__model__` will be used. - :param params_filename: The name of file to load parameters. - It is used for the case that all parameters are saved in a single binary file. - If not specified, parameters are considered saved in separate files. + Args: + dirname(str): The directory path + executor(Executor): The executor to run for loading inference model. + model_filename(str|None): The name of file to load inference program. + If it is None, the default filename + '__model__' will be used. + Default: None + params_filename(str|None): The name of file to load all parameters. + It is only used for the case that all + parameters were saved in a single binary + file. If parameters were saved in separate + files, set it as 'None'. + + Returns: + tuple: The return of this function is a tuple with three elements: + (program, feed_target_names, fetch_targets). The `program` is a + Program, it's the program for inference. The `feed_target_names` is + a list of str, it contains Names of variables that need to feed + data in the inference program. The `fetch_targets` is a list of + Variable. It contains variables from which we can get inference + results. + + Raises: + ValueError: If `dirname` is not a existing directory. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./infer_model" + [inference_program, feed_target_names, fetch_targets] = + fluid.io.load_inference_model(dirname=path, executor=exe) + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + + # In this exsample, the inference program was saved in the + # "./infer_model/__model__" and parameters were saved in + # separate files in ""./infer_model". + # After getting inference program, feed target names and + # fetch targets, we can use an Executor to run the inference + # program to get the inference result. 
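Combining the save and load halves described above into one hedged end-to-end sketch; the network, directory, and shapes are made up for illustration.

.. code-block:: python

    import numpy
    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())

    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
        predict = fluid.layers.fc(input=img, size=10, act='softmax')
    exe.run(startup)

    path = "./infer_model"
    fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
                                  target_vars=[predict], executor=exe,
                                  main_program=main)

    # Reload the pruned inference program and run it on a random input.
    [infer_prog, feed_names, fetch_targets] = fluid.io.load_inference_model(
        dirname=path, executor=exe)
    tensor_img = numpy.random.uniform(0, 1, [1, 784]).astype('float32')
    results = exe.run(infer_prog,
                      feed={feed_names[0]: tensor_img},
                      fetch_list=fetch_targets)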
- :return: [program, feed_target_names, fetch_targets] - program: program especially for inference. - feed_target_names: Names of variables that need to feed data - fetch_targets: Variables from which we can get inference results. """ if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) @@ -424,12 +733,25 @@ def load_inference_model(dirname, def get_parameter_value(para, executor): """ - Get the LoDTensor for the parameter + Get the LoDTensor value of the given parameter. + + Args: + para(Parameter): The parameter to get value from. + executor(Executor): The executor to run for retrieving the value. - :param executor: executor for retrieving the value - :param para: the given parameter + Returns: + numpy.array: The given parameter's values. + + Raises: + AssertionError: If the `para` is not an instance of Parameter. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param = fluid.default_main_program().global_block().var('fc.w') + p = fluid.io.get_parameter_value(param, exe) - :return: the LoDTensor for the parameter """ assert is_parameter(para) @@ -441,14 +763,30 @@ def get_parameter_value(para, executor): def get_parameter_value_by_name(name, executor, program=None): """ - Get the LoDTensor for paramter with the given name + Get the LoDTensor value of a certain parameter by its name. + + Args: + name(str): The parameter's name. + executor(Executor): The executor to run for retrieving the value. + program(Program | None): The program where to find the parameter. + If it's set to be None, the function will + try to find the parameter in the default + main program. - :param executor: executor for retrieving the value - :param name: the name of the parameter - :param program: the program where the variable is found - Default default_main_program(). + Returns: + numpy.array: The parameter's values. + + Raises: + TypeError: If given `name` is not an instance of basestring. + TypeError: If the parameter with the given name doesn't exist. + AssertionError: If there is a varibale named `name` in the + given program but it is not a Parameter. + + Examples: + .. code-block:: python - :return: the LoDTensor for the variable + exe = fluid.Executor(fluid.CPUPlace()) + p = fluid.io.get_parameter_value('fc.w', exe) """ if program is None: program = default_main_program() @@ -459,6 +797,7 @@ def get_parameter_value_by_name(name, executor, program=None): SUCCESS_MARK_FILENAME = "_SUCCESS" CHECKPOINT_PREFIX = "checkpoint" MODEL_DIR = "__model__" +LOOKUP_TABLE_DIR = "__lookup_table__" TRAINER_PREFIX = "trainer" CHECKPOINT_SEPARATOR = "_" @@ -468,48 +807,145 @@ def save_checkpoint(executor, trainer_id, trainer_args=None, main_program=None, - max_num_checkpoints=3): + max_num_checkpoints=3, + lookup_table=None, + ps_endpoint_list=None): """ - Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory, - the directory named by serial number from 0 to (n -1), save_checkpoint use LRU strategy - to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most, - The interval between two saved checkpoints must greater than save_interval_secs. + This function filters out all checkpoint variables from the give + main_program and then saves these variables to the `checkpoint_dir` + directory. + + In the training precess, we generally save a checkpoint in each + iteration. So there might be a lot of checkpoints in the + `checkpoint_dir`. 
To avoid them taking too much disk space, the + `max_num_checkpoints` are introduced to limit the total number of + checkpoints. If the number of existing checkpints is greater than + the `max_num_checkpoints`, oldest ones will be scroll deleted. + + A variable is a checkpoint variable and will be saved if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for save checkpoint. + checkpoint_dir(str): The folder where to save checkpoints. + trainer_id(int): currect trainer id, if id is equal to 0, the trainer + is chief. + trainer_args(dict|None): Current training arguments. Such as 'epoch_id' + and 'step_id'. + Defaut: None + main_program(Program|None): The program whose checkpoint variables will + be saved. If it is None, the default main program will be used. + max_num_checkpoints(int): The max number of total number of existing + checkpoints. + Default: 3 + lookup_table(string|None): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list|None): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. - :param executor executor for save the value - :param checkpoint_dir the checkpoint directory - :param trainer_id currect trainer id, if id is equal to 0, the trainer is chief - :param main_program will save all variables in program - :param max_num_checkpoints will keep numbers of checkpoint serials not bigger than max_num_checkpoints + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + AssertionError: If `trainer_args` is not a dict. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + trainer_args = {"epoch_id": 200, + "step_id": 20} # just an example + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + fluid.io.save_checkpoint(executor=exe, + checkpoint_dir=path, + trainer_id=0, + trainer_args=trainer_args, + main_program=prog, + max_num_checkpoints=3, + lookup_table=table_name, + ps_endpoint_list = ps_endpoints) """ if checkpoint_dir is None: raise ValueError("'checkpoint_dir' should not be None") + assert checkpoint_dir if trainer_args: assert isinstance(trainer_args, dict) - if not os.path.isdir(checkpoint_dir): - os.makedirs(checkpoint_dir) + is_chief = trainer_id == 0 + _make_chekcpoint_dirs(checkpoint_dir) serial = get_latest_checkpoint_serial(checkpoint_dir) + 1 cur_dir = _get_serial_dir(checkpoint_dir, serial) save_trainer_args(cur_dir, trainer_id, trainer_args) - if trainer_id == 0: + if is_chief: save_persist_vars_without_grad(executor, cur_dir, main_program) + if is_chief and lookup_table and ps_endpoint_list: + save_pserver_vars_by_notify(executor, cur_dir, lookup_table, + ps_endpoint_list) + _scroll_delete(checkpoint_dir, max_num_checkpoints) def load_checkpoint(executor, checkpoint_dir, serial, main_program): """ - Load checkpoint from a directory by executor, - it will find the most recent saved checkpoint file and load it auto. + This function filters out all checkpoint variables from the give + main_program and then try to load these variables from the + `checkpoint_dir` directory. + + In the training precess, we generally save a checkpoint in each + iteration. 
So there are more than one checkpoint in the + `checkpoint_dir` (each checkpoint has its own sub folder), use + `serial` to specify which serial of checkpoint you would like to + load. + + A variable is a checkpoint variable and will be loaded if it meets + all following conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading checkpoint. + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + main_program(Program): The program whose checkpoint variables will + be loaded. - :param executor executor for load the value - :param checkpoint_dir the checkpoint directory - :param serial the serial folder in checkpoint directory will be load - :param main_program will load all variables in program + Returns: + None + + Raises: + ValueError: If `checkpoint_dir` is None. + ValueError: If `serial` is None or `serial` is less than 0. + ValueError: If `main_program` is None. + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + path = "./checkpoints" + prog = fluid.default_main_program() + fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path, + serial=9, main_program=prog) + + # In this example, `load_checkpoint` function + # will first filters out all checkpoint variables in the default + # main program, and then try to load these variables form the + # folder "./checkpoints/checkpoint_9/__model__". """ if checkpoint_dir is None: @@ -527,11 +963,12 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program): def clean_checkpoint(checkpoint_dir, delete_dir=False): """ - clean the checkpoint dir, when the train exits normally, the trainer will call clean_checkpoint to delete checkpoint directory saved before. + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. delete_dir only works when the directory is empty, otherwise, OSError is raised. - :param checkpoint_dir - :param delete_dir + : param checkpoint_dir + : param delete_dir """ if checkpoint_dir is None: @@ -547,13 +984,40 @@ def load_persist_vars_without_grad(executor, program, has_model_dir=False): """ - load_persist_vars_without_grad will load variables from a directory by an executor, - the variable named end with "@GRAD" will not be loaded. + This function filters out all checkpoint variables from the give + program and then trys to load these variables from the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be loaded. + has_model_dir(bool): if True, the function loads variables + from a sub directory named '__model__'. + Default: False - :param executor executor for load the value - :param dirname the checkpoint directory - :param program will load all variables in program - :param has_model_dir if has_model_dir is True, will load variables from sub directory named __model__ + Returns: + None + + Examples: + .. 
code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.load_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog, has_model_dir=True) + + # In this example, `load_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then trys to load these variables form the + # folder "./my_paddle_model/__model__". """ if has_model_dir: @@ -567,14 +1031,90 @@ def load_persist_vars_without_grad(executor, filename=None) +def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): + """ + The parameter server will load lookup table's local file in + selectedrows variable. + + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + dirname = "./checkpoints/checkpoint_9/__model__" + prog = fluid.default_main_program() + pserver_id = 1 + table_name = "share_w" + fluid.io.load_lookup_table_vars(executor=exe, + dirname=dirname, program=prog, pserver_id=pserver_id, + table_name=table_name) + """ + + for var in program.list_vars(): + if var.name == table_name: + lookup_table_var = var + break + + assert lookup_table_var is not None + + lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) + + load_prog = Program() + load_block = load_prog.global_block() + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [lookup_table_var]}, + attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) + + executor.run(load_prog) + + def save_persist_vars_without_grad(executor, dirname, program): """ - save_persist_vars_without_grad will save variables to a directory by an executor, - the variable named end with "@GRAD" will not be saved. + This function filters out all checkpoint variables from the give + program and then save these variables to a sub-folder '__model__' of + the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - :param executor executor for load the value - :param dirname the checkpoint directory - :param program will load all variables in program + Args: + executor(Executor): The executor to run for saving variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be saved. + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + fluid.io.save_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog) + + # In this example, `save_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then saves these variables to the folder + # "./my_paddle_model/__model__". 
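The save and load halves above are normally paired across a process restart. A minimal sketch, assuming the same `default_main_program()` has been rebuilt before loading:

.. code-block:: python

    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())
    param_path = "./my_paddle_model"
    prog = fluid.default_main_program()

    # First run: persist the checkpoint variables of `prog` under
    # "./my_paddle_model/__model__".
    fluid.io.save_persist_vars_without_grad(executor=exe,
                                            dirname=param_path,
                                            program=prog)

    # Later run, after rebuilding the same program: restore the variables
    # from the '__model__' sub directory written above.
    fluid.io.load_persist_vars_without_grad(executor=exe,
                                            dirname=param_path,
                                            program=prog,
                                            has_model_dir=True)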
""" cur_dir = _get_model_dir(dirname) save_vars( @@ -587,6 +1127,54 @@ def save_persist_vars_without_grad(executor, dirname, program): _write_success(cur_dir) +def save_pserver_vars_by_notify(executor, dirname, lookup_table, + ps_endpoint_list): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save checkpoints. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + fluid.io.save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + ps_endpoint_list=ps_endpoints) + """ + cur_dir = _get_lookuptable_dir(dirname) + + checkpoint_notify_program = Program() + checkpoint_notify_block = checkpoint_notify_program.global_block() + + attrs = {} + attrs['epmap'] = ps_endpoint_list + attrs['dir'] = cur_dir + attrs['lookup_table'] = lookup_table + + checkpoint_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(checkpoint_notify_program) + + def save_trainer_args(dirname, trainer_id, trainer_args): assert isinstance(trainer_args, dict) @@ -600,6 +1188,29 @@ def save_trainer_args(dirname, trainer_id, trainer_args): def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): + """ + trainer will load some args from it's independent directory, + such as epoch_id and step_id. + + Args: + checkpoint_dir(str): The folder where all checkpoints are. + serial(int): The serial of checkpoint you would like to load. + trainer_id(int): current trainer id. + trainer_args(list): list about load trainer args + Return: + None + + Examples: + .. code-block:: python + + param_path = "./checkpoint/" + serial = 7 + trainer_id = 2 + trainer_args = ["epoch_id", "step_id"] + + fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial, + trainer_id=trainer_id, trainer_args=trainer_args) + """ assert isinstance(trainer_args, list) cur_dir = _get_serial_dir(checkpoint_dir, serial) @@ -620,7 +1231,7 @@ def _is_checkpoint_var(var): the checkpoint will not save or load all the variables. var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - :param var + : param var(Variable) """ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ @@ -640,6 +1251,23 @@ def _is_checkpoint_var(var): return var.persistable +def _make_chekcpoint_dirs(dirs): + """ + _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
+ """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + def _get_dir_serial(dirname): _, serial = dirname.split(CHECKPOINT_SEPARATOR) @@ -653,29 +1281,27 @@ def _get_dir_serial(dirname): def _get_serial_dir(dirname, serial): serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) serial_dir = os.path.join(dirname, serial_folder) - - if not os.path.isdir(serial_dir): - os.makedirs(serial_dir) + _make_chekcpoint_dirs(serial_dir) return serial_dir def _get_model_dir(dirname): model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir - if not os.path.isdir(model_dir): - os.makedirs(model_dir) - return model_dir +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir def _get_trainer_dir(dirname, trainer_id): trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) trainer_dir = os.path.join(dirname, trainer_folder) - - if not os.path.isdir(trainer_dir): - os.makedirs(trainer_dir) - + _make_chekcpoint_dirs(trainer_dir) return trainer_dir @@ -694,14 +1320,18 @@ def _scroll_delete(dirname, max_num_checkpoints=3): serials = serials[max_num_checkpoints:] for serial in serials: cur_dir = _get_serial_dir(dirname, serial) - shutil.rmtree(cur_dir) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err def _write_success(dirname): """ write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. - :param dirname + : param dirname """ success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) with open(success_file, 'a') as f: @@ -713,7 +1343,7 @@ def get_latest_checkpoint_serial(checkpoint_dir): """ get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory - :param checkpoint_dir + : param checkpoint_dir """ if not checkpoint_dir: return -1 @@ -744,3 +1374,101 @@ def get_latest_checkpoint_serial(checkpoint_dir): if success_num > current_dir: current_dir = success_num return current_dir + + +def get_test_program(filelist, program=None, startup_program=None): + """ + Transpile current train program to a program to read test dataset + if the program is using reader ops like "open_files_op". + """ + + def _copy_reader_var_(block, var, new_name=None): + if new_name == None: + new_name = var.name + new_var = block.create_var( + name=str(new_name), type=core.VarDesc.VarType.READER) + new_var.desc.set_shapes(var.desc.shapes()) + new_var.desc.set_dtypes(var.desc.dtypes()) + new_var.persistable = True + return new_var + + def _get_test_reader_name(train_reader_name): + return train_reader_name + "_test" + + def _is_reader_op(op): + block = op.block + if "Out" in op.output_names: + reader_out = block.vars[op.output("Out")[0]] + if reader_out.type == core.VarDesc.VarType.READER: + return True + return False + + if program == None: + program = default_main_program() + if startup_program == None: + startup_program = default_startup_program() + startup_block = startup_program.global_block() + + # 1. 
find out the orignal reader var name + startup_reader_op_list = [] + + for op in startup_block.ops: + if _is_reader_op(op): + startup_reader_op_list.append(op) + + if len(startup_reader_op_list) == 0: + return program + + root_reader_op = startup_reader_op_list[0] + train_test_reader_map = {} + # 2. add operators to startup to read open and read test data files + for op in startup_reader_op_list: + assert (len(op.output("Out")) == 1) + train_reader_name = op.output("Out")[0] + train_reader = startup_block.vars[train_reader_name] + test_reader = _copy_reader_var_( + startup_block, + train_reader, + new_name=_get_test_reader_name(train_reader_name)) + train_test_reader_map[train_reader.name] = test_reader + + test_op_inputs = {} + for name in op.input_names: + train_arg_names = op.input(name) + test_arg_vars = [] + for arg_name in train_arg_names: + arg_var = train_test_reader_map[ + arg_name] if name == "UnderlyingReader" else startup_block.vars[ + arg_name] + test_arg_vars.append(arg_var) + test_op_inputs[name] = test_arg_vars + + test_op = startup_block.append_op( + type=op.type, + inputs=test_op_inputs, + outputs={'Out': [test_reader]}, + attrs=op.attrs) + # root reader op's filelist attr for read test files + if op.type == root_reader_op.type: + test_op.set_attr("file_names", filelist) + if op.type == "create_multi_pass_reader": + test_op.set_attr("pass_num", 1) + + # 3. rename reader vars in inference program to different name + # to avoid read from train data. + main_block = program.global_block() + for var in main_block.vars.values(): + if var.type == core.VarDesc.VarType.READER: + main_block.rename_var( + str(var.name), str(_get_test_reader_name(var.name))) + + for op in main_block.ops: + if op.type == root_reader_op.type: + test_op.set_attr("file_names", filelist) + if op.type == "create_multi_pass_reader": + test_op.set_attr("pass_num", 1) + + startup_program.sync_with_cpp() + program.sync_with_cpp() + + return program diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index a568f61dcb2da976baa7847ae26281a34d6f88dd..cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -28,8 +28,8 @@ import math_op_patch from math_op_patch import * import detection from detection import * -import metric -from metric import * +import metric_op +from metric_op import * from learning_rate_scheduler import * __all__ = [] @@ -41,5 +41,5 @@ __all__ += control_flow.__all__ __all__ += ops.__all__ __all__ += device.__all__ __all__ += detection.__all__ -__all__ += metric.__all__ +__all__ += metric_op.__all__ __all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 4db085e9f551be09d9a8da998a7a97f68c1787f3..849474dc58461ac3772f439da7bf5d57592daa8c 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -20,13 +20,13 @@ from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu from ops import logical_and, logical_not, logical_or +import numpy __all__ = [ 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'BlockGuardWithCompletion', - 'StaticRNNMemoryLink', 'WhileGuard', 'While', 'Switch', @@ -55,34 +55,36 @@ __all__ = [ def split_lod_tensor(input, mask, level=0): """ - **split_lod_tensor** - This function takes in an input that contains the complete 
lod information, and takes in a mask which is used to mask certain parts of the input. The output is the true branch and the false branch with the mask applied to - the input at a certain level in the tensor. + the input at a certain level in the tensor. Mainly used in IfElse to split + data into two parts. Args: input(tuple|list|None): The input tensor that contains complete lod information needed to construct the output. mask(list): A bool column vector which masks the input. - level(int): The specific lod level to rank. + level(int): The specific lod level to split. Returns: - Variable: The true branch of tensor as per the mask applied to input. - Variable: The false branch of tensor as per the mask applied to input. + tuple(Variable, Variable): + The true branch of tensor as per the mask applied to input. + + The false branch of tensor as per the mask applied to input. Examples: .. code-block:: python - x = layers.data(name='x', shape=[1]) + x = fluid.layers.data(name='x', shape=[1]) x.persistable = True - y = layers.data(name='y', shape=[1]) + y = fluid.layers.data(name='y', shape=[1]) y.persistable = True - out_true, out_false = layers.split_lod_tensor( + out_true, out_false = fluid.layers.split_lod_tensor( input=x, mask=y, level=level) + """ helper = LayerHelper('split_lod_tensor', **locals()) out_true = helper.create_tmp_variable(dtype=input.dtype) @@ -105,8 +107,9 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): This function takes in an input :math:`x`, the True branch, the False branch and a binary :math:`mask`. Using this information, this function - merges the True and False branches of the tensor into a single Output - at a certain lod level indiacted by :math:`level`. + merges the True and False branches of the tensor into a single tensor as + output at a certain lod level indicated by :math:`level`. Used in IfElse + to merge the output if True block and False Block. Args: in_true(tuple|list|None): The True branch to be merged. @@ -114,7 +117,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): x(tuple|list|None): The input tensor that contains complete lod information needed to construct the output. mask(list): A bool column vector which masks the input. - level(int): The specific lod level to rank. + level(int): The specific lod level to merge. Returns: Variable: The merged output tensor. @@ -182,12 +185,14 @@ def Print(input, Returns: Variable: Output tensor, same data with input tensor. + Examples: + .. code-block:: python - value = some_layer(...) - Print(value, summarize=10, - message="The content of some_layer: ") + value = some_layer(...) + Print(value, summarize=10, + message="The content of some_layer: ") ''' helper = LayerHelper('print', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -233,9 +238,56 @@ class BlockGuard(object): class ParallelDo(object): """ - ParallelDo class. + ParallelDo is used to represent multi-thread data parallel processing. + + Its vanilla implementation can be shown as the following (:math:`|` means + single thread and :math:`||||` means multiple threads) + + .. code-block:: text + + In the forward pass + | Split input onto different devices + | Copy parameter onto different devices + |||| Compute forward pass in parallel + | Merge output from different devices - ParallelDo class is used to create a ParallelDo. 
+ In the backward pass + | Split output@grad onto different devices + |||| Compute backward pass in parallel + | accumulate param@grad from different devices to the first device + | Merge input@grad from different devices + | Copy param@grad to the place of parallel_do_op + + Examples: + + .. code-block:: python + + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # ParallelDo version & Single-thread version + if thread_num > 1: + places = fluid.layers.get_places(thread_num) + pd = fluid.layers.ParallelDo(places) + with pd.do(): + images = pd.read_input(images) + label = pd.read_input(label) + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + + avg_cost = fluid.layers.mean(x=cost) + pd.write_output(avg_cost) + + avg_cost = pd() + avg_cost = fluid.layers.mean(avg_cost) + else: + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + .. warning:: + + It will be soon deprecated, please use ParallelExecutor instead. """ def __init__(self, places, use_nccl=False, name=None): @@ -362,16 +414,17 @@ class StaticRNNMemoryLink(object): """ StaticRNNMemoryLink class. - Args: - init: the initial variable for Memory - init: Variable - pre_mem: the memory variable in previous time step - pre_mem: Variable - mem: the memory variable in current time step - mem: Variable - StaticRNNMemoryLink class is used to create a link between two memory cells of a StaticRNN. + + + NOTE: This is a internal data structure of a very low-level API. + Please use StaticRNN instead. + + Args: + init(Variable): the initial variable for Memory. + pre_mem(Variable): the memory variable in previous time step. + mem(Variable): the memory variable in current time step. """ def __init__(self, init, pre_mem, mem=None): @@ -606,6 +659,29 @@ class WhileGuard(BlockGuard): class While(object): + """ + while loop control flow. + + Args: + cond (Variable): condition used to compare. + name (str): The name of this layer. + + Examples: + .. code-block:: python + + d0 = layers.data("d0", shape=[10], dtype='float32') + data_array = layers.array_write(x=d0, i=i) + array_len = layers.fill_constant(shape=[1],dtype='int64', value=3) + + cond = layers.less_than(x=i, y=array_len) + while_op = layers.While(cond=cond) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=d) + layers.less_than(x=i, y=array_len, cond=cond) + """ + BEFORE_WHILE_BLOCK = 0 IN_WHILE_BLOCK = 1 AFTER_WHILE_BLOCK = 2 @@ -675,8 +751,8 @@ def lod_rank_table(x, level=0): .. code-block:: text x is a LoDTensor: - x.lod = [[0, 2, 3], - [0, 5, 6, 7]] + x.lod = [[2, 1], + [5, 1, 1]] x.data = [a, b, c, d, e, f, g] 1. set level to 0: @@ -706,7 +782,7 @@ def lod_rank_table(x, level=0): .. code-block:: python x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) + dtype='float32', lod_level=1) out = layers.lod_rank_table(x=x, level=0) """ helper = LayerHelper("lod_rank_table", **locals()) @@ -748,17 +824,25 @@ def max_sequence_len(rank_table): def lod_tensor_to_array(x, table): - """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY. + """ + Convert a LoDTensor to a LoDTensorArray. + + This function split a LoDTesnor to a LoDTensorArray according to its LoD + information. LoDTensorArray is an alias of C++ std::vector in + PaddlePaddle. 
The generated LoDTensorArray of this function can be further read + or written by `read_from_array()` and `write_to_array()` operators. However, + this function is generally an internal component of PaddlePaddle `DynamicRNN`. + Users should not use it directly. Args: - x (Variable|list): The LOD tensor to be converted to a LOD tensor array. + x (Variable|list): The LoDTensor to be converted to a LoDTensorArray. table (ParamAttr|list): The variable that stores the level of lod which is ordered by sequence length in - descending order. + descending order. It is generally generated + by `layers.lod_rank_table()` API. Returns: - Variable: The variable of type array that has been converted from a - tensor. + Variable: The LoDTensorArray that has been converted from the input tensor. Examples: .. code-block:: python @@ -823,8 +907,7 @@ def increment(x, value=1.0, in_place=True): in_place (bool): If the increment should be performed in-place. Returns: - Variable: The tensor variable storing the transformation of - element-wise increment of each value in the input. + Variable: The elementwise-incremented object. Examples: .. code-block:: python @@ -866,7 +949,7 @@ def array_write(x, i, array=None): Variable: The output LOD_TENSOR_ARRAY where the input tensor is written. Examples: - .. code-block::python + .. code-block:: python tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -887,14 +970,17 @@ def array_write(x, i, array=None): def create_array(dtype): - """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the - LayerHelper. + """ + **Create LoDTensorArray** + + This function creates an array of LOD_TENSOR_ARRAY . It is mainly used to + implement RNN with array_write, array_read and While. Args: - dtype (int|float): The data type of the elements in the array. + dtype (int|float): The data type of the elements in the lod_tensor_array. Returns: - Variable: The tensor variable storing the elements of data type. + Variable: The lod_tensor_array variable storing the elements of data type. Examples: .. code-block:: python @@ -909,37 +995,40 @@ def create_array(dtype): dtype=dtype) -def less_than(x, y, force_cpu=True, cond=None, **ignored): +@templatedoc() +def less_than(x, y, force_cpu=None, cond=None, **ignored): """ - **Less than** + ${comment} - This layer returns the truth value of :math:`x < y` elementwise. + >>> import paddle.fluid as fluid + >>> less = fluid.layers.less_than(x=label, y=limit) Args: - x(Variable): First operand of *less_than* - y(Variable): Second operand of *less_than* - force_cpu(Bool|True): The output data will be on CPU if set true. + x(${x_type}): ${x_comment}. + y(${y_type}): ${y_comment}. + force_cpu(${force_cpu_type}): ${force_cpu_comment}. cond(Variable|None): Optional output variable to store the result of *less_than* Returns: - Variable: The tensor variable storing the output of *less_than*. - - Examples: - .. code-block:: python - - less = fluid.layers.less_than(x=label, y=limit) + ${out_comment}. 
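The comparison and tensor-array helpers documented in this hunk compose naturally when loops are built by hand. A small illustrative sketch (variable names are placeholders, not part of the patch):

.. code-block:: python

    import paddle.fluid as fluid

    # Write a tensor into a LoDTensorArray, read it back, and compare the
    # running index against the array length with less_than.
    tmp = fluid.layers.zeros(shape=[10], dtype='int32')
    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)

    arr = fluid.layers.array_write(tmp, i=i)       # arr[0] = tmp
    value = fluid.layers.array_read(arr, i=i)      # read arr[0] back
    arr_len = fluid.layers.array_length(arr)       # 1-element int64 tensor

    cond = fluid.layers.less_than(x=i, y=arr_len)  # True while i < arr_len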
""" helper = LayerHelper("less_than", **locals()) if cond is None: cond = helper.create_tmp_variable(dtype='bool') cond.stop_gradient = True + attrs = dict() + if force_cpu is not None: + attrs['force_cpu'] = force_cpu + elif force_init_on_cpu(): + attrs['force_cpu'] = force_init_on_cpu() + helper.append_op( type='less_than', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [cond]}, - attrs={'force_cpu': force_cpu or force_init_on_cpu()}) + attrs=attrs) return cond @@ -974,16 +1063,34 @@ def equal(x, y, cond=None, **ignored): def array_read(array, i): - """This function performs the operation to read the data in as an + """ + This function performs the operation to read the data in as an LOD_TENSOR_ARRAY. + + .. code-block:: text + + Given: + + array = [0.6, 0.1, 0.3, 0.1] + + And: + + i = 2 + + Then: + + output = 0.3 + Args: - array (Variable|list): The input tensor that will be written to an array. - i (Variable|list): The subscript index in tensor array, that points the - place where data will be written to. + array (Variable|list): The input tensor that store data to be read. + i (Variable|list): The index of the data to be read from input array. + Returns: Variable: The tensor type variable that has the data written to it. + Examples: - .. code-block::python + .. code-block:: python + tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) arr = layers.array_read(tmp, i=i) @@ -1004,8 +1111,28 @@ def array_read(array, i): def shrink_memory(x, i, table): """ - This function creates an operator to shrink_rnn_memory using the RankTable + This function creates an operator to shrink rnn memory using the RankTable as mentioned in the input parameter. + + NOTE: This API is very low-level API. It is used by DynamicRNN only. + + Since the Dynamic RNN uses no-padding way to implement RNN. The sequence + will be sorted by order, and the length of valid memory will be shrink after + each time step. + + Args: + x(Variable): The memory object in the previous time step. + i(Variable): The step count variable. A int scalar as LoDTensor. + table(Variable): The RNNRankTable object. + + Returns: + the memory variable after shrink. + + Examples: + + Since this API is very low level API. The example is not provided. + Please reference the implementation of class DynamicRNN for detail + usage. """ helper = LayerHelper('shrink_memory', **locals()) out = helper.create_tmp_variable(dtype=x.dtype) @@ -1020,9 +1147,14 @@ def shrink_memory(x, i, table): def array_length(array): - """This function performs the operation to find the length of the input + """ + **Get the Length of Input LoDTensorArray** + + This function performs the operation to find the length of the input LOD_TENSOR_ARRAY. + Related API: array_read, array_write, While. + Args: array (LOD_TENSOR_ARRAY): The input array that will be used to compute the length. @@ -1031,12 +1163,13 @@ def array_length(array): Variable: The length of the input LoDTensorArray. Examples: - .. code-block::python + .. code-block:: python tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) arr = fluid.layers.array_write(tmp, i=i) arr_len = fluid.layers.array_length(arr) + """ helper = LayerHelper('array_length', **locals()) tmp = helper.create_tmp_variable(dtype='int64') @@ -1047,6 +1180,13 @@ def array_length(array): class ConditionalBlockGuard(BlockGuard): + """ + ConditionalBlockGuard is derived from BlockGuard. 
It is dedicated for + holding a ConditionalBlock, and helping users entering and exiting the + ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard + is generally an internal component of IfElse, users should not use it directly. + """ + def __init__(self, block): if not isinstance(block, ConditionalBlock): raise TypeError("block should be conditional block") @@ -1063,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard): class ConditionalBlock(object): + ''' + **ConditionalBlock** + + ConditionalBlock is an operator that bind a block to a specific condition, + if the condition matches, the corresponding block will be executed. + + Args: + inputs (Variable): bool conditions. + is_scalar_condition (bool): whether the branch is controled by a scalar. + name(str): name of this ConditionalBlock. + + Examples: + .. code-block:: python + + cond = layers.less_than(x=label, y=limit) + true_image, false_image = layers.split_lod_tensor( + input=image, mask=cond) + true_cond = layers.ConditionalBlock([true_image]) + + with true_cond.block(): + ... + with false_cond.block(): + ... + ''' + def __init__(self, inputs, is_scalar_condition=False, name=None): for each_input in inputs: if not isinstance(each_input, Variable): @@ -1120,6 +1285,42 @@ class ConditionalBlock(object): class Switch(object): + """ + Switch class works just like a `if-elif-else`. Can be used in learning rate scheduler + to modify learning rate + + The Semantics: + + 1. A `switch` control-flow checks cases one-by-one. + + 2. The condition of each case is a boolean value, which is a scalar Variable. + + 3. It runs the first matched case, or the default case if there is one. + + 4. Once it matches a case, it runs the corresponding branch and only that branch. + + Examples: + .. code-block:: python + + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + one_var = tensor.fill_constant( + shape=[1], dtype='float32', value=1.0) + two_var = tensor.fill_constant( + shape=[1], dtype='float32', value=2.0) + + with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + fluid.layers.tensor.assign(input=one_var, output=lr) + with switch.default(): + fluid.layers.tensor.assign(input=two_var, output=lr) + + """ + def __init__(self, name=None): self.helper = LayerHelper('switch', name=name) self.inside_scope = False @@ -1149,7 +1350,8 @@ class Switch(object): return ConditionalBlockGuard(cond_block) def default(self): - """create a default case for this switch + """ + create a default case for this switch """ pre_cond_num = len(self.pre_not_conditions) if pre_cond_num == 0: @@ -1339,6 +1541,38 @@ class IfElse(object): class DynamicRNN(object): + """ + The dynamic RNN can process a batch of sequence data. The length of each + sample sequence can be different. This API automatically process them in + batch. + + The input lod must be set. 
Please reference `lod_tensor` + + >>> import paddle.fluid as fluid + >>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1) + >>> embedding = fluid.layers.embedding(input=data, size=[65535, 32], + >>> is_sparse=True) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(embedding) + >>> prev = drnn.memory(shape=[200]) + >>> hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu') + >>> drnn.update_memory(prev, hidden) # set prev to hidden + >>> drnn.output(hidden) + >>> + >>> # last is the last time step of rnn. It is the encoding result. + >>> last = fluid.layers.sequence_last_step(drnn()) + + The dynamic RNN will unfold sequence into timesteps. Users need to define + how to process each time step during the :code:`with` block. + + The `memory` is used staging data cross time step. The initial value of + memory can be zero or another variable. + + The dynamic RNN can mark multiple variables as its output. Use `drnn()` to + get the output sequence. + """ BEFORE_RNN = 0 IN_RNN = 1 AFTER_RNN = 2 @@ -1361,6 +1595,15 @@ class DynamicRNN(object): self.mem_link = [] def step_input(self, x): + """ + Mark a sequence as a dynamic RNN input. + Args: + x(Variable): The input sequence. + + Returns: + The current timestep in the input sequence. + + """ self._assert_in_rnn_block_("step_input") if not isinstance(x, Variable): raise TypeError( @@ -1404,6 +1647,15 @@ class DynamicRNN(object): return array_read(array=input_array, i=self.step_idx) def static_input(self, x): + """ + Mark a variable as a RNN input. The input will not be scattered into + time steps. + Args: + x(Variable): The input variable. + + Returns: + The input variable that can access in RNN. + """ self._assert_in_rnn_block_("static_input") if not isinstance(x, Variable): raise TypeError( @@ -1425,6 +1677,10 @@ class DynamicRNN(object): @contextlib.contextmanager def block(self): + """ + The block for user to define operators in RNN. See the class docstring + for more details. + """ if self.status != DynamicRNN.BEFORE_RNN: raise ValueError("rnn.block() can only be invoke once") self.step_idx = fill_constant( @@ -1451,6 +1707,9 @@ class DynamicRNN(object): x=each_array, table=self.lod_rank_table)) def __call__(self, *args, **kwargs): + """ + Get the output of RNN. This API should only be invoked after RNN.block() + """ if self.status != DynamicRNN.AFTER_RNN: raise ValueError(("Output of the dynamic RNN can only be visited " "outside the rnn block.")) @@ -1465,6 +1724,70 @@ class DynamicRNN(object): value=0.0, need_reorder=False, dtype='float32'): + """ + Create a memory variable for dynamic rnn. + + If the :code:`init` is not None, :code:`memory` will be initialized by + this variable. The :code:`need_reorder` is used to reorder the memory as + the input variable. It should be set to true when the initialized memory + depends on the input sample. 
+ + For example, + + >>> import paddle.fluid as fluid + >>> sentence = fluid.layers.data( + >>> name='sentence', dtype='float32', shape=[32]) + >>> boot_memory = fluid.layers.data( + >>> name='boot', dtype='float32', shape=[10]) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(sentence) + >>> memory = drnn.memory(init=boot_memory, need_reorder=True) + >>> hidden = fluid.layers.fc( + >>> input=[word, memory], size=10, act='tanh') + >>> drnn.update_memory(ex_mem=memory, new_mem=hidden) + >>> drnn.output(hidden) + >>> rnn_output = drnn() + + + Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the + :code:`memory` will be initialized by this :code:`value`. + + For example, + + >>> import paddle.fluid as fluid + >>> sentence = fluid.layers.data( + >>> name='sentence', dtype='float32', shape=[32]) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(sentence) + >>> memory = drnn.memory(shape=[10], dtype='float32', value=0) + >>> hidden = fluid.layers.fc( + >>> input=[word, memory], size=10, act='tanh') + >>> drnn.update_memory(ex_mem=memory, new_mem=hidden) + >>> drnn.output(hidden) + >>> rnn_output = drnn() + + + Args: + init(Variable|None): The initialized variable. + + shape(list|tuple): The memory shape. NOTE the shape does not contain + batch_size. + + value(float): the initalized value. + + need_reorder(bool): True if the initialized memory depends on the + input sample. + + dtype(str|numpy.dtype): The data type of the initialized memory. + + Returns: + the memory variable. + + """ self._assert_in_rnn_block_('memory') if init is not None: if not isinstance(init, Variable): @@ -1532,6 +1855,16 @@ class DynamicRNN(object): return self.memory(init=init) def update_memory(self, ex_mem, new_mem): + """ + Update the memory from ex_mem to new_mem. NOTE that the shape and data + type of :code:`ex_mem` and :code:`new_mem` must be same. + Args: + ex_mem(Variable): the memory variable. + new_mem(Variable): the plain variable generated in RNN block. + + Returns: + None + """ self._assert_in_rnn_block_('update_memory') if not isinstance(ex_mem, Variable): raise TypeError("The input arg `ex_mem` of update_memory() must " @@ -1549,6 +1882,15 @@ class DynamicRNN(object): self.mem_link.append((new_mem, mem_array)) def output(self, *outputs): + """ + mark the RNN output variables. + + Args: + outputs: The output variables. + + Returns: + None + """ self._assert_in_rnn_block_('output') parent_block = self._parent_block_() for each in outputs: @@ -1591,26 +1933,26 @@ def reorder_lod_tensor_by_rank(x, rank_table): def is_empty(x, cond=None, **ignored): """ - **Is Empty** - - This layer returns the truth value of whether the variable is empty. + Test whether a Variable is empty. Args: - x(Variable): Operand of *is_empty* - cond(Variable|None): Optional output variable to store the result - of *is_empty* + x (Variable): The Variable to be tested. + cond (Variable|None): Output parameter. Returns the test result + of given 'x'. Default: None Returns: - Variable: The tensor variable storing the output of *is_empty*. + Variable: A bool scalar. True if 'x' is an empty Variable. Raises: TypeError: If input cond is not a variable, or cond's dtype is - not bool + not bool. Examples: .. 
code-block:: python - less = fluid.layers.is_empty(x=input) + res = fluid.layers.is_empty(x=input) + # or: + fluid.layers.is_empty(x=input, cond=res) """ helper = LayerHelper("is_empty", **locals()) if cond is None: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3a83db12fd13651578deeac6b562bac2f1e4e4b6..6af01297df54ffd4201776d20d51a88f5808ccb0 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -16,7 +16,7 @@ All layers just related to the detection neural network. """ from layer_function_generator import generate_layer_fn -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper import tensor import nn @@ -30,6 +30,7 @@ __all__ = [ 'detection_output', 'ssd_loss', 'detection_map', + 'anchor_generator', ] __auto__ = [ @@ -97,7 +98,9 @@ def detection_output(loc, nms_eta(float): The parameter for adaptive NMS. Returns: - Variable: The detection outputs is a LoDTensor with shape [No, 6]. + Variable: + + The detection outputs is a LoDTensor with shape [No, 6]. Each row has six values: [label, confidence, xmin, ymin, xmax, ymax]. `No` is the total number of detections in this mini-batch. For each instance, the offsets in first dimension are called LoD, the offset @@ -110,15 +113,15 @@ def detection_output(loc, Examples: .. code-block:: python - pb = layers.data(name='prior_box', shape=[10, 4], + pb = layers.data(name='prior_box', shape=[10, 4], append_batch_size=False, dtype='float32') - pbv = layers.data(name='prior_box_var', shape=[10, 4], + pbv = layers.data(name='prior_box_var', shape=[10, 4], append_batch_size=False, dtype='float32') - loc = layers.data(name='target_box', shape=[2, 21, 4], + loc = layers.data(name='target_box', shape=[2, 21, 4], append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[2, 21, 10], + scores = layers.data(name='scores', shape=[2, 21, 10], append_batch_size=False, dtype='float32') - nmsed_outs = fluid.layers.detection_output(scores=scores, + nmsed_outs = fluid.layers.detection_output(scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv) @@ -153,7 +156,7 @@ def detection_output(loc, return nmsed_outs -@autodoc() +@templatedoc() def detection_map(detect_res, label, class_num, @@ -164,6 +167,47 @@ def detection_map(detect_res, input_states=None, out_states=None, ap_version='integral'): + """ + ${comment} + + Args: + detect_res: ${detect_res_comment} + label: ${label_comment} + class_num: ${class_num_comment} + background_label: ${background_label_comment} + overlap_threshold: ${overlap_threshold_comment} + evaluate_difficult: ${evaluate_difficult_comment} + has_state: ${has_state_comment} + input_states: If not None, It contains 3 elements: + 1. pos_count ${pos_count_comment}. + 2. true_pos ${true_pos_comment}. + 3. false_pos ${false_pos_comment}. + out_states: If not None, it contains 3 elements. + 1. accum_pos_count ${accum_pos_count_comment}. + 2. accum_true_pos ${accum_true_pos_comment}. + 3. accum_false_pos ${accum_false_pos_comment}. + ap_version: ${ap_type_comment} + + Returns: + ${map_comment} + + + Examples: + .. 
code-block:: python + + detect_res = fluid.layers.data( + name='detect_res', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + label = fluid.layers.data( + name='label', + shape=[10, 6], + append_batch_size=False, + dtype='float32') + + map_out = fluid.layers.detection_map(detect_res, label, 21) + """ helper = LayerHelper("detection_map", **locals()) def __create_var(type): @@ -210,53 +254,68 @@ def bipartite_match(dist_matrix, dist_threshold=None, name=None): """ - **Bipartite matchint operator** - - This operator is a greedy bipartite matching algorithm, which is used to - obtain the matching with the maximum distance based on the input + This operator implements a greedy bipartite matching algorithm, which is + used to obtain the matching with the maximum distance based on the input distance matrix. For input 2D matrix, the bipartite matching algorithm can - find the matched column for each row, also can find the matched row for - each column. And this operator only calculate matched indices from column - to row. For each instance, the number of matched indices is the number of - of columns of the input ditance matrix. - - There are two outputs to save matched indices and distance. - A simple description, this algothrim matched the best (maximum distance) + find the matched column for each row (matched means the largest distance), + also can find the matched row for each column. And this operator only + calculate matched indices from column to row. For each instance, + the number of matched indices is the column number of the input distance + matrix. + + There are two outputs, matched indices and distance. + A simple description, this algorithm matched the best (maximum distance) row entity to the column entity and the matched indices are not duplicated in each row of ColToRowMatchIndices. If the column entity is not matched any row entity, set -1 in ColToRowMatchIndices. - Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. + NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor. If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. If Tensor, the height of ColToRowMatchIndices is 1. + NOTE: This API is a very low level API. It is used by :code:`ssd_loss` + layer. Please consider to use :code:`ssd_loss` instead. + Args: dist_matrix(Variable): This input is a 2-D LoDTensor with shape [K, M]. It is pair-wise distance matrix between the entities represented by each row and each column. For example, assumed one entity is A with shape [K], another entity is B with shape [M]. The - dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger - the distance is, the better macthing the pairs are. Please note, - This tensor can contain LoD information to represent a batch of - inputs. One instance of this batch can contain different numbers of - entities. + dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger + the distance is, the better matching the pairs are. + + NOTE: This tensor can contain LoD information to represent a batch + of inputs. One instance of this batch can contain different numbers + of entities. match_type(string|None): The type of matching method, should be - 'bipartite' or 'per_prediction', 'bipartite' by defalut. + 'bipartite' or 'per_prediction'. [default 'bipartite']. dist_threshold(float|None): If `match_type` is 'per_prediction', this threshold is to determine the extra matching bboxes based - on the maximum distance, 0.5 by defalut. + on the maximum distance, 0.5 by default. 
Returns: - match_indices(Variable): A 2-D Tensor with shape [N, M] in int type. - N is the batch size. If match_indices[i][j] is -1, it - means B[j] does not match any entity in i-th instance. - Otherwise, it means B[j] is matched to row - match_indices[i][j] in i-th instance. The row number of - i-th instance is saved in match_indices[i][j]. - match_distance(Variable): A 2-D Tensor with shape [N, M] in float type. - N is batch size. If match_indices[i][j] is -1, - match_distance[i][j] is also -1.0. Otherwise, assumed - match_distance[i][j] = d, and the row offsets of each instance - are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j]. + tuple: a tuple with two elements is returned. The first is + matched_indices, the second is matched_distance. + + The matched_indices is a 2-D Tensor with shape [N, M] in int type. + N is the batch size. If match_indices[i][j] is -1, it + means B[j] does not match any entity in i-th instance. + Otherwise, it means B[j] is matched to row + match_indices[i][j] in i-th instance. The row number of + i-th instance is saved in match_indices[i][j]. + + The matched_distance is a 2-D Tensor with shape [N, M] in float type + . N is batch size. If match_indices[i][j] is -1, + match_distance[i][j] is also -1.0. Otherwise, assumed + match_distance[i][j] = d, and the row offsets of each instance + are called LoD. Then match_distance[i][j] = + dist_matrix[d+LoD[i]][j]. + + Examples: + + >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32') + >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32') + >>> iou = fluid.layers.iou_similarity(x=x, y=y) + >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou) """ helper = LayerHelper('bipartite_match', **locals()) match_indices = helper.create_tmp_variable(dtype='int32') @@ -281,8 +340,6 @@ def target_assign(input, mismatch_value=None, name=None): """ - **Target assigner operator** - This operator can be, for given the target bounding boxes or labels, to assign classification and regression targets to each prediction as well as weights to prediction. The weights is used to specify which prediction would @@ -296,20 +353,24 @@ def target_assign(input, 1. Assigning all outpts based on `match_indices`: - If id = match_indices[i][j] > 0, + .. code-block:: text + + If id = match_indices[i][j] > 0, - out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] - out_weight[i][j] = 1. + out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] + out_weight[i][j] = 1. - Otherwise, + Otherwise, - out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} - out_weight[i][j] = 0. + out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} + out_weight[i][j] = 0. 2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided: Assumed that the row offset for each instance in `neg_indices` is called neg_lod, for i-th instance and each `id` of neg_indices in this instance: + + .. code-block:: text out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} out_weight[i][id] = 1.0 @@ -326,10 +387,23 @@ def target_assign(input, mismatch_value (float32): Fill this value to the mismatched location. Returns: - out (Variable): The output is a 3D Tensor with shape [N, P, K], - N and P is the same as they are in `neg_indices`, K is the - same as it in input of X. If `match_indices[i][j]`. - out_weight (Variable): The weight for output with the shape of [N, P, 1]. + tuple: + + A tuple(out, out_weight) is returned. 
out is a 3D Tensor with + shape [N, P, K], N and P is the same as they are in + `neg_indices`, K is the same as it in input of X. If + `match_indices[i][j]`. out_weight is the weight for output with + the shape of [N, P, 1]. + + Examples: + + .. code-block:: python + + matched_indices, matched_dist = fluid.layers.bipartite_match(iou) + gt = layers.data( + name='gt', shape=[1, 1], dtype='int32', lod_level=1) + trg, trg_weight = layers.target_assign( + gt, matched_indices, mismatch_value=0) """ helper = LayerHelper('target_assign', **locals()) out = helper.create_tmp_variable(dtype=input.dtype) @@ -364,7 +438,7 @@ def ssd_loss(location, normalize=True, sample_size=None): """ - **Multi-box loss layer for object dection algorithm of SSD** + **Multi-box loss layer for object detection algorithm of SSD** This layer is to compute dection loss for SSD given the location offset predictions, confidence predictions, prior boxes and ground-truth boudding @@ -372,21 +446,35 @@ def ssd_loss(location, is a weighted sum of the localization loss (or regression loss) and confidence loss (or classification loss) by performing the following steps: - 1. Find matched boundding box by bipartite matching algorithm. + 1. Find matched bounding box by bipartite matching algorithm. + 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. + 1.2 Compute matched boundding box by bipartite matching algorithm. + 2. Compute confidence for mining hard examples + 2.1. Get the target label based on matched indices. + 2.2. Compute confidence loss. + 3. Apply hard example mining to get the negative example indices and update the matched indices. + 4. Assign classification and regression targets + 4.1. Encoded bbox according to the prior boxes. + 4.2. Assign regression targets. + 4.3. Assign classification targets. + 5. Compute the overall objective loss. + 5.1 Compute confidence loss. + 5.1 Compute localization loss. + 5.3 Compute the overall weighted loss. Args: @@ -421,39 +509,36 @@ def ssd_loss(location, mining_type (str): The hard example mining type, should be 'hard_example' or 'max_negative', now only support `max_negative`. normalize (bool): Whether to normalize the SSD loss by the total number - of output locations, True by defalut. + of output locations, True by default. sample_size (int): The max sample size of negative box, used only when mining_type is 'hard_example'. Returns: - Variable: The weighted sum of the localization loss and confidence loss, - with shape [N * Np, 1], N and Np are the same as they are - in `location`. + The weighted sum of the localization loss and confidence loss, with \ + shape [N * Np, 1], N and Np are the same as they are in `location`. Raises: - ValueError: If mining_type is 'hard_example', now only support - mining type of `max_negative`. + ValueError: If mining_type is 'hard_example', now only support mining \ + type of `max_negative`. Examples: - .. 
code-block:: python - - pb = layers.data( - name='prior_box', - shape=[10, 4], - append_batch_size=False, - dtype='float32') - pbv = layers.data( - name='prior_box_var', - shape=[10, 4], - append_batch_size=False, - dtype='float32') - loc = layers.data(name='target_box', shape=[10, 4], dtype='float32') - scores = layers.data(name='scores', shape=[10, 21], dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - gt_label = layers.data( - name='gt_label', shape=[1], lod_level=1, dtype='float32') - loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) + >>> pb = fluid.layers.data( + >>> name='prior_box', + >>> shape=[10, 4], + >>> append_batch_size=False, + >>> dtype='float32') + >>> pbv = fluid.layers.data( + >>> name='prior_box_var', + >>> shape=[10, 4], + >>> append_batch_size=False, + >>> dtype='float32') + >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32') + >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32') + >>> gt_box = fluid.layers.data( + >>> name='gt_box', shape=[4], lod_level=1, dtype='float32') + >>> gt_label = fluid.layers.data( + >>> name='gt_label', shape=[1], lod_level=1, dtype='float32') + >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv) """ helper = LayerHelper('ssd_loss', **locals()) @@ -577,7 +662,7 @@ def prior_box(input, offset=0.5, name=None): """ - **Prior box operator** + **Prior Box Operator** Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is determined by @@ -606,26 +691,30 @@ def prior_box(input, name(str): Name of the prior box op. Default: None. Returns: - boxes(Variable): the output prior boxes of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total - box count of each position of input. - Variances(Variable): the expanded variances of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_priors is the total - box count of each position of input + tuple: A tuple with two Variable (boxes, variances) + + boxes: the output prior boxes of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input, + num_priors is the total + box count of each position of input. + + variances: the expanded variances of PriorBox. + The layout is [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_priors is the total + box count of each position of input Examples: .. code-block:: python - box, var = prior_box( - input=conv1, - image=images, - min_sizes=[100.], - flip=True, - clip=True) + + box, var = fluid.layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.], + flip=True, + clip=True) """ helper = LayerHelper("prior_box", **locals()) dtype = helper.input_dtype() @@ -695,11 +784,9 @@ def multi_box_head(inputs, stride=1, name=None): """ - **Prior_boxes** - Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. The details of this algorithm, please refer the - section 2.2 of SSD paper (SSD: Single Shot MultiBox Detector) + section 2.2 of SSD paper `SSD: Single Shot MultiBox Detector `_ . Args: @@ -740,24 +827,27 @@ def multi_box_head(inputs, name(str): Name of the prior box layer. Default: None. Returns: - mbox_loc(Variable): The predicted boxes' location of the inputs. - The layout is [N, H*W*Priors, 4]. 
where Priors - is the number of predicted boxes each position of each input. - mbox_conf(Variable): The predicted boxes' confidence of the inputs. - The layout is [N, H*W*Priors, C]. where Priors - is the number of predicted boxes each position of each input - and C is the number of Classes. - boxes(Variable): the output prior boxes of PriorBox. - The layout is [num_priors, 4]. num_priors is the total - box count of each position of inputs. - Variances(Variable): the expanded variances of PriorBox. - The layout is [num_priors, 4]. num_priors is the total - box count of each position of inputs + tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances) + + mbox_loc: The predicted boxes' location of the inputs. The layout + is [N, H*W*Priors, 4]. where Priors is the number of predicted + boxes each position of each input. + + mbox_conf: The predicted boxes' confidence of the inputs. The layout + is [N, H*W*Priors, C]. where Priors is the number of predicted boxes + each position of each input and C is the number of Classes. + + boxes: the output prior boxes of PriorBox. The layout is [num_priors, 4]. + num_priors is the total box count of each position of inputs. + + variances: the expanded variances of PriorBox. The layout is + [num_priors, 4]. num_priors is the total box count of each position of inputs Examples: .. code-block:: python - mbox_locs, mbox_confs, box, var = layers.multi_box_head( + + mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head( inputs=[conv1, conv2, conv3, conv4, conv5, conv5], image=images, num_classes=21, @@ -909,3 +999,95 @@ def multi_box_head(inputs, box.stop_gradient = True var.stop_gradient = True return mbox_locs_concat, mbox_confs_concat, box, var + + +def anchor_generator(input, + anchor_sizes=None, + aspect_ratios=None, + variance=[0.1, 0.1, 0.2, 0.2], + stride=None, + offset=0.5, + name=None): + """ + **Anchor generator operator** + + Generate anchors for Faster RCNN algorithm. + Each position of the input produce N anchors, N = + size(anchor_sizes) * size(aspect_ratios). The order of generated anchors + is firstly aspect_ratios loop then anchor_sizes loop. + + Args: + input(Variable): The input feature map, the format is NCHW. + anchor_sizes(list|tuple|float): The anchor sizes of generated anchors, + given in absolute pixels e.g. [64., 128., 256., 512.]. + For instance, the anchor size of 64 means the area of this anchor equals to 64**2. + aspect_ratios(list|tuple|float): The height / width ratios of generated + anchors, e.g. [0.5, 1.0, 2.0]. + variance(list|tuple): The variances to be used in box regression deltas. + Default:[0.1, 0.1, 0.2, 0.2]. + stride(list|turple): The anchors stride across width and height, + e.g. [16.0, 16.0] + offset(float): Prior boxes center offset. Default: 0.5 + name(str): Name of the prior box op. Default: None. + + Returns: + Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. + H is the height of input, W is the width of input, + num_anchors is the box count of each position. + Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. + Variances(Variable): The expanded variances of anchors + with a layout of [H, W, num_priors, 4]. + H is the height of input, W is the width of input + num_anchors is the box count of each position. + Each variance is in (xcenter, ycenter, w, h) format. + + + Examples: + + .. 
code-block:: python + + anchor, var = anchor_generator( + input=conv1, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + """ + helper = LayerHelper("anchor_generator", **locals()) + dtype = helper.input_dtype() + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(anchor_sizes): + anchor_sizes = [anchor_sizes] + if not _is_list_or_tuple_(aspect_ratios): + aspect_ratios = [aspect_ratios] + if not (_is_list_or_tuple_(stride) and len(stride) == 2): + raise ValueError('stride should be a list or tuple ', + 'with length 2, (stride_width, stride_height).') + + anchor_sizes = list(map(float, anchor_sizes)) + aspect_ratios = list(map(float, aspect_ratios)) + stride = list(map(float, stride)) + + attrs = { + 'anchor_sizes': anchor_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 'stride': stride, + 'offset': offset + } + + anchor = helper.create_tmp_variable(dtype) + var = helper.create_tmp_variable(dtype) + helper.append_op( + type="anchor_generator", + inputs={"Input": input}, + outputs={"Anchors": anchor, + "Variances": var}, + attrs=attrs, ) + anchor.stop_gradient = True + var.stop_gradient = True + return anchor, var diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index f3aeb6cd757a3c40f04b08e61cfd5ce09908f92c..977abde21f38a0d25a90bc14426fd817df2c8508 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -22,9 +22,9 @@ from ..executor import global_scope from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ - 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', - 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', - 'random_data_generator', 'Preprocessor', 'load' + 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv', + 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', + 'double_buffer', 'random_data_generator', 'Preprocessor', 'load' ] @@ -109,10 +109,35 @@ class BlockGuardServ(BlockGuard): class ListenAndServ(object): """ - ListenAndServ class. + **ListenAndServ Layer** - ListenAndServ class is used to wrap listen_and_serv op to create a server - which can receive variables from clients and run a block. + ListenAndServ is used to create a rpc server bind and listen + on specific TCP port, this server will run the sub-block when + received variables from clients. + + Args: + endpoint(string): IP:port string which the server will listen on. + inputs(list): a list of variables that the server will get from clients. + fan_in(int): how many client are expected to report to this server, default: 1. + optimizer_mode(bool): whether to run the server as a parameter server, default: True. + + Examples: + .. 
code-block:: python + + with fluid.program_guard(main): + serv = layers.ListenAndServ( + "127.0.0.1:6170", ["X"], optimizer_mode=False) + with serv.do(): + x = layers.data( + shape=[32, 32], + dtype='float32', + name="X", + append_batch_size=False) + fluid.initializer.Constant(value=1.0)(x, main.global_block()) + layers.scale(x=x, scale=10.0, out=out_var) + + exe = fluid.Executor(place) + exe.run(main) """ def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True): @@ -161,7 +186,6 @@ class ListenAndServ(object): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() - empty_block = Program().global_block() parent_block.append_op( type='listen_and_serv', @@ -170,25 +194,25 @@ class ListenAndServ(object): attrs={ 'endpoint': self.endpoint, 'Fanin': self.fan_in, - 'OptimizeBlock': current_block, - 'PrefetchBlock': empty_block, + 'optimize_blocks': [ + current_block + ], # did not support multiple optimize blocks in layers 'sync_mode': True, # did not support async now in layers 'grad_to_block_id': [""] }) -def Send(endpoints, send_vars, get_vars=None): +def Send(endpoints, send_vars, sync=True): """ - Send layer + Send variables to the server side, and get vars from server + side when server have finished running server side program. Args: - endpoints: comma seperated IP:PORT pairs in the order + endpoints (str): comma seperated IP:PORT pairs in the order of send_vars to send - send_vars: vars to send - get_vars: vars to get from server after send completes. + send_vars (list): variables to send to server + sync (bool): whether to wait the request finish - Send variables to the server side, and get vars from server - side when server have finished running server side program. """ assert (type(send_vars) == list) @@ -196,40 +220,33 @@ def Send(endpoints, send_vars, get_vars=None): endpoints = list(set(epmap)) helper = LayerHelper("Send", **locals()) - if not get_vars: - get_vars = [] - for s in send_vars: - v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True) - get_vars.append(v) rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName() helper.append_op( type="send", inputs={"X": send_vars}, - outputs={"Out": get_vars}, attrs={ "endpoints": endpoints, "epmap": epmap, rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC }) - - return get_vars + if sync: + helper.append_op(type="send_barrier", attrs={"endpoints": endpoints}) -def Recv(endpoints, get_vars): +def Recv(endpoints, get_vars, sync=True): """ - Recv layer + Receive variables from server side Args: - endpoints: comma seperated IP:PORT pairs in the order + endpoints (str): comma seperated IP:PORT pairs in the order of send_vars to send - send_vars: vars to send - get_vars: vars to get from server after send completes. + get_vars (list): vars to get from server after send completes. + sync (bool): whether to wait the request finish - Send variables to the server side, and get vars from server - side when server have finished running server side program. 
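A minimal sketch of how the reworked Send/Recv pair in this hunk is meant to be used. The endpoint string and the variables `x` and `param_var` are assumptions made up for illustration; the `sync` flag follows the new signatures shown above.

.. code-block:: python

    # hypothetical endpoint and variables, for illustration only
    endpoints = "127.0.0.1:6170"
    # send x and block until the appended send_barrier op finishes (sync=True)
    fluid.layers.Send(endpoints, send_vars=[x], sync=True)
    # fetch the updated variable back; a fetch_barrier is appended when sync=True
    received = fluid.layers.Recv(endpoints, get_vars=[param_var])

With sync=False neither barrier op is appended, so the caller does not wait for the request to finish.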
+ Returns: + list: list of received variables """ - assert (type(send_vars) == list) assert (type(get_vars) == list) epmap = endpoints.split(",") @@ -242,6 +259,9 @@ def Recv(endpoints, get_vars): outputs={"Out": get_vars}, attrs={"endpoints": endpoints, "epmap": epmap}) + if sync: + helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints}) + return get_vars def monkey_patch_reader_methods(reader): @@ -292,6 +312,7 @@ def _copy_reader_create_op_(block, op): return new_op +@templatedoc(op_type='create_recordio_file_reader') def open_recordio_file(filename, shapes, lod_levels, @@ -299,34 +320,30 @@ def open_recordio_file(filename, pass_num=1, for_parallel=True): """ - Open a RecordIO file - - This layer takes a RecordIO file to read from and returns a Reader Variable. - Via the Reader Variable, we can get data from the given RecordIO file. + ${comment} Args: - filename(str): The RecordIO file's name. + filename(${filename_type}): ${filename_comment}. shapes(list): List of tuples which declaring data shapes. - lod_levels(list): List of ints which declaring data lod_level. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. dtypes(list): List of strs which declaring data type. pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. Returns: - Variable: A Reader Variable via which we can get RecordIO file data. + ${out_comment}. Examples: - .. code-block:: python - - reader = fluid.layers.io.open_recordio_file( - filename='./data.recordio', - shapes=[(3,224,224), (1)], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - # Via the reader, we can use 'read_file' layer to get data: - image, label = fluid.layers.io.read_file(reader) + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) """ dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] shape_concat = [] @@ -358,9 +375,6 @@ def open_recordio_file(filename, if pass_num > 1: main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) - if for_parallel: - main_prog_var = parallel(reader=main_prog_var) - return monkey_patch_reader_methods(main_prog_var) @@ -386,16 +400,16 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): Variable: A Reader Variable from which we can get random data. Examples: - .. code-block:: python - reader = fluid.layers.io.random_data_generator( - low=0.0, - high=1.0, - shapes=[(3,224,224), (1)], - lod_levels=[0, 0]) + .. code-block:: python - # Via the reader, we can use 'read_file' layer to get data: - image, label = fluid.layers.io.read_file(reader) + reader = fluid.layers.random_data_generator( + low=0.0, + high=1.0, + shapes=[[3,224,224], [1]], + lod_levels=[0, 0]) + # Via the reader, we can use 'read_file' layer to get data: + image, label = fluid.layers.read_file(reader) """ dtypes = [core.VarDesc.VarType.FP32] * len(shapes) shape_concat = [] @@ -452,10 +466,13 @@ def open_files(filenames, lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. thread_num(int): The maximal concurrent prefetch thread number. - buffer_size(int): The size of prefetch buffer. + buffer_size(int|None): The size of prefetch buffer. 
If it is setted None, + buffer size will be thread_num * 3. + Default: None pass_num(int): Number of passes to run. for_parallel(Bool): Set it as True if you are going to run subsequent operators in parallel. + Default: True Returns: Variable: A Reader Variable via which we can get file data. @@ -475,7 +492,7 @@ def open_files(filenames, image, label = fluid.layers.io.read_file(reader) """ if buffer_size is None: - buffer_size = thread_num + buffer_size = thread_num * 3 if isinstance(filenames, basestring): filenames = [filenames] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] @@ -509,9 +526,6 @@ def open_files(filenames, main_prog_reader = multi_pass( reader=main_prog_reader, pass_num=pass_num) - if for_parallel: - main_prog_reader = parallel(reader=main_prog_reader) - return monkey_patch_reader_methods(main_prog_reader) @@ -544,16 +558,77 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): def shuffle(reader, buffer_size): + """ + Shuffle the reader. + """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) def batch(reader, batch_size): + """ + This layer is a reader decorator. It takes a reader and adds + 'batching' decoration on it. When reading with the result + decorated reader, output data will be automatically organized + to the form of batches. + + Args: + reader(Variable): The reader to be decorated with 'batching'. + batch_size(int): The batch size. + + Returns: + Variable: The reader which has been decorated with 'batching'. + + Examples: + .. code-block:: python + + raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio', + './data2.recordio'], + shapes=[(3,224,224), (1)], + lod_levels=[0, 0], + dtypes=['float32', 'int64'], + thread_num=2, + buffer_size=2) + batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5) + + # If we read data with the raw_reader: + # data = fluid.layers.read_file(raw_reader) + # We can only get data instance by instance. + # + # However, if we read data with the batch_reader: + # data = fluid.layers.read_file(batch_reader) + # Each 5 adjacent instances will be automatically combined together + # to become a batch. So what we get('data') is a batch data instead + # of an instance. + """ return __create_unshared_decorated_reader__( 'create_batch_reader', reader, {'batch_size': int(batch_size)}) def double_buffer(reader, place=None, name=None): + """ + Wrap a double buffer reader. The data will copy to target place with a + double buffer queue. If the target place is None, the place that executor + perform on will be used. + + Args: + reader(Variable): the reader variable need to be wrapped. + place(Place): the place of target data. Default is the sample place of + executor perform. + + name(str): Variable name. None if the user does not care. + + Returns: + wrapped reader with double buffer. + + Examples: + + >>> reader = fluid.layers.open_files(filenames=['somefile'], + >>> shapes=[[-1, 784], [-1, 1]], + >>> dtypes=['float32', 'int64']) + >>> reader = fluid.layers.double_buffer(reader) + >>> img, label = fluid.layers.read_file(reader) + """ attrs = dict() if place is not None: attrs['place'] = str(place).upper() @@ -566,20 +641,41 @@ def multi_pass(reader, pass_num): 'create_multi_pass_reader', reader, {'pass_num': int(pass_num)}) -def parallel(reader): - return __create_shared_decorated_reader__('create_threaded_reader', reader, - {}) +def read_file(reader): + """ + Execute the given reader and get data via it. 
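The new `shuffle` docstring a few hunks above is still terse and has no example; a minimal usage sketch, assuming placeholder file names and shapes, would be:

.. code-block:: python

    # placeholder files/shapes; decorate a file reader with shuffling
    reader = fluid.layers.io.open_files(
        filenames=['./data1.recordio', './data2.recordio'],
        shapes=[[3, 224, 224], [1]],
        lod_levels=[0, 0],
        dtypes=['float32', 'int64'])
    # keep a buffer of 128 instances and emit them in random order
    reader = fluid.layers.shuffle(reader, buffer_size=128)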
+ + A reader is also a Variable. It can be a raw reader generated by + `fluid.layers.open_files()` or a decorated one generated by + `fluid.layers.double_buffer()` and so on. + + Args: + + reader(Variable): The reader to execute. + + Returns: + Tuple[Variable]: Data read via the given reader. + Examples: + .. code-block:: python -def read_file(file_obj): + data_file = fluid.layers.open_files( + filenames=['mnist.recordio'], + shapes=[(-1, 748), (-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int64"]) + data_file = fluid.layers.double_buffer( + fluid.layers.batch(data_file, batch_size=64)) + input, label = fluid.layers.read_file(data_file) + """ helper = LayerHelper('read_file') out = [ helper.create_tmp_variable( stop_gradient=True, dtype='float32') - for _ in range(len(file_obj.desc.shapes())) + for _ in range(len(reader.desc.shapes())) ] helper.append_op( - type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out}) + type='read', inputs={'Reader': [reader]}, outputs={'Out': out}) if len(out) == 1: return out[0] else: diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index cb60a3aec9a5a69f1eed281eb017384a621c66a8..3096389101a5e5b302c78145b8bc9f1d71f6b8cb 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -44,6 +44,18 @@ def _type_to_str_(tp): return framework_pb2.AttrType.Name(tp) +_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$") +_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$") +_two_bang_pattern_ = re.compile(r"!!([^!]+)!!") + + +def escape_math(text): + return _two_bang_pattern_.sub( + r'$$\1$$', + _single_dollar_pattern_.sub(r':math:`\1`', + _two_dollar_pattern_.sub(r"!!\1!!", text))) + + def _generate_doc_string_(op_proto): """ Generate docstring by OpProto @@ -59,18 +71,16 @@ def _generate_doc_string_(op_proto): raise TypeError("OpProto should be `framework_pb2.OpProto`") buf = cStringIO.StringIO() - buf.write(op_proto.comment) + buf.write(escape_math(op_proto.comment)) buf.write('\nArgs:\n') for each_input in op_proto.inputs: line_begin = ' {0}: '.format(_convert_(each_input.name)) buf.write(line_begin) - buf.write(each_input.comment) - buf.write('\n') - buf.write(' ' * len(line_begin)) - buf.write('Duplicable: ') - buf.write(str(each_input.duplicable)) - buf.write(' Optional: ') - buf.write(str(each_input.dispensable)) + buf.write(escape_math(each_input.comment)) + if each_input.duplicable: + buf.write(" Duplicatable.") + if each_input.dispensable: + buf.write(" Optional.") buf.write('\n') skip_attrs = OpProtoHolder.generated_op_attr_names() @@ -83,7 +93,7 @@ def _generate_doc_string_(op_proto): buf.write(' (') buf.write(_type_to_str_(each_attr.type)) buf.write('): ') - buf.write(each_attr.comment) + buf.write(escape_math(each_attr.comment)) buf.write('\n') if len(op_proto.outputs) != 0: @@ -92,7 +102,7 @@ def _generate_doc_string_(op_proto): for each_opt in op_proto.outputs: if not each_opt.intermediate: break - buf.write(each_opt.comment) + buf.write(escape_math(each_opt.comment)) return buf.getvalue() @@ -224,9 +234,6 @@ def autodoc(comment=""): return __impl__ -_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$") - - def templatedoc(op_type=None): """ Decorator of layer function. 
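Both `_generate_doc_string_` and `templatedoc` now route op comments through the new `escape_math` helper; a before/after sketch (the sample comment string is invented) makes its effect concrete:

.. code-block:: python

    # escape_math("The loss is $y - x$ and $$y = Wx + b$$") is expected to give:
    #   "The loss is :math:`y - x` and $$y = Wx + b$$"
    # i.e. single-dollar inline math becomes Sphinx :math:`...`,
    # while double-dollar blocks pass through unchanged.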
It will use the docstring from the layer @@ -244,9 +251,6 @@ def templatedoc(op_type=None): def trim_ending_dot(msg): return msg.rstrip('.') - def escape_inline_math(msg): - return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg) - def __impl__(func): if op_type is None: op_type_name = func.__name__ @@ -260,7 +264,7 @@ def templatedoc(op_type=None): for line in comment_lines: line = line.strip() if len(line) != 0: - comment += escape_inline_math(line) + comment += escape_math(line) comment += " " elif len(comment) != 0: comment += "\n \n " diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 716cc7824eff0c56cc55a055310fa8b1913ac5e6..6071e3e74218e4db4cddc223818d3a9b7086fd86 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -25,10 +25,11 @@ import nn import ops import tensor from ..initializer import init_on_cpu +from ..framework import default_main_program, Parameter __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', - 'polynomial_decay', 'piecewise_decay', 'noam_decay' + 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS' ] @@ -70,21 +71,40 @@ def noam_decay(d_model, warmup_steps): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """Applies exponential decay to the learning rate. + """ + Applies exponential decay to the learning rate. + + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, the learning rate will be decayed by + 'decay_rate' every 'decay_steps' steps. + + >>> if staircase == True: + >>> decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) + >>> else: + >>> decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) - ```python - decayed_learning_rate = learning_rate * - decay_rate ^ (global_step / decay_steps) - ``` Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training - decay_steps: A Python `int32` number. - decay_rate: A Python `float` number. - staircase: Boolean. If set true, decay the learning rate every decay_steps. + learning_rate(Variable|float): The initial learning rate. + decay_steps(int): See the decay computation above. + decay_rate(float): The decay rate. See the decay computation above. + staircase(Boolean): If True, decay the learning rate at discrete intervals. + Default: False Returns: - The decayed learning rate + Variable: The decayed learning rate + + Examples: + .. code-block:: python + + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + """ global_step = _decay_step_counter() @@ -128,22 +148,39 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): - """Applies inverse time decay to the initial learning rate. + """ + Applies inverse time decay to the initial learning rate. - >>> if staircase: + When training a model, it is often recommended to lower the learning rate as the + training progresses. By using this function, an inverse decay function will be + applied to the initial learning rate. 
+ + >>> if staircase == True: >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) >>> else: >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training. - decay_steps: A Python `int32` number. - decay_rate: A Python `float` number. - staircase: Boolean. If set true, decay the learning rate every decay_steps. + learning_rate(Variable|float): The initial learning rate. + decay_steps(int): See the decay computation above. + decay_rate(float): The decay rate. See the decay computation above. + staircase(Boolean): If True, decay the learning rate at discrete intervals. + Default: False Returns: - The decayed learning rate + Variable: The decayed learning rate + + Examples: + .. code-block:: python + + base_lr = 0.1 + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) """ global_step = _decay_step_counter() @@ -162,25 +199,28 @@ def polynomial_decay(learning_rate, end_learning_rate=0.0001, power=1.0, cycle=False): - """Applies polynomial decay to the initial learning rate. + """ + Applies polynomial decay to the initial learning rate. + + .. code-block:: python + + if cycle: + decay_steps = decay_steps * ceil(global_step / decay_steps) + else: + global_step = min(global_step, decay_steps) + decayed_learning_rate = (learning_rate - end_learning_rate) * + (1 - global_step / decay_steps) ^ power + end_learning_rate - >>> if cycle: - >>> decay_steps = decay_steps * ceil(global_step / decay_steps) - >>> else: - >>> global_step = min(global_step, decay_steps) - >>> decayed_learning_rate = (learning_rate - end_learning_rate) * - >>> (1 - global_step / decay_steps) ^ power + - >>> end_learning_rate Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training - decay_steps: A Python `int32` number. - end_learning_rate: A Python `float` number. - power: A Python `float` number - cycle: Boolean. If set true, decay the learning rate every decay_steps. + learning_rate(Variable|float32): A scalar float32 value or a Variable. This + will be the initial learning rate during training. + decay_steps(int32): A Python `int32` number. + end_learning_rate(float): A Python `float` number. + power(float): A Python `float` number. + cycle(bool): If set true, decay the learning rate every decay_steps. Returns: - The decayed learning rate + Variable: The decayed learning rate """ global_step = _decay_step_counter() @@ -209,15 +249,27 @@ def polynomial_decay(learning_rate, def piecewise_decay(boundaries, values): """Applies piecewise decay to the initial learning rate. - >>> boundaries = [10000, 20000] - >>> values = [1.0, 0.5, 0.1] - >>> - >>> if step < 10000: - >>> learning_rate = 1.0 - >>> elif 10000 <= step < 20000: - >>> learning_rate = 0.5 - >>> else: - >>> learning_rate = 0.1 + The algorithm can be described as the code below. + + .. code-block:: python + + boundaries = [10000, 20000] + values = [1.0, 0.5, 0.1] + if step < 10000: + learning_rate = 1.0 + elif 10000 <= step < 20000: + learning_rate = 0.5 + else: + learning_rate = 0.1 + Args: + boundaries: A list of steps numbers. + values: A list of learning rate values that will be picked during + different step boundaries. + + Returns: + The decayed learning rate. 
+ + """ if len(values) - len(boundaries) != 1: @@ -249,3 +301,41 @@ def piecewise_decay(boundaries, values): tensor.assign(last_value_var, lr) return lr + + +def append_LARS(params_grads, learning_rate, weight_decay): + """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for + each layer. + + ```python + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) + / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) + ``` + + Args: + learning_rate: A learning rate Variable. This + is the global learning rate for LARS. + weight_decay: A Python `float` number. + + Returns: + The decayed learning rate + """ + + def _balanced_weight(param_norm, grad_norm): + if weight_decay == 1.0: + return grad_norm + param_norm + else: + return grad_norm + weight_decay * param_norm + + for param, grad in params_grads: + param_lr = param.optimize_attr['learning_rate'] + param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param))) + grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) + if type(param_lr) == float and param_lr == 1.0: + decayed_lr = learning_rate * param_norm \ + / _balanced_weight(param_norm, grad_norm) + else: + decayed_lr = learning_rate * param_lr * param_norm \ + / _balanced_weight(param_norm, grad_norm) + # set back param local learning rate + param.optimize_attr['learning_rate'] = decayed_lr diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py deleted file mode 100644 index a1c64ce2771526cbd0baa944f97d01e7878b3ac1..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/layers/metric.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -All layers just related to metric. -""" - -import warnings -from ..layer_helper import LayerHelper -from ..initializer import Normal, Constant -from ..framework import Variable -from ..param_attr import ParamAttr -import nn - -__all__ = ['accuracy', 'auc'] - - -def accuracy(input, label, k=1, correct=None, total=None): - """ - This function computes the accuracy using the input and label. - The output is the top k inputs and their indices. - """ - helper = LayerHelper("accuracy", **locals()) - topk_out, topk_indices = nn.topk(input, k=k) - acc_out = helper.create_tmp_variable(dtype="float32") - if correct is None: - correct = helper.create_tmp_variable(dtype="int64") - if total is None: - total = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - outputs={ - "Accuracy": [acc_out], - "Correct": [correct], - "Total": [total], - }) - return acc_out - - -def auc(input, label, curve='ROC', num_thresholds=200): - warnings.warn( - "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \ - but can not aggregate them and get the pass AUC, because pass \ - auc can not be averaged with weighted from the minibatch auc value. 
\ - Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \ - which can get every minibatch and every pass auc value.", Warning) - helper = LayerHelper("auc", **locals()) - topk_out = helper.create_tmp_variable(dtype=input.dtype) - topk_indices = helper.create_tmp_variable(dtype="int64") - topk_out, topk_indices = nn.topk(input, k=k) - auc_out = helper.create_tmp_variable(dtype="float32") - helper.append_op( - type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - attrs={"curve": curve, - "num_thresholds": num_thresholds}, - outputs={"AUC": [auc_out], }) - return auc_out diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py new file mode 100644 index 0000000000000000000000000000000000000000..99e82fdd04282177fae63f1fb94b5e32d41c612e --- /dev/null +++ b/python/paddle/fluid/layers/metric_op.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +All layers just related to metric. +""" + +import warnings +from ..layer_helper import LayerHelper +from ..initializer import Normal, Constant +from ..framework import Variable +from ..param_attr import ParamAttr +import nn + +__all__ = ['accuracy', 'auc'] + + +def accuracy(input, label, k=1, correct=None, total=None): + """ + accuracy layer. + Refer to the https://en.wikipedia.org/wiki/Precision_and_recall + + This function computes the accuracy using the input and label. + If the correct label occurs in top k predictions, then correct will increment by one. + Note: the dtype of accuracy is determined by input. the input and label dtype can be different. + + Args: + input(Variable): The input of accuracy layer, which is the predictions of network. + Carry LoD information is supported. + label(Variable): The label of dataset. + k(int): The top k predictions for each class will be checked. + correct(Variable): The correct predictions count. + total(Variable): The total entries count. + + Returns: + Variable: The correct rate. + + Examples: + .. 
code-block:: python + + data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32") + label = fluid.layers.data(name="data", shape=[-1,1], dtype="int32") + predict = fluid.layers.fc(input=data, size=10) + acc = fluid.layers.accuracy(input=predict, label=label, k=5) + + """ + helper = LayerHelper("accuracy", **locals()) + topk_out, topk_indices = nn.topk(input, k=k) + acc_out = helper.create_tmp_variable(dtype="float32") + if correct is None: + correct = helper.create_tmp_variable(dtype="int64") + if total is None: + total = helper.create_tmp_variable(dtype="int64") + helper.append_op( + type="accuracy", + inputs={ + "Out": [topk_out], + "Indices": [topk_indices], + "Label": [label] + }, + outputs={ + "Accuracy": [acc_out], + "Correct": [correct], + "Total": [total], + }) + return acc_out + + +def auc(input, label, curve='ROC', num_thresholds=200): + """ + **Area Under the Curve (AUC) Layer** + + This implementation computes the AUC according to forward output and label. + It is used very widely in binary classification evaluation. + + Note: If input label contains values other than 0 and 1, it will be cast + to `bool`. Find the relevant definitions `here `_. + + There are two types of possible curves: + + 1. ROC: Receiver operating characteristic; + 2. PR: Precision Recall + + Args: + input(Variable): A floating-point 2D Variable, values are in the range + [0, 1]. Each row is sorted in descending order. This + input should be the output of topk. Typically, this + Variable indicates the probability of each label. + label(Variable): A 2D int Variable indicating the label of the training + data. The height is batch size and width is always 1. + curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'. + num_thresholds(int): The number of thresholds to use when discretizing + the roc curve. Default 200. + + Returns: + Variable: A scalar representing the current AUC. + + Examples: + .. code-block:: python + + # network is a binary classification model and label the ground truth + prediction = network(image, is_infer=True) + auc_out=fluid.layers.auc(input=prediction, label=label) + """ + + warnings.warn( + "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \ + but can not aggregate them and get the pass AUC, because pass \ + auc can not be averaged with weighted from the minibatch auc value. \ + Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \ + which can get every minibatch and every pass auc value.", Warning) + helper = LayerHelper("auc", **locals()) + topk_out = helper.create_tmp_variable(dtype=input.dtype) + topk_indices = helper.create_tmp_variable(dtype="int64") + topk_out, topk_indices = nn.topk(input, k=k) + auc_out = helper.create_tmp_variable(dtype="float32") + helper.append_op( + type="auc", + inputs={ + "Out": [topk_out], + "Indices": [topk_indices], + "Label": [label] + }, + attrs={"curve": curve, + "num_thresholds": num_thresholds}, + outputs={"AUC": [auc_out], }) + return auc_out diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ac3ba4174f885781c6bf9313647ddc2b0a380285..925700d7368ae31e7b697ca3b82115e3b900d21c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -All layers just related to the neural network. +All layers just related to the neural network. 
""" from ..layer_helper import LayerHelper @@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc from tensor import concat import utils import random +from .. import unique_name __all__ = [ 'fc', @@ -92,6 +93,9 @@ __all__ = [ 'gather', 'random_crop', 'mean_iou', + 'relu', + 'log', + 'crop', ] @@ -107,14 +111,15 @@ def fc(input, """ **Fully Connected Layer** - The fully connected layer can take multiple tensors as its inputs. It - creates a variable called weights for each input tensor, which represents - a fully connected weight matrix from each input unit to each output unit. - The fully connected layer multiplies each input tensor with its coresponding - weight to produce an output Tensor. If multiple input tensors are given, - the results of multiple multiplications will be sumed up. If bias_attr is - not None, a bias variable will be created and added to the output. Finally, - if activation is not None, it will be applied to the output as well. + This function creates a fully connected layer in the network. It can take + multiple tensors as its inputs. It creates a variable called weights for + each input tensor, which represents a fully connected weight matrix from + each input unit to each output unit. The fully connected layer multiplies + each input tensor with its coresponding weight to produce an output Tensor. + If multiple input tensors are given, the results of multiple multiplications + will be sumed up. If bias_attr is not None, a bias variable will be created + and added to the output. Finally, if activation is not None, it will be applied + to the output as well. This process can be formulated as follows: @@ -155,7 +160,7 @@ def fc(input, name (str, default None): The name of this layer. Returns: - A tensor variable storing the transformation result. + Variable: The transformation result. Raises: ValueError: If rank of the input tensor is less than 2. @@ -163,8 +168,7 @@ def fc(input, Examples: .. code-block:: python - data = fluid.layers.data( - name="data", shape=[32, 32], dtype="float32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, act="tanh") """ @@ -196,7 +200,10 @@ def fc(input, else: pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}, + attrs={"use_mkldnn": use_mkldnn}) # add bias pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation @@ -226,11 +233,11 @@ def embedding(input, have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively. is_sparse(bool): The flag indicating whether to use sparse update. - is_distributed (bool): Whether to run lookup table from remote parameter server. + is_distributed(bool): Whether to run lookup table from remote parameter server. padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters it in :attr:`input`. If - :math:`padding_idx < 0`, the padding_idx to use in lookup is + :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is :math:`size[0] + dim`. 
param_attr(ParamAttr): Parameters for this layer dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc @@ -266,6 +273,7 @@ def embedding(input, return tmp +@templatedoc(op_type="lstm") def dynamic_lstm(input, size, h_0=None, @@ -280,56 +288,11 @@ def dynamic_lstm(input, dtype='float32', name=None): """ - **Dynamic LSTM Layer** - - The defalut implementation is diagonal/peephole connection - (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: - - .. math:: - - i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) - - f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) - - \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) - - o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) - - c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} - - h_t & = o_t \odot act_h(c_t) - - where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is - the matrix of weights from the input gate to the input), :math:`W_{ic}, \ - W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In - our implementation, we use vectors to reprenset these diagonal weight - matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input - gate bias vector), :math:`\sigma` is the non-linear activations, such as - logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input - gate, forget gate, output gate, and cell activation vectors, respectively, - all of which have the same size as the cell output activation vector :math:`h`. - - The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` - and :math:`act_h` are the cell input and cell output activation functions - and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called - candidate hidden state, which is computed based on the current input and - the previous hidden state. - - Set `use_peepholes` to `False` to disable peephole connection. The formula - is omitted here, please refer to the paper - http://www.bioinf.jku.at/publications/older/2604.pdf for details. - - Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}` - operations on the input :math:`x_{t}` are NOT included in this operator. - Users can choose to use fully-connect layer before LSTM layer. + ${comment} Args: - input(Variable): The input of dynamic_lstm layer, which supports - variable-time length input sequence. The underlying - tensor in this Variable is a matrix with shape - (T X 4D), where T is the total time steps in this - mini-batch, D is the hidden size. - size(int): 4 * hidden size. + input (Variable): ${input_comment} + size (int): 4 * hidden size. h_0(Variable): The initial hidden state is an optional input, default is zero. This is a tensor with shape (N x D), where N is the batch size and D is the hidden size. @@ -344,33 +307,26 @@ def dynamic_lstm(input, W_{fh}, W_{oh}`} - The shape is (D x 4D), where D is the hidden size. - bias_attr(ParamAttr|None): The bias attribute for the learnable bias + bias_attr (ParamAttr|None): The bias attribute for the learnable bias weights, which contains two parts, input-hidden bias weights and peephole connections weights if setting `use_peepholes` to `True`. 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ W_{fc}, W_{oc}`}. 
- - The shape is (1 x 7D). - use_peepholes(bool): Whether to enable diagonal/peephole connections, - default `True`. - is_reverse(bool): Whether to compute reversed LSTM, default `False`. - gate_activation(str): The activation for input gate, forget gate and - output gate. Choices = ["sigmoid", "tanh", "relu", - "identity"], default "sigmoid". - cell_activation(str): The activation for cell output. Choices = ["sigmoid", - "tanh", "relu", "identity"], default "tanh". - candidate_activation(str): The activation for candidate hidden state. - Choices = ["sigmoid", "tanh", - "relu", "identity"], - default "tanh". - dtype(str): Data type. Choices = ["float32", "float64"], default "float32". - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + - The shape is (1 x 7D). + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Returns: tuple: The hidden state, and cell state of LSTM. The shape of both \ @@ -541,27 +497,31 @@ def dynamic_lstmp(input, cell_activation(str): The activation for cell output. Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". candidate_activation(str): The activation for candidate hidden state. - Choices = ["sigmoid", "tanh", - "relu", "identity"], + Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". proj_activation(str): The activation for projection output. - Choices = ["sigmoid", "tanh", - "relu", "identity"], + Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". dtype(str): Data type. Choices = ["float32", "float64"], default "float32". name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. Returns: - tuple: The projection of hidden state, and cell state of LSTMP. The \ - shape of projection is (T x P), for the cell state which is \ - (T x D), and both LoD is the same with the `input`. + tuple: A tuple of two output variable: the projection of hidden state, \ + and cell state of LSTMP. The shape of projection is (T x P), \ + for the cell state which is (T x D), and both LoD is the same \ + with the `input`. Examples: + .. code-block:: python + dict_dim, emb_dim = 128, 64 + data = fluid.layers.data(name='sequence', shape=[1], + dtype='int32', lod_level=1) + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim, proj_dim = 512, 256 - fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4, act=None, bias_attr=None) proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, size=hidden_dim * 4, @@ -627,10 +587,10 @@ def dynamic_gru(input, candidate_activation='tanh', h_0=None): """ - **Dynamic GRU Layer** + **Gated Recurrent Unit (GRU) Layer** Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on - Sequence Modeling `_ + Sequence Modeling `_ . The formula is as follows: @@ -677,17 +637,25 @@ def dynamic_gru(input, Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid". candidate_activation(str): The activation for candidate hidden state. Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". 
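For the templatedoc-converted dynamic_lstm earlier in this file, a minimal usage sketch in the same spirit as the dynamic_lstmp example; `emb` and `hidden_dim` are assumed to be defined as in that example:

.. code-block:: python

    # assumed: emb is a sequence embedding, hidden_dim = 512
    fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                             act=None, bias_attr=None)
    # returns the hidden state and the cell state, both shaped (T x D)
    hidden, cell = fluid.layers.dynamic_lstm(
        input=fc_out, size=hidden_dim * 4, use_peepholes=False)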
- h_0 (Variable): The hidden output of the first time step. + h_0 (Variable): This is initial hidden state. If not set, default is + zero. This is a tensor with shape (N x D), where N is the number of + total time steps of input mini-batch feature and D is the hidden + size. Returns: Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \ - and lod is the same with the input. + and sequence length is the same with the input. Examples: + .. code-block:: python + dict_dim, emb_dim = 128, 64 + data = fluid.layers.data(name='sequence', shape=[1], + dtype='int32', lod_level=1) + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 - x = fluid.layers.fc(input=data, size=hidden_dim * 3) + x = fluid.layers.fc(input=emb, size=hidden_dim * 3) hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) """ @@ -835,11 +803,14 @@ def linear_chain_crf(input, label, param_attr=None): Args: input(${emission_type}): ${emission_comment} + input(${transition_type}): ${transition_comment} label(${label_type}): ${label_comment} param_attr(ParamAttr): The attribute of the learnable parameter. Returns: - ${log_likelihood_comment} + output(${emission_exps_type}): ${emission_exps_comment} \n + output(${transition_exps_type}): ${transition_exps_comment} \n + output(${log_likelihood_type}): ${log_likelihood_comment} """ helper = LayerHelper('linear_chain_crf', **locals()) @@ -874,11 +845,19 @@ def crf_decoding(input, param_attr, label=None): Args: input(${emission_type}): ${emission_comment} + param_attr(ParamAttr): The parameter attribute for training. + label(${label_type}): ${label_comment} Returns: - ${viterbi_path_comment} + Variable: ${viterbi_path_comment} + + Examples: + .. code-block:: python + + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) """ helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) @@ -893,15 +872,15 @@ def crf_decoding(input, param_attr, label=None): return viterbi_path +@templatedoc() def cos_sim(X, Y): """ - This function performs the cosine similarity between two tensors - X and Y and returns that as the output. + ${comment} Args: - X (Variable): The input X. - Y (Variable): The input Y. - + X (Variable): ${x_comment}. + Y (Variable): ${y_comment}. + Returns: Variable: the output of cosine(X, Y). """ @@ -925,13 +904,13 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): Drop or keep each element of `x` independently. Dropout is a regularization technique for reducing overfitting by preventing neuron co-adaption during - training. The dropout operator randomly set (according to the given dropout + training. The dropout operator randomly sets (according to the given dropout probability) the outputs of some units to zero, while others are remain unchanged. Args: - x (Variable): The input tensor. - dropout_prob (float): Probability of setting units to zero. + x (Variable): The input tensor variable. + dropout_prob (float): Probability of setting units to zero. is_test (bool): A flag indicating whether it is in test phrase or not. seed (int): A Python integer used to create random seeds. If this parameter is set to None, a random seed is used. @@ -941,13 +920,14 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): will be named automatically. Returns: - Variable: A tensor variable. + Variable: A tensor variable is the shape with `x`. Examples: + .. 
code-block:: python - x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - droped = fluid.layers.dropout(input=x, dropout_rate=0.5) + x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + droped = fluid.layers.dropout(x, dropout_prob=0.5) """ helper = LayerHelper('dropout', **locals()) @@ -1097,20 +1077,94 @@ def chunk_eval(input, num_chunk_types, excluded_chunk_types=None): """ + **Chunk Evaluator** + This function computes and outputs the precision, recall and F1-score of chunk detection. + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + + ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, + and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. + Here is a NER example of labeling for these tagging schemes: + + .. code-block:: python + + ====== ====== ====== ===== == ============ ===== ===== ===== == ========= + Li Ming works at Agricultural Bank of China in Beijing. + ====== ====== ====== ===== == ============ ===== ===== ===== == ========= + IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + ====== ====== ====== ===== == ============ ===== ===== ===== == ========= + + There are three chunk types(named entity types) including PER(person), ORG(organization) + and LOC(LOCATION), and we can see that the labels have the form -. + + Since the calculations actually use label ids rather than labels, extra attention + should be paid when mapping labels to ids to make CheckEvalOp work. The key point + is that the listed equations are satisfied by ids. + + .. code-block:: python + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + + where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` + is the num of chunk types, and `tag_type` get its value from the following table. + + .. code-block:: python + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + + Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, + PER and LOC. To satisfy the above equations, the label map can be like this: + + .. code-block:: python + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + + It's not hard to verify the equations noting that the num of chunk types + is 3 and the num of tag types in IOB scheme is 2. For example, the label + id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of + I-LOC is 2, which consistent with the results from the equations. + Args: input (Variable): prediction output of the network. label (Variable): label of the test data set. chunk_scheme (str): ${chunk_scheme_comment} num_chunk_types (int): ${num_chunk_types_comment} excluded_chunk_types (list): ${excluded_chunk_types_comment} - + Returns: - tuple: tuple containing: (precision, recall, f1_score, - num_infer_chunks, num_label_chunks, - num_correct_chunks) + tuple: tuple containing: precision, recall, f1_score, + num_infer_chunks, num_label_chunks, + num_correct_chunks + + Examples: + .. 
code-block:: python + + crf = fluid.layers.linear_chain_crf( + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) + crf_decode = fluid.layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + fluid.layers.chunk_eval( + input=crf_decode, + label=label, + chunk_scheme="IOB", + num_chunk_types=(label_dict_len - 1) / 2) """ helper = LayerHelper("chunk_eval", **locals()) @@ -1166,15 +1220,11 @@ def sequence_conv(input, bias_attr (ParamAttr|None): attributes for bias param_attr (ParamAttr|None): attributes for parameter act (str): the activation type - + Returns: Variable: output of sequence_conv """ - # FIXME(dzh) : want to unify the argument of python layer - # function. So we ignore some unecessary attributes. - # such as, padding_trainable, context_start. - helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() filter_shape = [filter_size * input.shape[1], num_filters] @@ -1199,6 +1249,41 @@ def sequence_conv(input, def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): + """ + This function computes the softmax activation among all time-steps for each + sequence. The dimension of each time-step should be 1. Thus, the shape of + input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N` + is the sum of the length of all sequences. + + For i-th sequence in a mini-batch: + + .. math:: + + Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))} + + For example, for a mini-batch of 3 sequences with variable-length, + each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], + then softmax will be computed among :math:`X[0:2, :]`, :math:`X[2:5, :]`, + :math:`X[5:7, :]`, and :math:`N` turns out to be 7. + + Args: + input (Variable): The input variable which is a LoDTensor. + bias_attr (ParamAttr|None): attributes for bias + param_attr (ParamAttr|None): attributes for parameter + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ + library is installed. Default: True + + Returns: + Variable: output of sequence_softmax + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[7, 1], + dtype='float32', lod_level=1) + x_sequence_softmax = fluid.layers.sequence_softmax(input=x) + """ helper = LayerHelper('sequence_softmax', **locals()) dtype = helper.input_dtype() softmax_out = helper.create_tmp_variable(dtype) @@ -1211,6 +1296,45 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): + """ + The input of the softmax layer is a 2-D tensor with shape N x K (N is the + batch_size, K is the dimension of input feature). The output tensor has the + same shape as the input tensor. + + For each row of the input tensor, the softmax operator squashes the + K-dimensional vector of arbitrary real values to a K-dimensional vector of real + values in the range [0, 1] that add up to 1. + + It computes the exponential of the given dimension and the sum of exponential + values of all the other dimensions in the K-dimensional vector input. + Then the ratio of the exponential of the given dimension and the sum of + exponential values of all the other dimensions is the output of the softmax + operator. + + For each row :math:`i` and each column :math:`j` in Input(X), we have: + + .. math:: + + Out[i, j] = \\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])} + + Args: + input (Variable): The input variable. 
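A quick numeric check of the row-wise softmax formula given above (the input values are invented):

.. code-block:: python

    # for one row X = [1.0, 2.0, 3.0]:
    # exp(X) = [2.718, 7.389, 20.086], sum = 30.193
    # softmax(X) ~= [0.090, 0.245, 0.665], which sums to 1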
+ bias_attr (ParamAttr): attributes for bias + param_attr (ParamAttr): attributes for parameter + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \ + library is installed. + + Returns: + Variable: output of softmax + + Examples: + + .. code-block:: python + + fc = fluid.layers.fc(input=x, size=10) + softmax = fluid.layers.softmax(input=fc) + + """ helper = LayerHelper('softmax', **locals()) dtype = helper.input_dtype() softmax_out = helper.create_tmp_variable(dtype) @@ -1236,14 +1360,17 @@ def conv2d(input, act=None, name=None): """ - **Convlution2D Layer** - The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are in NCHW format. Where N is batch size, C is the number of + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - The details of convolution layer, please refer UFLDL's `convolution, - `_ . + Filter is in MCHW format, where M is the number of output image channels, + C is the number of input image channels, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input image channels divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more detials. If bias attribution and activation type are provided, bias is added to the output of the convolution, and the corresponding activation function is applied to the final result. @@ -1254,15 +1381,14 @@ def conv2d(input, Out = \sigma (W \\ast X + b) - In the above equation: + Where: * :math:`X`: Input value, a tensor with NCHW format. * :math:`W`: Filter value, a tensor with MCHW format. * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be - different. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: @@ -1273,6 +1399,7 @@ def conv2d(input, Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` - Output: + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -1284,7 +1411,7 @@ def conv2d(input, Args: input (Variable): The input image with [N, C, H, W] format. - num_filters(int): The number of filter. It is as same as the output + num_filters(int): The number of filter. It is as same as the output image channel. filter_size (int|tuple|None): The filter size. If filter_size is a tuple, it must contain two integers, (filter_size_H, filter_size_W). @@ -1307,7 +1434,8 @@ def conv2d(input, bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True - use_mkldnn (bool): Use mkldnn kernels or not. + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False act (str): Activation type. Default: None name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -1323,10 +1451,8 @@ def conv2d(input, Examples: .. 
code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.conv2d( - input=data, num_filters=2, filter_size=3, act="relu") + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ num_channels = input.shape[1] @@ -1428,8 +1554,7 @@ def conv3d(input, * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be - different. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: @@ -1491,10 +1616,8 @@ def conv3d(input, Examples: .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 12, 32, 32], dtype='float32') - conv2d = fluid.layers.conv3d( - input=data, num_filters=2, filter_size=3, act="relu") + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") """ l_type = 'conv3d' @@ -1571,13 +1694,13 @@ def sequence_pool(input, pool_type): .. code-block:: text x is a 1-level LoDTensor: - x.lod = [[0, 2, 5, 7]] + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: out.dim = [3, 1] - with condition len(x.lod[-1]) - 1 == out.dims[0] + with condition len(x.lod[-1]) == out.dims[0] for different pool_type: average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 @@ -1636,13 +1759,13 @@ def sequence_first_step(input): .. code-block:: text x is a 1-level LoDTensor: - x.lod = [[0, 2, 5, 7]] + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: out.dim = [3, 1] - with condition len(x.lod[-1]) - 1 == out.dims[0] + with condition len(x.lod[-1]) == out.dims[0] out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) Args: @@ -1669,13 +1792,13 @@ def sequence_last_step(input): .. code-block:: text x is a 1-level LoDTensor: - x.lod = [[0, 2, 5, 7]] + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: out.dim = [3, 1] - with condition len(x.lod[-1]) - 1 == out.dims[0] + with condition len(x.lod[-1]) == out.dims[0] out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) Args: @@ -1695,6 +1818,7 @@ def sequence_last_step(input): return sequence_pool(input=input, pool_type="last") +@templatedoc() def pool2d(input, pool_size=-1, pool_type="max", @@ -1706,24 +1830,45 @@ def pool2d(input, use_mkldnn=False, name=None): """ - This function adds the operator for pooling in 2 dimensions, using the - pooling configurations mentioned in input parameters. + ${comment} Args: - input (Variable): ${input_comment} - pool_size (int): ${ksize_comment} - pool_type (str): ${pooling_type_comment} + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int): The side length of pooling windows. All pooling + windows are squares with pool_size on a side. + pool_type: ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. 
- global_pooling (bool): ${global_pooling_comment} - use_cudnn (bool): ${use_cudnn_comment} - ceil_mode (bool): ${ceil_mode_comment} - use_mkldnn (bool): ${use_mkldnn_comment} - name (str): A name for this layer(optional). If set None, the layer - will be named automatically. - + global_pooling: ${global_pooling_comment} + use_cudnn: ${use_cudnn_comment} + ceil_mode: ${ceil_mode_comment} + use_mkldnn: ${use_mkldnn_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + Returns: - Variable: output of pool2d layer. + Variable: The pooling result. + + Raises: + ValueError: If 'pool_type' is not "max" nor "avg" + ValueError: If 'global_pooling' is False and 'pool_size' is -1 + ValueError: If 'use_cudnn' is not a bool value. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.pool2d( + input=data, + pool_size=2, + pool_type='max', + pool_stride=1, + global_pooling=False) """ if pool_type not in ["max", "avg"]: raise ValueError( @@ -1849,29 +1994,61 @@ def batch_norm(input, name=None, moving_mean_name=None, moving_variance_name=None, - do_model_average_for_mean_and_var=False): + do_model_average_for_mean_and_var=False, + fuse_with_relu=False): """ - This function helps create an operator to implement - the BatchNorm layer using the configurations from the input parameters. + **Batch Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift Args: - input (Variable): the input variable. - act (str): activation type - is_test (bool): whether to run batch_norm as test mode. - momentum (float): momentum - epsilon (float): epsilon, default 1e-05 - param_attr (ParamAttr|None): attributes for parameter - bias_attr (ParamAttr|None): attributes for bias - data_layout (str): data layout, default NCHW - in_place (bool): if True, do not create tmp variable - use_mkldnn (bool): ${use_mkldnn_comment} - name (str): The name of this layer. It is optional. - moving_mean_name (str): The name of moving mean variable name, optional. - moving_variance_name (str): The name of moving variance name, optional. - do_model_average_for_mean_and_var (bool): + input(variable): The input variable which is a LoDTensor. + act(string, Default None): Activation type, linear|relu|prelu|... + is_test(bool, Default False): Used for training or training. + momentum(float, Default 0.9): + epsilon(float, Default 1e-05): + param_attr(ParamAttr): The parameter attribute for Parameter `scale`. + bias_attr(ParamAttr): The parameter attribute for Parameter `bias`. 
+ data_layout(string, default NCHW): NCHW|NHWC + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + use_mkldnn(bool, Default false): ${use_mkldnn_comment} + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. + moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. + do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + fuse_with_relu (bool): if True, this OP performs relu after batch norm. Returns: - Variable: output of batch_norm layer. + Variable: A tensor variable which is the result after applying batch normalization on the input. + + Examples: + + .. code-block:: python + + hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.batch_norm(input=hidden1) """ helper = LayerHelper('batch_norm', **locals()) dtype = helper.input_dtype() @@ -1947,12 +2124,14 @@ def batch_norm(input, "momentum": momentum, "epsilon": epsilon, "is_test": is_test, - "use_mkldnn": use_mkldnn + "use_mkldnn": use_mkldnn, + "fuse_with_relu": fuse_with_relu }) return helper.append_activation(batch_norm_out) +@templatedoc() def layer_norm(input, scale=True, shift=True, @@ -1963,20 +2142,11 @@ def layer_norm(input, act=None, name=None): """ - **Layer Normalization** - - Assume feature vectors exist on dimensions - :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics - along these dimensions for each feature vector :math:`a` with size - :math:`H`, then normalize each feature vector using the corresponding - statistics. After that, apply learnable gain and bias on the normalized - tensor to scale and shift if :attr:`scale` and :attr:`shift` are set. - - Refer to `Layer Normalization `_ + ${comment} The formula is as follows: - .. math:: + .. math:: \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i @@ -1984,6 +2154,15 @@ def layer_norm(input, h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + * :math:`a`: the vector representation of the summed inputs to the neurons + in that layer. + + * :math:`H`: the number of hidden units in a layers + + * :math:`g`: the trainable scale parameter. + + * :math:`b`: the trainable bias parameter. + Args: input(Variable): The input tensor variable. scale(bool): Whether to learn the adaptive gain :math:`g` after @@ -2002,14 +2181,13 @@ def layer_norm(input, name (str): The name of this layer. It is optional. Returns: - Variable: A tensor variable with the same shape as the input. + ${y_comment} Examples: - .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ helper = LayerHelper('layer_norm', **locals()) dtype = helper.input_dtype() @@ -2050,34 +2228,6 @@ def layer_norm(input, return helper.append_activation(layer_norm_out) -def beam_search_decode(ids, scores, name=None): - """ - ${beam_search_decode} - - Args: - ids (Variable): ${ids_comment} - scores (Variable): ${scores_comment} - name (str): The name of this layer. It is optional. 
- - Returns: - tuple: a tuple of two output variable: sentence_ids, sentence_scores - """ - helper = LayerHelper('beam_search_decode', **locals()) - sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) - sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) - - helper.append_op( - type="beam_search_decode", - inputs={"Ids": ids, - "Scores": scores}, - outputs={ - "SentenceIds": sentence_ids, - "SentenceScores": sentence_scores - }) - - return sentence_ids, sentence_scores - - def conv2d_transpose(input, num_filters, output_size=None, @@ -2102,32 +2252,36 @@ def conv2d_transpose(input, represent height and width, respectively. The details of convolution transpose layer, please refer to the following explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. For each input :math:`X`, the equation is: .. math:: - Out = W \\ast X + Out = \sigma (W \\ast X + b) - In the above equation: + Where: * :math:`X`: Input value, a tensor with NCHW format. * :math:`W`: Filter value, a tensor with MCHW format. - * :math:`\\ast` : Convolution transpose operation. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be - different. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: - Input: - Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - Output: - Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` Where @@ -2181,15 +2335,20 @@ def conv2d_transpose(input, Examples: .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 32, 32], dtype='float32') - conv2d_transpose = fluid.layers.conv2d_transpose( - input=data, num_filters=2, filter_size=3) + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ - helper = LayerHelper("conv2d_transpose", **locals()) + + input_channel = input.shape[1] + + op_type = 'conv2d_transpose' + if (input_channel == groups and num_filters == input_channel and + not use_cudnn): + op_type = 'depthwise_conv2d_transpose' + + helper = LayerHelper(op_type, **locals()) if not isinstance(input, Variable): raise TypeError("Input of conv2d_transpose must be Variable") - input_channel = input.shape[1] padding = utils.convert_to_list(padding, 2, 'padding') stride = utils.convert_to_list(stride, 2, 'stride') @@ -2223,7 +2382,7 @@ def conv2d_transpose(input, pre_bias = helper.create_tmp_variable(dtype=input.dtype) helper.append_op( - type='conv2d_transpose', + type=op_type, inputs={'Input': [input], 'Filter': [img_filter]}, outputs={'Output': pre_bias}, @@ -2264,32 +2423,36 @@ def conv3d_transpose(input, two elements. These two elements represent height and width, respectively. The details of convolution transpose layer, please refer to the following explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. 
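As a quick illustration of the bias-and-activation behaviour described above (the variable names and attribute values here are illustrative only, not part of the patch), a minimal sketch:

.. code-block:: python

    import paddle.fluid as fluid

    # illustrative only: bias is added to the transposed-convolution output,
    # then the activation is applied to the final result
    data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
    out = fluid.layers.conv3d_transpose(
        input=data, num_filters=2, filter_size=3,
        bias_attr=fluid.ParamAttr(name='deconv3d_b'), act='relu')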
For each input :math:`X`, the equation is: .. math:: - Out = W \\ast X + Out = \sigma (W \\ast X + b) In the above equation: * :math:`X`: Input value, a tensor with NCDHW format. * :math:`W`: Filter value, a tensor with MCDHW format. - * :math:`\\ast` : Convolution transpose operation. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be - different. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. Example: - Input: - Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ + Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` - Output: - Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` Where @@ -2344,10 +2507,8 @@ def conv3d_transpose(input, Examples: .. code-block:: python - data = fluid.layers.data( - name='data', shape=[3, 12, 32, 32], dtype='float32') - conv2d_transpose = fluid.layers.conv3d_transpose( - input=data, num_filters=2, filter_size=3) + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3) """ l_type = "conv3d_transpose" helper = LayerHelper(l_type, **locals()) @@ -2418,18 +2579,18 @@ def sequence_expand(x, y, ref_level=-1, name=None): * Case 1 x is a LoDTensor: - x.lod = [[0, 2, 4]] + x.lod = [[2, 2]] x.data = [[a], [b], [c], [d]] x.dims = [4, 1] y is a LoDTensor: - y.lod = [[0, 2, 4], - [0, 3, 6, 7, 8]] + y.lod = [[2, 2], + [3, 3, 1, 1]] ref_level: 0 then output is a 1-level LoDTensor: - out.lod = [[0, 2, 4, 6, 8]] + out.lod = [[2, 2, 2, 2]] out.data = [[a], [b], [a], [b], [c], [d], [c], [d]] out.dims = [8, 1] @@ -2439,7 +2600,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): x.dims = [3, 1] y is a LoDTensor: - y.lod = [[0, 2, 2, 5]] + y.lod = [[2, 0, 3]] ref_level: -1 @@ -2477,21 +2638,89 @@ def sequence_expand(x, y, ref_level=-1, name=None): return tmp -def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): - ''' - This function implements the beam search algorithm. +def beam_search(pre_ids, + pre_scores, + ids, + scores, + beam_size, + end_id, + level=0, + name=None): + """ + Beam search is a classical algorithm for selecting candidate words in a + machine translation task. - Args: - pre_ids (Variable): ${pre_ids_comment} - ids (Variable): ${ids_comment} - scores (Variable): ${scores_comment} - beam_size (int): ${beam_size_comment} - end_id (int): ${end_id_comment} - level (int): ${level_comment} + Refer to `Beam search `_ + for more details. + This layer does the search in beams for one time step. Specifically, it + selects the top-K candidate word ids of current step from :attr:`ids` + according to their :attr:`scores` for all source sentences, where K is + :attr:`beam_size` and :attr:`ids, scores` are predicted results from the + computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are + the output of beam_search at previous step, they are needed for special use + to handle ended candidate translations. 
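Conceptually, and leaving out the LoD bookkeeping the operator maintains, one beam_search step keeps the beam_size candidates with the best accumulated scores for each source sentence; a rough plain-Python sketch (hypothetical helper, not the fluid implementation):

.. code-block:: python

    import heapq

    def select_beam(candidates, beam_size):
        # candidates: (accumulated_score, prefix_branch, word_id) tuples for one
        # source sentence; keep the beam_size highest-scoring ones
        return heapq.nlargest(beam_size, candidates, key=lambda c: c[0])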
+ + Note that the :attr:`scores` passed in should be accumulated scores, and + length penalty should be done with extra operators before calculating the + accumulated scores if needed, also suggest finding top-K before it and + using the top-K candidates following. + + Please see the following demo for a fully beam search usage example: + + fluid/tests/book/test_machine_translation.py + + Args: + pre_ids(Variable): The LodTensor variable which is the output of + beam_search at previous step. It should be a LodTensor with shape + :math:`(batch_size, 1)` and lod + :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the + first step. + pre_scores(Variable): The LodTensor variable which is the output of + beam_search at previous step. + ids(Variable): The LodTensor variable containing the candidates ids. + Its shape should be :math:`(batch_size \\times beam_size, K)`, + where :math:`K` supposed to be :attr:`beam_size`. + scores(Variable): The LodTensor variable containing the accumulated + scores corresponding to :attr:`ids` and its shape is the same as + the shape of :attr:`ids`. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + level(int, default 0): It can be ignored and mustn't change currently. + It means the source level of lod, which is explained as following. + The lod level of :attr:`ids` should be 2. The first level is source + level which describes how many prefixes (branchs) for each source + sentece (beam), and the second level is sentence level which + describes how these candidates belong to the prefix. The paths + linking prefixes and selected candidates are organized and reserved + in lod. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + Returns: - tuple: a tuple of beam_search output variables: selected_ids, selected_scores - ''' + Variable: The LodTensor pair containing the selected ids and the \ + corresponding scores. + + Examples: + .. code-block:: python + + # Suppose `probs` contains predicted results from the computation + # cell and `pre_ids` and `pre_scores` is the output of beam_search + # at previous step. + topk_scores, topk_indices = layers.topk(probs, k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores)), + y=layers.reshape( + pre_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + pre_ids=pre_ids, + pre_scores=pre_scores, + ids=topk_indices, + scores=accu_scores, + beam_size=beam_size, + end_id=end_id) + """ helper = LayerHelper('beam_search', **locals()) score_type = scores.dtype id_type = ids.dtype @@ -2503,6 +2732,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): type='beam_search', inputs={ 'pre_ids': pre_ids, + 'pre_scores': pre_scores, 'ids': ids, 'scores': scores, }, @@ -2520,6 +2750,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): return selected_ids, selected_scores +def beam_search_decode(ids, scores, beam_size, end_id, name=None): + """ + Beam Search Decode Layer. This layer constructs the full hypotheses for + each source sentence by walking back along the LoDTensorArray :attr:`ids` + whose lods can be used to restore the path in the beam search tree. + Please see the following demo for a fully beam search usage example: + fluid/tests/book/test_machine_translation.py + + Args: + ids(Variable): The LodTensorArray variable containing the selected ids + of all steps. 
+ scores(Variable): The LodTensorArray variable containing the selected + scores of all steps. + beam_size(int): The beam width used in beam search. + end_id(int): The id of end token. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The LodTensor pair containing the generated id sequences \ + and the corresponding scores. The shapes and lods of the two \ + LodTensor are same. The lod level is 2 and the two levels \ + separately indicate how many hypotheses each source sentence has \ + and how many ids each hypothesis has. + + Examples: + .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving + # the selected ids and scores of all steps + finished_ids, finished_scores = layers.beam_search_decode( + ids, scores, beam_size=5, end_id=0) + """ + helper = LayerHelper('beam_search_decode', **locals()) + sentence_ids = helper.create_tmp_variable(dtype=ids.dtype) + sentence_scores = helper.create_tmp_variable(dtype=ids.dtype) + + helper.append_op( + type="beam_search_decode", + inputs={"Ids": ids, + "Scores": scores}, + outputs={ + "SentenceIds": sentence_ids, + "SentenceScores": sentence_scores + }, + attrs={"beam_size": beam_size, + "end_id": end_id}) + + return sentence_ids, sentence_scores + + def lstm_unit(x_t, hidden_t_prev, cell_t_prev, @@ -2698,23 +2978,24 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): def reduce_mean(input, dim=None, keep_dim=False, name=None): """ - Computes the mean of tensor elements over the given dimension. + Computes the mean of the input tensor's elements along the given dimension. Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (list|int|None): The dimensions along which the mean is computed. If - :attr:`None`, compute the mean over all elements of :attr:`input` - and return a Tensor variable with a single element, otherwise + dim (list|int|None): The dimension along which the mean is computed. If + `None`, compute the mean over all elements of :attr:`input` + and return a variable with a single element, otherwise it must be in the range :math:`[-rank(input), rank(input))`. If - :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. + :math:`dim[i] < 0`, the dimension to reduce is + :math:`rank(input) + dim[i]`. keep_dim (bool): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. - name(str|None): A name for this layer(optional). If set None, the layer + name(str|None): A name for this layer(optional). If set `None`, the layer will be named automatically. Returns: - Variable: The reduced Tensor variable. + Variable: The reduced mean Variable. Examples: .. code-block:: python @@ -2936,7 +3217,7 @@ def split(input, num_or_sections, dim=-1, name=None): will be named automatically. Returns: - List: The list of segmented tensor variables. + list(Variable): The list of segmented tensor variables. Examples: .. code-block:: python @@ -2987,32 +3268,33 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes .. math:: - y = \frac{x}{ \sqrt{\sum {x^2} + epsion }} + + y = \\frac{x}{ \sqrt{\sum {x^2} + epsion }} For `x` with more dimensions, this layer independently normalizes each 1-D slice along dimension `axis`. Args: x(Variable|list): The input tensor to l2_normalize layer. 
- axis(int): The axis on which to apply normalization. If `axis < 0`, + axis(int): The axis on which to apply normalization. If `axis < 0`, \ the dimension to normalization is rank(X) + axis. -1 is the last dimension. - epsilon(float): The epsilon value is used to avoid division by zero, + epsilon(float): The epsilon value is used to avoid division by zero, \ the defalut value is 1e-10. - name(str|None): A name for this layer(optional). If set None, the layer + name(str|None): A name for this layer(optional). If set None, the layer \ will be named automatically. - Returns: - Variable: The output tensor variable. + Variable: The output tensor variable is the same shape with `x`. Examples: + .. code-block:: python - data = fluid.layers.data(name="data", - shape=(3, 17, 13), - dtype="float32") - normed = fluid.layers.l2_normalize(x=data, axis=1) + data = fluid.layers.data(name="data", + shape=(3, 17, 13), + dtype="float32") + normed = fluid.layers.l2_normalize(x=data, axis=1) """ if len(x.shape) == 1: @@ -3144,25 +3426,51 @@ def topk(input, k, name=None): This operator is used to find values and indices of the k largest entries for the last dimension. - If the input is a vector (rank=1), finds the k largest entries in the vector + If the input is a vector (1-D Tensor), finds the k largest entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. If the input is a Tensor with higher rank, this operator computes the top k entries along the last dimension. + For example: + + .. code-block:: text + + If: + input = [[5, 4, 2, 3], + [9, 7, 10, 25], + [6, 2, 10, 1]] + k = 2 + + Then: + The first output: + values = [[5, 4], + [10, 25], + [6, 10]] + + The second output: + indices = [[0, 1], + [2, 3], + [0, 2]] + Args: input(Variable): The input variable which can be a vector or Tensor with higher rank. - k(int): An integer value to specify the top k largest elements. + k(int): The number of top elements to look for along the last dimension + of input. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. + Default: None Returns: - values(Variable): The k largest elements along each last dimensional - slice. - indices(Variable): The indices of values within the last dimension of - input. + Tuple[Variable]: A tuple with two elements. Each element is a Variable. + The first one is k largest elements along each last + dimensional slice. The second one is indices of values + within the last dimension of input. + + Raises: + ValueError: If k < 1 or k is not less than the last dimension of input Examples: .. code-block:: python @@ -3170,7 +3478,7 @@ def topk(input, k, name=None): top5_values, top5_indices = layers.topk(input, k=5) """ shape = input.shape - if k < 1 and k >= shape[-1]: + if k < 1 or k >= shape[-1]: raise ValueError("k must be greater than 0 and less than %d." % (shape[-1])) @@ -3188,8 +3496,7 @@ def topk(input, k, name=None): return values, indices -def edit_distance(input, label, normalized=True, ignored_tokens=None, - name=None): +def edit_distance(input, label, normalized=True, ignored_tokens=None): """ EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. 
Edit distance, also called @@ -3203,21 +3510,21 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, "kitten" -> "sitten" -> "sittin" -> "sitting" - Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with + The input is a LoDTensor consisting of all the hypothesis strings with the total number denoted by `batch_size`, and the separation is specified by the LoD information. And the `batch_size` reference strings are arranged - in order in the same way in the LoDTensor Input(Refs). + in order in the same way in the input LoDTensor. - Output(Out) contains the `batch_size` results and each stands for the edit + The output contains the `batch_size` results and each stands for the edit distance for a pair of strings respectively. If Attr(normalized) is true, the edit distance will be divided by the length of reference string. Args: input(Variable): The indices for hypothesis strings. label(Variable): The indices for reference strings. - normalized(bool): Indicated whether to normalize the edit distance by + normalized(bool, default True): Indicated whether to normalize the edit distance by the length of reference string. - ignored_tokens(list of int): Tokens that should be removed before + ignored_tokens(list, default None): Tokens that should be removed before calculating edit distance. name (str): The name of this layer. It is optional. @@ -3229,7 +3536,6 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, x = fluid.layers.data(name='x', shape=[8], dtype='float32') y = fluid.layers.data(name='y', shape=[7], dtype='float32') - cost = fluid.layers.edit_distance(input=x,label=y) """ helper = LayerHelper("edit_distance", **locals()) @@ -3270,6 +3576,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None, def ctc_greedy_decoder(input, blank, name=None): """ This op is used to decode sequences by greedy policy by below steps: + 1. Get the indexes of max value for each row in input. a.k.a. numpy.argmax(input, axis=0). 2. For each sequence in result of step1, merge repeated tokens between two @@ -3291,7 +3598,7 @@ def ctc_greedy_decoder(input, blank, name=None): [0.2, 0.2, 0.1, 0.5], [0.5, 0.1, 0.3, 0.1]] - input.lod = [[0, 4, 8]] + input.lod = [[4, 4]] Then: @@ -3299,7 +3606,7 @@ def ctc_greedy_decoder(input, blank, name=None): [1], [3]] - output.lod = [[0, 2, 3]] + output.lod = [[2, 1]] Args: @@ -3316,7 +3623,7 @@ def ctc_greedy_decoder(input, blank, name=None): Returns: Variable: CTC greedy decode result. If all the sequences in result were - empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1]. + empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1]. Examples: .. code-block:: python @@ -3349,35 +3656,33 @@ def warpctc(input, label, blank=0, norm_by_times=False): input tensor. Args: - input(Variable): (LodTensor, default: LoDTensor), - the unscaled probabilities of variable-length sequences, - which is a 2-D Tensor with LoD information. - It's shape is [Lp, num_classes + 1], where Lp is the sum of all input - sequences' length and num_classes is the true number of classes. - (not including the blank label). - label(Variable): (LodTensor, default: LoDTensor), the ground truth - of variable-length sequence, which is a 2-D Tensor with LoD - information. It is of the shape [Lg, 1], where Lg is th sum of - all labels' length. - blank (int): default 0, the blank label index of Connectionist - Temporal Classification (CTC) loss, which is in the - half-opened interval [0, num_classes + 1). 
- norm_by_times (bool): default false, whether to normalize - the gradients by the number of time-step, which is also the - sequence's length. There is no need to normalize the gradients - if warpctc layer was follewed by a mean_op. + input (Variable): The unscaled probabilities of variable-length sequences, + which is a 2-D Tensor with LoD information. + It's shape is [Lp, num_classes + 1], where Lp is the sum of all input + sequences' length and num_classes is the true number of classes. + (not including the blank label). + label (Variable): The ground truth of variable-length sequence, + which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1], + where Lg is th sum of all labels' length. + blank (int, default 0): The blank label index of Connectionist + Temporal Classification (CTC) loss, which is in the + half-opened interval [0, num_classes + 1). + norm_by_times(bool, default false): Whether to normalize the gradients + by the number of time-step, which is also the sequence's length. + There is no need to normalize the gradients if warpctc layer was + follewed by a mean_op. Returns: Variable: The Connectionist Temporal Classification (CTC) loss, which is a 2-D Tensor of the shape [batch_size, 1]. Examples: + .. code-block:: python - y = layers.data( - name='y', shape=[11, 8], dtype='float32', lod_level=1) - y_predict = layers.data( - name='y_predict', shape=[11, 1], dtype='float32') - cost = layers.warpctc(input=y_predict, label=y) + + label = fluid.layers.data(shape=[11, 8], dtype='float32', lod_level=1) + predict = fluid.layers.data(shape=[11, 1], dtype='float32') + cost = fluid.layers.warpctc(input=predict, label=label) """ helper = LayerHelper('warpctc', **locals()) @@ -3407,16 +3712,20 @@ def sequence_reshape(input, new_dim): x is a LoDTensor: x.lod = [[0, 2, 6]] - x.data = [[1, 2], [3, 4], - [5, 6], [7, 8], [9, 10], [11, 12]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], + [9, 10], [11, 12]] x.dims = [6, 2] set new_dim = 4 then out is a LoDTensor: + out.lod = [[0, 1, 3]] - out.data = [[1, 2, 3, 4], - [5, 6, 7, 8], [9, 10, 11, 12]] + + out.data = [[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12]] out.dims = [3, 4] Currently, only 1-level LoDTensor is supported and please make sure @@ -3424,19 +3733,19 @@ def sequence_reshape(input, new_dim): no remainder for each sequence. Args: - input (Variable): (LodTensor, default: LoDTensor), a 2-D LoDTensor - with shape being [N, M] where M for dimension. - new_dim (int): New dimension which the input LoDTensor is reshaped to. + + input (Variable): A 2-D LoDTensor with shape being [N, M] where M for dimension. + new_dim (int): New dimension that the input LoDTensor is reshaped to. Returns: + Variable: Reshaped LoDTensor according to new dimension. Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[5, 20], - dtype='float32', lod_level=1) - x_reshaped = layers.sequence_reshape(input=x, new_dim=10) + x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1) + x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10) """ helper = LayerHelper('sequence_reshape', **locals()) out = helper.create_tmp_variable(helper.input_dtype()) @@ -3466,13 +3775,41 @@ def nce(input, input (Variable): input variable. label (Variable): label. num_total_classes (int):${num_total_classes_comment} - sample_weight (int): ${sample_weight_comment} + sample_weight (Variable|None): A Variable of shape [batch_size, 1] + storing a weight for each sample. The default weight for each + sample is 1.0. 
param_attr (ParamAttr|None): attributes for parameter bias_attr (ParamAttr|None): attributes for bias num_neg_samples (int): ${num_neg_samples_comment} - + Returns: - Variable: output of nce layer. + Variable: The output nce loss. + + Examples: + .. code-block:: python + + window_size = 5 + words = [] + for i in xrange(window_size): + words.append(layers.data( + name='word_{0}'.format(i), shape=[1], dtype='int64')) + + dict_size = 10000 + label_word = int(window_size / 2) + 1 + + embs = [] + for i in xrange(window_size): + if i == label_word: + continue + + emb = layers.embedding(input=words[i], size=[dict_size, 32], + param_attr='emb.w', is_sparse=True) + embs.append(emb) + + embs = layers.concat(input=embs, axis=1) + loss = layers.nce(input=embs, label=words[label_word], + num_total_classes=dict_size, param_attr='nce.w', + bias_attr='nce.b') """ helper = LayerHelper('nce', **locals()) assert isinstance(input, Variable) @@ -3590,8 +3927,6 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None): def transpose(x, perm, name=None): """ - **transpose Layer** - Permute the dimensions of `input` according to `perm`. The `i`-th dimension of the returned tensor will correspond to the @@ -3681,8 +4016,6 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): Examples: - As an example: - .. code-block:: text Given: @@ -3724,9 +4057,9 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): output.dims = {8, 9} - output.lod = [[0, 4, 8]] + output.lod = [[4, 4]] - The simple usage is: + Examples: .. code-block:: python @@ -3759,29 +4092,13 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): return out +@templatedoc() def row_conv(input, future_context_size, param_attr=None, act=None): - """Row Conv Operator. This layer will apply lookahead convolution to - **input**. The input variable should be a 2D LoDTensor with shape [T, D]. - Parameters with shape [future_context_size + 1, D] will be created. The math - equation of row convolution is as follows: - - .. math:: - Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j} - - In the above equation: - - * :math:`Out_{i}`: The i-th row of output variable with shape [1, D]. - * :math:`\\tau`: Future context size. - * :math:`X_{j}`: The j-th row of input variable with shape [1, D]. - * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D]. - - More details about row_conv please refer to the paper \ - (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and - the design document \ - (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645). + """ + ${comment} Args: - input (Variable): Input variable, a 2D LoDTensor with shape [T, D]. + input (${x_type}): ${x_comment}. future_context_size (int): Future context size. Please note, the shape of convolution kernel is [future_context_size + 1, D]. param_attr (ParamAttr): Attributes of parameters, including @@ -3789,14 +4106,13 @@ def row_conv(input, future_context_size, param_attr=None, act=None): act (str): Non-linear activation to be applied to output variable. Returns: - Variable: The output tensor with same shape as input tensor. + ${out_comment}. Examples: - .. 
code-block:: python - - x = fluid.layers.data(name='x', shape=[16], - dtype='float32', lod_level=1) - out = fluid.layers.row_conv(input=x, future_context_size=2) + >>> import paddle.fluid as fluid + >>> x = fluid.layers.data(name='x', shape=[16], + >>> dtype='float32', lod_level=1) + >>> out = fluid.layers.row_conv(input=x, future_context_size=2) """ helper = LayerHelper('row_conv', **locals()) dtype = helper.input_dtype() @@ -3812,42 +4128,23 @@ def row_conv(input, future_context_size, param_attr=None, act=None): return helper.append_activation(out) +@templatedoc() def multiplex(inputs, index): """ - **Multiplex Layer** - - Referring to the given index variable, this layer selects rows from the - input variables to construct a multiplex variable. Assuming that there are - :math:`m` input variables and :math:`I_i` represents the i-th input - variable and :math:`i` is in [0, :math:`m`). All input variables are - tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. - Please note that rank of the input tensor should be at least 2. Each input - variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`] - where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2` - * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input - variable. The given index variable should be a 2-D tensor with shape - [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable. - Then the output variable will be a tensor with shape [:math:`d_0`, - :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D - matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th - row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`. + ${comment} + + >>> import paddle.fluid as fluid + >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') + >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') + >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32') + >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index) Args: - inputs (list): A list of variables to gather from. All variables have the - same shape and the rank is at least 2. - index (Variable): Tensor, index variable which is a 2-D tensor - with shape [M, 1] where M is the batch size. + inputs (list): ${x_comment}. + index (${ids_type}): ${ids_comment}. Returns: - Variable: Multiplex variable gathered from input variables. - - Examples: - .. code-block:: python - - x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') - x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') - index = fluid.layers.data(name='index', shape=[1], dtype='int32') - out = fluid.layers.multiplex(inputs=[x1, x2], index=index) + ${out_comment}. """ helper = LayerHelper('multiplex', **locals()) @@ -3933,31 +4230,30 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): """ - **Smooth L1 Loss Operator. ** - - This operator computes the smooth L1 loss for X and Y. - The operator takes the first dimension of X and Y as batch size. + This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`. + It takes the first dimension of :attr:`x` and :attr:`y` as batch size. For each instance, it computes the smooth L1 loss element by element first - and then sums all the losses. So the shape of Out is [batch_size, 1]. + and then sums all the losses. So the shape of ouput Variable is + [batch_size, 1]. 
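For reference, the element-wise function usually meant by smooth L1 with a :math:`\sigma` hyper parameter can be sketched as below (plain Python, assumed Fast R-CNN style definition, not taken from the patch):

.. code-block:: python

    def smooth_l1_elementwise(diff, sigma=1.0):
        # assumed definition, applied to one element of (x - y)
        if abs(diff) < 1.0 / (sigma * sigma):
            return 0.5 * (sigma * diff) ** 2
        return abs(diff) - 0.5 / (sigma * sigma)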
Args: x (Variable): A tensor with rank at least 2. The input value of smooth L1 loss op with shape [batch_size, dim1, ..., dimN]. y (Variable): A tensor with rank at least 2. The target value of smooth - L1 loss op with same shape as x. + L1 loss op with same shape as :attr:`x`. inside_weight (Variable|None): A tensor with rank at least 2. This - input is optional and should have same shape with x. If provided, - the result of (x - y) will be multiplied by this tensor element by - element. + input is optional and should have same shape with :attr:`x`. If + provided, the result of (:attr:`x` - :attr:`y`) will be multiplied + by this tensor element by element. outside_weight (Variable|None): A tensor with rank at least 2. This - input is optional and should have same shape with x. If provided, - the out smooth L1 loss will be multiplied by this tensor element - by element. - sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar - with default value 1.0. + input is optional and should have same shape with :attr:`x`. If + provided, the out smooth L1 loss will be multiplied by this tensor + element by element. + sigma (float|None): Hyper parameter of smooth L1 loss layer. A float + scalar with default value 1.0. + Returns: - Variable: A tensor with rank be 2. The output smooth L1 loss with - shape [batch_size, 1]. + Variable: The output smooth L1 loss with shape [batch_size, 1]. Examples: .. code-block:: python @@ -3968,6 +4264,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): fc = fluid.layers.fc(input=data, size=100) out = fluid.layers.smooth_l1(x=fc, y=label) """ + helper = LayerHelper('smooth_l1_loss', **locals()) diff = helper.create_tmp_variable(dtype=x.dtype) loss = helper.create_tmp_variable(dtype=x.dtype) @@ -3987,32 +4284,20 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): def one_hot(input, depth): """ - One Hot Operator. This operator creates the one-hot representations for input - index values. The following example will help to explain the function of this - operator. + This layer creates the one-hot representations for input indices. Args: - input(variable): A Tensor/LodTensor of indices, last dimension must be 1. - depth(scalar): an interger defining the depth of the one hot dimension. + input(Variable): Input indices, last dimension must be 1. + depth(scalar): An interger defining the depth of the one-hot dimension. Returns: - The one-hot tensor or LodTensor, same as input. + Variable: The one-hot representations of input. Examples: .. code-block:: python - X is a LoDTensor: - X.lod = [[0, 1, 4]] - X.shape = [4, 1] - X.data = [[1], [1], [3], [0]] - set depth = 4 - Out is a LoDTensor: - Out.lod = [[0, 1, 4]] - Out.shape = [4, 4] - Out.data = [[0., 1., 0., 0.], - [0., 1., 0., 0.], - [0., 0., 0., 1.], - [1., 0., 0., 0.]] + label = layers.data(name="label", shape=[1], dtype="float32") + one_hot_label = layers.one_hot(input=label, depth=10) """ helper = LayerHelper("one_hot", **locals()) one_hot_out = helper.create_tmp_variable(dtype='float32') @@ -4026,8 +4311,9 @@ def one_hot(input, depth): def autoincreased_step_counter(counter_name=None, begin=1, step=1): """ - NOTE: The counter will be automatically increased by 1 every mini-batch - Return the run counter of the main program, which is started with 1. + Create an auto-increase variable + which will be automatically increased by 1 every mini-batch + Return the run counter of the main program, default is started from 1. 
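The counter's intended semantics can be mimicked in plain Python (a sketch assuming the value starts at `begin` and grows by `step` once per mini-batch; not the fluid implementation):

.. code-block:: python

    def make_step_counter(begin=1, step=1):
        # returns a callable yielding begin, begin + step, begin + 2 * step, ...
        state = {'value': begin}

        def next_value():
            current = state['value']
            state['value'] += step
            return current

        return next_value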
Args: counter_name(str): The counter name, default is '@STEP_COUNTER@'. @@ -4036,6 +4322,12 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): Returns: Variable: The global run counter. + + Examples: + .. code-block:: python + + global_step = fluid.layers.autoincreased_step_counter( + counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1) """ helper = LayerHelper('global_step_counter') if counter_name is None: @@ -4105,14 +4397,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): say :attr:`actual_shape` has a higher priority than :attr:`shape`. act (str): The non-linear activation to be applied to output variable. - inplace(bool): If this flag is set true, a new output tensor is created - whose data is copied from input x, otherwise the output - shares data with input without copying. + inplace(bool): If this flag is set true, the output + shares data with input without copying, otherwise + a new output tensor is created + whose data is copied from input x. name (str): The name of this layer. It is optional. Returns: Variable: The output tensor. + Raises: + TypeError: if actual_shape is neither Variable nor None. + Examples: .. code-block:: python @@ -4124,6 +4420,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): if not (isinstance(shape, list) or isinstance(shape, tuple)): raise ValueError("Input shape must be a python lsit or tuple.") + inputs = {"X": x} + if isinstance(actual_shape, Variable): + inputs["Shape"] = actual_shape + elif actual_shape is not None: + raise TypeError("actual_shape should either be Variable or None") # Validate the shape unk_dim_idx = -1 @@ -4144,9 +4445,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): reshaped = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", - inputs={"X": x, - "Shape": actual_shape} - if isinstance(actual_shape, Variable) else {"X": x}, + inputs=inputs, attrs={"shape": shape, "inplace": inplace}, outputs={"Out": reshaped}) @@ -4156,73 +4455,74 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): def lod_reset(x, y=None, target_lod=None): """ - LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or - **target_lod**. When **y** provided, **y.lod** would be considered as target - LoD first, otherwise **y.data** would be considered as target LoD. If **y** - is not provided, target LoD should be specified by **target_lod**. - If target LoD is specified by **Y.data** or **target_lod**, only one level - LoD is supported. + Set LoD of :attr:`x` to a new one specified by :attr:`y` or + :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be + considered as target LoD first, otherwise :attr:`y.data` would be + considered as target LoD. If :attr:`y` is not provided, target LoD should + be specified by :attr:`target_lod`. If target LoD is specified by + :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported. .. 
code-block:: text * Example 1: Given a 1-level LoDTensor x: - x.lod = [[ 0, 2, 5 6 ]] + x.lod = [[ 2, 3, 1 ]] x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] x.dims = [6, 1] - target_lod: [0, 4, 6] + target_lod: [4, 2] then we get a 1-level LoDTensor: - out.lod = [[ 0, 4, 6 ]] + out.lod = [[4, 2]] out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] out.dims = [6, 1] * Example 2: Given a 1-level LoDTensor x: - x.lod = [[ 0, 2, 5 6 ]] + x.lod = [[2, 3, 1]] x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] x.dims = [6, 1] y is a Tensor: - y.data = [[0, 2, 6]] + y.data = [[2, 4]] y.dims = [1, 3] then we get a 1-level LoDTensor: - out.lod = [[ 0, 2, 6 ]] + out.lod = [[2, 4]] out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] out.dims = [6, 1] * Example 3: Given a 1-level LoDTensor x: - x.lod = [[ 0, 2, 5 6 ]] + x.lod = [[2, 3, 1]] x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] x.dims = [6, 1] y is a 2-level LoDTensor: - y.lod = [[0, 2, 4], [0, 2, 5, 6]] + y.lod = [[2, 2], [2, 2, 1, 1]] y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]] y.dims = [6, 1] then we get a 2-level LoDTensor: - out.lod = [[0, 2, 4], [0, 2, 5, 6]] + out.lod = [[2, 2], [2, 2, 1, 1]] out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]] out.dims = [6, 1] Args: x (Variable): Input variable which could be a Tensor or LodTensor. - y (Variable|None): If provided, output's LoD would be derived from y. + y (Variable|None): If provided, output's LoD would be derived + from :attr:`y`. target_lod (list|tuple|None): One level LoD which should be considered - as target LoD when y not provided. + as target LoD when :attr:`y` not provided. Returns: - Variable: Output variable with LoD specified by this operator. + Variable: Output variable with LoD specified by this layer. Raises: - ValueError: If y and target_lod are both None. + ValueError: If :attr:`y` and :attr:`target_lod` are both None. Examples: .. code-block:: python @@ -4258,9 +4558,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): .. math:: - Output(i, x, y) = Input(i, x, y) / \left( - k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} - (Input(j, x, y))^2 \right)^{\beta} + Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C, c + n/2)}_{j = \\max(0, c - n/2)}(Input(j, x, y))^2\\right)^{\\beta} In the above equation: @@ -4444,34 +4742,20 @@ def label_smooth(label, return smooth_label +@templatedoc() def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): """ - Region of interest pooling (also known as RoI pooling) is to perform - is to perform max pooling on inputs of nonuniform sizes to obtain - fixed-size feature maps (e.g. 7*7). - The operator has three steps: - 1. Dividing each region proposal into equal-sized sections with - the pooled_width and pooled_height - 2. Finding the largest value in each section - 3. Copying these max values to the output buffer + ${comment} Args: - input (Variable): The input for ROI pooling. - rois (Variable): ROIs (Regions of Interest) to pool over. It should - be a 2-D one level LoTensor of shape [num_rois, 4]. - The layout is [x1, y1, x2, y2], where (x1, y1) - is the top left coordinates, and (x2, y2) is the - bottom right coordinates. The num_rois is the - total number of ROIs in this batch data. - pooled_height (integer): The pooled output height. Default: 1 - pooled_width (integer): The pooled output width. Default: 1 - spatial_scale (float): Multiplicative spatial scale factor. 
To - translate ROI coords from their input scale - to the scale used when pooling. Default: 1.0 + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 Returns: - pool_out (Variable): The output is a 4-D tensor of the shape - (num_rois, channels, pooled_h, pooled_w). + Variable: ${out_comment}. Examples: .. code-block:: python @@ -4543,12 +4827,13 @@ def image_resize(input, name=None, resample='BILINEAR'): """ - Resize a batch of images. + **Resize a Batch of Images** - The input must be a tensor of the shape (num_batches, channels, in_h, in_w), + The input must be a tensor of the shape (num_batches, channels, in_h, in_w), and the resizing only applies on the last two dimensions(hight and width). Supporting resample methods: + 'BILINEAR' : Bilinear interpolation Args: @@ -4568,8 +4853,8 @@ def image_resize(input, Default: 'BILINEAR' Returns: - out (Variable): The output is a 4-D tensor of the shape - (num_batches, channls, out_h, out_w). + Variable: The output is a 4-D tensor of the shape + (num_batches, channls, out_h, out_w). Examples: .. code-block:: python @@ -4640,9 +4925,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None): def image_resize_short(input, out_short_len, resample='BILINEAR'): """ - Resize a batch of images. The short edge of input images will be - resized to the given 'out_short_len'. The long edge of input images - will be resized proportionately to make images' length-width ratio + Resize a batch of images. The short edge of input images will be + resized to the given 'out_short_len'. The long edge of input images + will be resized proportionately to make images' length-width ratio constant. Args: @@ -4653,8 +4938,8 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): resample (str): resample method, default: BILINEAR. Returns: - out (Variable): The output is a 4-D tensor of the shape - (num_batches, channls, out_h, out_w). + Variable: The output is a 4-D tensor of the shape + (num_batches, channls, out_h, out_w). """ in_shape = input.shape if len(in_shape) != 4: @@ -4673,7 +4958,9 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): def gather(input, index): """ - Output is obtained by gathering entries of the outer-most dimension + **Gather Layer** + + Output is obtained by gathering entries of the outer-most dimension of X indexed by `index` and concatenate them together. .. math:: @@ -4698,7 +4985,7 @@ def gather(input, index): [5, 6]] Args: - input (Variable): The source input with rank>=1. + input (Variable): The source input with rank>=1. index (Variable): The index input with rank=1. 
Returns: @@ -4726,10 +5013,6 @@ def random_crop(x, shape, seed=None): """ ${comment} - Examples: - >>> img = fluid.layers.data("img", [3, 256, 256]) - >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) - Args: x(${x_type}): ${x_comment} shape(${shape_type}): ${shape_comment} @@ -4739,63 +5022,115 @@ def random_crop(x, shape, seed=None): Returns: ${out_comment} + Examples: + >>> img = fluid.layers.data("img", [3, 256, 256]) + >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) """ helper = LayerHelper("random_crop", **locals()) - dtype = helper.input_dtype() + dtype = x.dtype out = helper.create_tmp_variable(dtype) if seed is None: seed = random.randint(-65536, 65535) - + op_attrs = {"shape": shape} if isinstance(seed, int): - seed_value = seed - seed = helper.create_tmp_variable(dtype="int64") - helper.append_op( - type="fill_constant", - inputs={}, - outputs={"Out": seed}, - attrs={ - "dtype": seed.dtype, - "shape": [1], - "value": float(seed_value) - }) + op_attrs["startup_seed"] = seed + seed = helper.create_variable( + name=unique_name.generate("random_crop_seed"), + dtype="int64", + persistable=True) elif not isinstance(seed, Variable): raise ValueError("'seed' must be a Variable or an int.") - seed_out = helper.create_tmp_variable(dtype="int64") helper.append_op( type="random_crop", inputs={"X": x, "Seed": seed}, outputs={"Out": out, - "SeedOut": seed_out}, - attrs={"shape": shape}) + "SeedOut": seed}, + attrs=op_attrs) + return out + + +def log(x): + """ + Calculates the natural log of the given input tensor, element-wise. + + .. math:: + + Out = \\ln(x) + + Args: + x (Variable): Input tensor. + + Returns: + Variable: The natural log of the input tensor computed element-wise. + + Examples: + + .. code-block:: python + + output = fluid.layers.log(x) + """ + helper = LayerHelper('log', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_tmp_variable(dtype) + helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out}) + return out + + +def relu(x): + """ + Relu takes one input data (Tensor) and produces one output data (Tensor) + where the rectified linear function, y = max(0, x), is applied to + the tensor elementwise. + + .. math:: + + Out = \\max(0, x) + + Args: + x (Variable): The input tensor. + + Returns: + Variable: The output tensor with the same shape as input. + + Examples: + + .. code-block:: python + + output = fluid.layers.relu(x) + """ + helper = LayerHelper('relu', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_tmp_variable(dtype) + helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) return out def mean_iou(input, label, num_classes): """ Mean Intersection-Over-Union is a common evaluation metric for - semantic image segmentation, which first computes the IOU for each - semantic class and then computes the average over classes. - IOU is defined as follows: - + semantic image segmentation, which first computes the IOU for each + semantic class and then computes the average over classes. + IOU is defined as follows: + .. math:: - - IOU = true_positive / (true_positive + false_positive + false_negative). - The predictions are accumulated in a confusion matrix and mean-IOU + IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. + + The predictions are accumulated in a confusion matrix and mean-IOU is then calculated from it. 
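The mean_iou layer introduced above accumulates a confusion matrix and averages the per-class IoU given by the formula in its docstring. A minimal NumPy sketch of that computation (illustrative only; ``mean_iou_numpy`` is not part of fluid):

.. code-block:: python

    import numpy as np

    def mean_iou_numpy(pred, label, num_classes):
        # Reference mean IoU over flat integer class-id arrays.
        ious = []
        for c in range(num_classes):
            tp = np.sum((pred == c) & (label == c))   # true positives for class c
            fp = np.sum((pred == c) & (label != c))   # false positives for class c
            fn = np.sum((pred != c) & (label == c))   # false negatives for class c
            denom = tp + fp + fn
            if denom > 0:
                ious.append(tp / float(denom))
        return np.mean(ious) if ious else 0.0

    # mean_iou_numpy(np.array([0, 1, 1, 2]), np.array([0, 1, 2, 2]), 3) -> 2/3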
Args: input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64. - label (Variable): A Tensor of ground truth labels with type int32 or int64. + label (Variable): A Tensor of ground truth labels with type int32 or int64. Its shape should be the same as input. + num_classes (int): The possible number of labels. Returns: mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class. - out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. - + out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. Examples: @@ -4810,12 +5145,110 @@ def mean_iou(input, label, num_classes): out_correct = helper.create_tmp_variable(dtype='int32') helper.append_op( type="mean_iou", - inputs={"predictions": input, - "labels": label}, + inputs={"Predictions": input, + "Labels": label}, outputs={ - "out_mean_iou": out_mean_iou, - "out_wrong": out_wrong, - "out_correct": out_correct + "OutMeanIou": out_mean_iou, + "OutWrong": out_wrong, + "OutCorrect": out_correct }, attrs={"num_classes": num_classes}) return out_mean_iou, out_wrong, out_correct + + +def crop(x, shape=None, offsets=None, name=None): + """ + Crop input into output, as specified by offsets and shape. + + .. code-block:: text + + * Case 1: + Given + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]], + and + shape = [2, 2], + offsets = [0, 1], + output is: + Out = [[1, 2], + [3, 4]]. + * Case 2: + Given + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + and shape is tensor + shape = [[0, 0, 0] + [0, 0, 0]] + and + offsets = [0, 1], + + output is: + Out = [[1, 2, 5], + [3, 4, 6]]. + + Args: + x (Variable): The input tensor variable. + shape (Variable|list/tuple of integer): The output shape is specified + by `shape`, which can a Variable or a list/tupe of integer. + If a tensor Variable, it's rank must be the same as `x`. This way + is suitable for the case that the output shape may be changed each + iteration. If a list/tupe of integer, it's length must be the same + as the rank of `x` + offsets (Variable|list/tuple of integer|None): Specifies the copping + offsets at each dimension. It can be a Variable or or a list/tupe + of integer. If a tensor Variable, it's rank must be the same as `x`. + This way is suitable for the case that the offsets may be changed + each iteration. If a list/tupe of integer, it's length must be the + same as the rank of `x`. If None, the offsets are 0 at each + dimension. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The cropped tensor variable. + + Raises: + ValueError: If shape is not a list, tuple or Variable. + + Examples: + + .. 
code-block:: python + + x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32") + y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32") + crop = fluid.layers.crop(x, shape=y) + + # or + z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32") + crop = fluid.layers.crop(z, shape=[2, 3]) + + """ + helper = LayerHelper('crop', **locals()) + + if not (isinstance(shape, list) or isinstance(shape, tuple) or \ + isinstance(shape, Variable)): + raise ValueError("The shape should be a list, tuple or Variable.") + + if offsets is None: + offsets = [0] * len(x.shape) + + out = helper.create_tmp_variable(x.dtype) + ipts = {'X': x} + attrs = {} + if isinstance(shape, Variable): + ipts['Y'] = shape + else: + attrs['shape'] = shape + if isinstance(offsets, Variable): + ipts['Offsets'] = offsets + else: + attrs['offsets'] = offsets + + helper.append_op( + type='crop', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs) + return out diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 98f169e8f0881fbba6aecb45b43a52c8fd51132d..9e97ec9a6f55680a2eb44ad712ac002df4fecda5 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -17,7 +17,6 @@ __activations__ = [ 'sigmoid', 'logsigmoid', 'exp', - 'relu', 'tanh', 'tanh_shrink', 'softshrink', @@ -29,7 +28,6 @@ __activations__ = [ 'sin', 'round', 'reciprocal', - 'log', 'square', 'softplus', 'softsign', @@ -40,8 +38,6 @@ __activations__ = [ 'relu6', 'pow', 'stanh', - 'hard_shrink', - 'thresholded_relu', 'hard_sigmoid', 'swish', ] @@ -64,18 +60,102 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', - 'uniform_random', 'uniform_random_batch_size_like', 'gaussian_random', 'gaussian_random_batch_size_like', - 'cumsum', 'scatter', 'sum', 'slice', 'polygon_box_transform', 'shape', + 'iou_similarity', 'maxout', ] + __activations__ for _OP in set(__all__): globals()[_OP] = generate_layer_fn(_OP) + +__all__ += ["uniform_random"] + +_uniform_random_ = generate_layer_fn('uniform_random') + + +def uniform_random(shape, dtype=None, min=None, max=None, seed=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + return _uniform_random_(**kwargs) + + +uniform_random.__doc__ = _uniform_random_.__doc__ + """ +Examples: + + >>> result = fluid.layers.uniform_random(shape=[32, 784]) +""" + +__all__ += ['hard_shrink'] + +_hard_shrink_ = generate_layer_fn('hard_shrink') + + +def hard_shrink(x, threshold=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + return _hard_shrink_(**kwargs) + + +hard_shrink.__doc__ = _hard_shrink_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[784]) + >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) +""" + +__all__ += ['cumsum'] + +_cum_sum_ = generate_layer_fn('cumsum') + + +def cumsum(x, axis=None, exclusive=None, reverse=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + kwargs[name] = val + + return _cum_sum_(**kwargs) + + +cumsum.__doc__ = _cum_sum_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) +""" + +__all__ += ['thresholded_relu'] + +_thresholded_relu_ = generate_layer_fn('thresholded_relu') + + +def thresholded_relu(x, threshold=None): + kwargs = dict() + for name in locals(): + val = locals()[name] + if val is not None: + 
kwargs[name] = val + + _thresholded_relu_(**kwargs) + + +thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[1]) + >>> result = fluid.layers.thresholded_relu(data, threshold=0.4) +""" diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 62b01d595a812ee8fc094e40b6dfb5c3f56cd012..b6614ecf3bc16e73683f4991779769049c6800ed 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -6,7 +6,7 @@ # # http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software +# Unlessf required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and @@ -33,12 +33,32 @@ __all__ = [ 'fill_constant', 'argmin', 'argmax', + 'argsort', 'ones', 'zeros', + 'reverse', ] def create_tensor(dtype, name=None, persistable=False): + """ + Create an variable, which will hold a LoDTensor with data type dtype. + + Args: + dtype(string): 'float32'|'int32'|..., the data type of the + created tensor. + name(string): The name of the created tensor, if not set, + the name will be a random unique one. + persistable(bool): Set the persistable flag of the create tensor. + + Returns: + Variable: The tensor variable storing the created tensor. + + Examples: + .. code-block:: python + + tensor = fluid.layers.create_tensor(dtype='float32') + """ helper = LayerHelper("create_tensor", **locals()) return helper.create_variable( name=helper.name, dtype=dtype, persistable=persistable) @@ -51,7 +71,12 @@ def create_parameter(shape, is_bias=False, default_initializer=None): """ - Create a parameter + Create a parameter. The parameter is a learnable variable, which can have + gradient, and can be optimized. + + NOTE: this is a very low-level API. This API is useful when you create + operator by your self. instead of using layers. + Args: shape(list[int]): shape of the parameter dtype(string): element type of the parameter @@ -63,7 +88,12 @@ def create_parameter(shape, default_initializer(Initializer): initializer for the parameter Returns: - Parameter: the created parameter + the created parameter. + + Examples: + >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32') + >>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) + >>> hidden = fluid.layers.matmul(x=data, y=W) """ helper = LayerHelper("create_parameter", **locals()) if attr is None: @@ -79,16 +109,29 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a global variable. such as global_step + Create a new variable in the global block(block 0). + Args: shape(list[int]): shape of the variable - value(float): the value of the variable - dtype(string): element type of the parameter - persistable(bool): if this variable is persistable - force_cpu(bool): force this variable to be on CPU + value(float): the value of the variable. The new created + variable will be filled with it. + dtype(string): data type of the variable + persistable(bool): if this variable is persistable. + Default: False + force_cpu(bool): force this variable to be on CPU. + Default: False + name(str|None): The name of the variable. If set to None the variable + name will be generated automatically. 
+ Default: None Returns: Variable: the created Variable + + Examples: + .. code-block:: python + + var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', + persistable=True, force_cpu=True, name='new_var') """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( @@ -101,8 +144,21 @@ def create_global_var(shape, def cast(x, dtype): """ - This function takes in the input with input_dtype - and casts it to the output_dtype as the output. + This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts + it to the output with :attr:`dtype`. + + Args: + x (Variable): The input Variable for casting. + dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Variable. + + Returns: + Variable: The output Variable after casting. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='x', shape=[13], dtype='float32') + result = fluid.layers.cast(x=data, dtype='float64') """ helper = LayerHelper('cast', **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -133,7 +189,8 @@ def concat(input, axis=0, name=None): Examples: .. code-block:: python - out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) + + out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) """ helper = LayerHelper('concat', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -146,19 +203,21 @@ def concat(input, axis=0, name=None): def sums(input, out=None): - """This function performs the sum operation on the input and returns the + """ + This function performs the sum operation on the input and returns the result as the output. Args: input (Variable|list): The input tensor that has the elements that need to be summed up. + out (Variable|None): Output parameter. The sum result. + Default: None Returns: - Variable: The tensor type variable that has the sum of input - written to it. + Variable: the sum of input. The same as the argument 'out' Examples: - .. code-block::python + .. code-block:: python tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) @@ -172,11 +231,15 @@ def sums(input, out=None): helper = LayerHelper('sum', **locals()) if out is None: out = helper.create_tmp_variable(dtype=helper.input_dtype()) - helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out}) + helper.append_op( + type='sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'use_mkldnn': False}) return out -def assign(input, output): +def assign(input, output=None): """ **Assign** @@ -184,18 +247,21 @@ def assign(input, output): Args: input(Variable|numpy.ndarray): The source variable - output(Variable): The destination variable + output(Variable|None): The destination variable Returns: Variable: The destination variable that was supplied as the *output*. Examples: .. code-block:: python + out = fluid.layers.create_tensor(dtype='float32') hidden = fluid.layers.fc(input=data, size=10) fluid.layers.assign(hidden, out) """ helper = LayerHelper('assign', **locals()) + if output is None: + output = helper.create_tmp_variable(dtype=input.dtype) if isinstance(input, Variable): helper.append_op( type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) @@ -321,22 +387,22 @@ def argmin(x, axis=0): """ **argmin** - This function computes the indices of the min elements + This function computes the indices of the min elements of the input tensor's element along the provided axis. 
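The tensor utilities documented above (create_tensor, cast, sums, assign) compose naturally inside a fluid program; a short hedged sketch of such a composition (the layer names 'a' and 'b' are illustrative only):

.. code-block:: python

    import paddle.fluid as fluid

    # Cast two int32 inputs to float32, sum them element-wise, and copy the
    # result into a pre-created tensor variable.
    a = fluid.layers.data(name='a', shape=[4], dtype='int32')
    b = fluid.layers.data(name='b', shape=[4], dtype='int32')
    a_f32 = fluid.layers.cast(x=a, dtype='float32')
    b_f32 = fluid.layers.cast(x=b, dtype='float32')
    total = fluid.layers.sums(input=[a_f32, b_f32])
    out = fluid.layers.create_tensor(dtype='float32')
    fluid.layers.assign(total, out)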
Args: x(Variable): The input to compute the indices of the min elements. axis(int): Axis to compute indices along. - + Returns: Variable: The tensor variable storing the output - + Examples: .. code-block:: python - + out = fluid.layers.argmin(x=in, axis=0) - out = fluid.layers.argmin(x=in, axis=-1) + out = fluid.layers.argmin(x=in, axis=-1) """ helper = LayerHelper("arg_min", **locals()) out = helper.create_tmp_variable(VarDesc.VarType.INT64) @@ -352,22 +418,22 @@ def argmax(x, axis=0): """ **argmax** - This function computes the indices of the max elements + This function computes the indices of the max elements of the input tensor's element along the provided axis. Args: x(Variable): The input to compute the indices of the max elements. axis(int): Axis to compute indices along. - + Returns: Variable: The tensor variable storing the output - + Examples: .. code-block:: python - + out = fluid.layers.argmax(x=in, axis=0) - out = fluid.layers.argmax(x=in, axis=-1) + out = fluid.layers.argmax(x=in, axis=-1) """ helper = LayerHelper("arg_max", **locals()) out = helper.create_tmp_variable(VarDesc.VarType.INT64) @@ -379,6 +445,58 @@ def argmax(x, axis=0): return out +def argsort(input, axis=-1, name=None): + """ + Performs sorting on the input Variable along the given axis, and outputs + sorted data Varibale and its corresponding index Variable with the same + shape as :attr:`input`. + + .. code-block:: text + + For example, the given axis is -1 and the input Variable + + input = [[0.15849551, 0.45865775, 0.8563702 ], + [0.12070083, 0.28766365, 0.18776911]], + + after argsort, the sorted Vairable becomes + + out = [[0.15849551, 0.45865775, 0.8563702 ], + [0.12070083, 0.18776911, 0.28766365]], + + and the sorted indices along the given axis turn outs to be + + indices = [[0, 1, 2], + [0, 2, 1]] + + Args: + input(Variable): The input Variable for sorting. + axis(int): The axis along which to sort the input Variable. When + :attr:`axis` < 0, the actual axis will be :attr:`axis` + + rank(:attr:`input`). Default -1, the last dimension. + name(str|None): (optional) A name for this layer. If set None, the + layer will be named automatically. + + Returns: + tuple: A tuple of sorted data Variable and the sorted indices. + + Examples: + .. code-block:: python + + input = fluid.layers.data(data=[2, 3]) + out, indices = fluid.layers.argsort(input, axis=0) + """ + helper = LayerHelper("argsort", **locals()) + out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True) + ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True) + helper.append_op( + type='argsort', + inputs={'X': input}, + outputs={'Out': out, + 'Indices': ids}, + attrs={'axis': axis}) + return out, ids + + def ones(shape, dtype, force_cpu=False): """ **ones** @@ -413,11 +531,12 @@ def zeros(shape, dtype, force_cpu=False): It also sets *stop_gradient* to True. Args: - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor + shape(tuple|list|None): Shape of output tensor. + dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor. + force_cpu(bool, default False): Whether to make output stay on CPU. Returns: - Variable: The tensor variable storing the output + Variable: The tensor variable storing the output. Examples: .. code-block:: python @@ -435,9 +554,9 @@ def reverse(x, axis): Args: x(Vairbale): the input to be reversed. - axis(int|tuple|list): Axis that along which order of elements - is reversed. 
If it is a tuple or a list, reversing - will be apply on each axis in the tuple or list. + axis(int|tuple|list): Axis that along which order of elements + is reversed. If it is a tuple or a list, reversing + will be apply on each axis in the tuple or list. Returns: Variable: The reversed tensor. @@ -468,9 +587,9 @@ def save(x, file_path, overwrite=True): Args: x(variable): The Tensor/LoDTensor to be saved. file_path(str): The file path where the variable will be saved. - overwrite(bool): Whether or not cover the given file when it has already - existed. If it's set 'False' and the file is existed, a runtime - error will be thrown. + overwrite(bool): Whether or not cover the given file when it has already + existed. If it's set 'False' and the file is existed, a runtime + error will be thrown. """ helper = LayerHelper("save", **locals()) helper.append_op( @@ -486,11 +605,27 @@ def save_combine(x, file_path, overwrite=True): Saves a list of variables into a single file. Args: - x(list): A list of Tensor/LoDTensor to be saved together in a single file. + x(list): A list of Tensor/LoDTensor variables to be saved together in + a single file. file_path(str): The file path where variables will be saved. - overwrite(bool): Whether or not cover the given file when it has already - existed. If it's set 'False' and the file is existed, a runtime - error will be thrown. + overwrite(bool): Whether or not cover the given file when it has already + existed. If it's set 'False' and the file is existed, a runtime + error will be thrown. + + Returns: + There is no return value. + + Examples: + + .. code-block:: python + + v1 = fluid.layers.data(name="data", + shape=(4, 6), + dtype="float32") + v2 = fluid.layers.data(name="data", + shape=(6, 8, 4), + dtype="float32") + normed = fluid.layers.save_combine([v1, v2], file_path="output") """ helper = LayerHelper("save_combine", **locals()) helper.append_op( diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py index 9946d0a4ff33b2f5040f6d2e31aa20fcf9c609a7..b2b3186c1e8dd84e1527ff18744bd611f1f74c5f 100644 --- a/python/paddle/fluid/lod_tensor.py +++ b/python/paddle/fluid/lod_tensor.py @@ -18,172 +18,117 @@ import numpy as np __all__ = ['create_lod_tensor', 'create_random_int_lodtensor'] -def _validate_lod(lod, tensor_height=-1): - """Check whether the input length-based lod info is valid. - - There are several things to check: - 1. lod should be a list of lists. Empty list is fine. - 2. The length of each sublist (a lod level) should be at least one. - 3. Each element in each lod level should be an integer greater than 0. - 4. The sum of one lod level should be equal to the length of the next lod level. - 5. The sum of the last lod level should be equal to the tensor height. - Bypass this check if user does not provide tensor_height as input. - - Args: - lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]]. - tensor_height: the outermost dimension of the tensor with which the input - lod is associated with. - - Returns: - A boolean indicating whether the input lod is valid or not. 
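The lod_tensor helpers being removed in this hunk converted a length-based LoD such as [[2, 3]] into the offset-based form [[0, 2, 5]] before setting it on the tensor; a pure-Python sketch of that conversion (illustrative, mirroring what _convert_lod did and what the new set_recursive_sequence_lengths call replaces):

.. code-block:: python

    def lengths_to_offsets(recursive_seq_lens):
        # Length-based LoD [[2, 3], [2, 1, 2, 3, 4]] maps to offset-based
        # LoD [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
        offsets = []
        for level in recursive_seq_lens:
            cur = [0]
            for length in level:
                cur.append(cur[-1] + length)
            offsets.append(cur)
        return offsets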
+def create_lod_tensor(data, recursive_seq_lens, place): """ - assert isinstance(lod, list), "lod should be a list" - # Empty lod is fine - if len(lod) == 0: - return True - - lod_sum = [] - for level in lod: - assert isinstance(level, list), "each item in lod should be a list" - # Each level of lod should have at least one length info - if len(level) < 1: - return False - level_sum = 0 - for lod_len in level: - # Each length in a level should be > 0 - if lod_len <= 0: - return False - level_sum += lod_len - lod_sum.append(level_sum) - - for idx, val in enumerate(lod_sum[:-1]): - # Each level's sum should be equal to - # the number of items in the next level - if val != len(lod[idx + 1]): - return False - - if tensor_height == -1: - return True - else: - # Last level's sum should be equal to the tensor height - return lod_sum[-1] == tensor_height - - -def _convert_lod(lod): - """Convert a length-based lod to a offset-based lod. - - If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]], - then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]]. - - Args: - lod: a length-based lod info. + Create a lod tensor from a numpy array, a list, or an existing lod tensor. - Returns: - A list of lists as the offset-based lod converted to from the input lod. - """ - new_lod = [] - for level in lod: - cur_len = 0 - new_level = [cur_len] - for lod_len in level: - cur_len += lod_len - new_level.append(cur_len) - new_lod.append(new_level) - return new_lod + Create a lod tensor by doing the following: + 1. Check that the length-based level of detail (LoD) also known as + recursive_sequence_lengths of the input is valid. -def create_lod_tensor(data, lod, place): - """Create a lod tensor from a numpy array, a list, or an existing lod tensor. + 2. Convert recursive_sequence_lengths to a offset-based LoD. - Create a lod tensor by doing the following: - 1. Check that the length-based input lod is valid. - 2. Convert the length-based lod to a offset-based LoD. - 3. Copy the data from a numpy array, a list or a existing lod tensor to + 3. Copy the data from a numpy array, a list or a existing lod tensor to CPU or GPU device (based on input place). + 4. Set the level of detail (LoD) using the offset-based LoD. - Use example: - Suppose we want LoDTensor to hold data for sequences of word, where each word is - represented by an integer. If we want to create a LoDTensor to represent two - sentences, one of 2 words, and one of 3 words. + Examples: - Then 'data' can be a numpy array of integers with shape (5, 1). - 'lod' will be [[2, 3]], indicating the length(# of words) in each sentence. - This length-based input lod [[2, 3]] will be converted to offset-based lod [[0, 2, 5]] - inside the function call. + Suppose we want LoDTensor to hold data for sequences of word, where each + word is represented by an integer. If we want to create a LoDTensor to + represent two sentences, one of 2 words, and one of 3 words. - Please refer to - github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md - for more details regarding LoD. + Then :code:`data` can be a numpy array of integers with shape (5, 1). + :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each + sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to + offset-based LoD [[0, 2, 5]] inside the function call. + + Please reference :ref:`api_guide_low_level_lod_tensor` for more details + regarding LoD. 
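A minimal usage sketch of the rewritten create_lod_tensor with a length-based recursive_seq_lens (CPU place assumed; the array contents are illustrative):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.lod_tensor import create_lod_tensor

    # Two "sentences" of 2 and 3 words; each word is an int64 id of shape [1].
    data = np.arange(5).reshape(5, 1).astype('int64')
    t = create_lod_tensor(data, recursive_seq_lens=[[2, 3]], place=fluid.CPUPlace())
    # The length-based LoD [[2, 3]] corresponds to the offset-based LoD [[0, 2, 5]].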
Args: - data: a numpy array or a LoDTensor or a list holding the data to be copied. - lod: a list of lists indicating the length-based LoD info specified by the user. - place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. + data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a + list holding the data to be copied. + recursive_seq_lens(list): a list of lists indicating the length-based level of detail + info specified by the user. + place(Place): CPU or GPU place indicating where the data in the new + LoDTensor will be stored. Returns: - A fluid LoDTensor object with tensor data and lod info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. """ if isinstance(data, core.LoDTensor): - return create_lod_tensor(np.array(data), lod, place) + return create_lod_tensor(np.array(data), recursive_seq_lens, place) elif isinstance(data, list): # When input data is a list, it only deal with the case where the base element # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number # of words or other indexes in the sequence. - new_lod = [] + new_recursive_seq_lens = [] for seq in data: - new_lod.append(len(seq)) - assert [new_lod] == lod, "data and lod do not match" + new_recursive_seq_lens.append(len(seq)) + assert [ + new_recursive_seq_lens + ] == recursive_seq_lens, "data and recursive_seq_lens do not match" flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - return create_lod_tensor(flattened_data, lod, place) + return create_lod_tensor(flattened_data, recursive_seq_lens, place) elif isinstance(data, np.ndarray): - assert _validate_lod(lod, - data.shape[0]), "the provided lod info is invalid" tensor = core.LoDTensor() tensor.set(data, place) - tensor.set_lod(_convert_lod(lod)) + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + assert tensor.has_valid_recursive_sequence_lengths( + ), "the provided lod info is invalid" return tensor else: raise TypeError( "data should be either a LoDTensor, a Numpy array or a list") -def create_random_int_lodtensor(lod, base_shape, place, low, high): - """Create a LoDTensor containing random integers. +def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low, + high): + """ + Create a LoDTensor containing random integers. - This function is frequently used in the book examples. So we revised it based on - the new create_lod_tensor API and put it here in the lod_tensor module to simplify - the code. + This function is frequently used in the book examples. So we revised it + based on the new create_lod_tensor API and put it here in the lod_tensor + module to simplify the code. The function does the following: - 1. Calculate the overall shape of the LoDTensor based on the length-based 'lod' input - and the shape of the basic element in 'base_shape'. + + 1. Calculate the overall shape of the LoDTensor based on the length-based + :code:`recursive_seq_lens` input and the shape of the basic element in + :code:`base_shape`. + 2. Create a numpy array of this shape. + 3. Create the LoDTensor using create_lod_tensor API. - Suppose we want LoDTensor to hold data for sequences of word, where each word is - represented by an integer. If we want to create a LoDTensor to represent two - sentences, one of 2 words, and one of 3 words. Then 'base_shape' is [1], input - length-based 'lod' is [[2, 3]]. 
Then the overall shape of the LoDTensor would be - [5, 1], holding 5 words for two sentences. + Suppose we want LoDTensor to hold data for sequences of word, where each + word is represented by an integer. If we want to create a LoDTensor to + represent two sentences, one of 2 words, and one of 3 words. Then + 'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. + Then the overall shape of the LoDTensor would be [5, 1], holding 5 words + for two sentences. Args: - data: a numpy array or a LoDTensor holding the data to be copied. - lod: a list of lists indicating the length-based LoD info specified by the user. - base_shape: the shape of the basic element to be held by the LoDTensor. - place: CPU or GPU place indicating where the data in the new LoDTensor will be stored. - low: the lower bound of the random integers. - high: the upper bound of the random integers. + recursive_seq_lens(list): a list of lists indicating the length-based + level of detail info specified by the user. + base_shape(list): the shape of the basic element to be held by the + LoDTensor. + place(Place): CPU or GPU place indicating where the data in the new + LoDTensor will be stored. + low(int): the lower bound of the random integers. + high(int): the upper bound of the random integers. Returns: - A fluid LoDTensor object with tensor data and lod info. + A fluid LoDTensor object with tensor data and recursive_seq_lens info. """ assert isinstance(base_shape, list), "base_shape should be a list" - converted_lod = _convert_lod(lod) # append the total number of basic elements to the front of its shape - overall_shape = [converted_lod[-1][-1]] + base_shape + overall_shape = [sum(recursive_seq_lens[-1])] + base_shape # the range of integer data elements is [low, high] data = np.random.random_integers(low, high, overall_shape).astype("int64") - return create_lod_tensor(data, lod, place) + return create_lod_tensor(data, recursive_seq_lens, place) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index bb9c6fdc60089fc2b43573a6421a6f9781d2d4a8..17bb0826a6ea86c98a069263dfab84b99e1177ad 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -23,6 +23,8 @@ import warnings __all__ = [ 'MetricBase', 'CompositeMetric', + 'Precision', + 'Recall', 'Accuracy', 'ChunkEvaluator', 'EditDistance', @@ -46,33 +48,34 @@ def _is_number_or_matrix_(var): class MetricBase(object): """ - Base Class for all evaluators + Base Class for all Metrics. + MetricBase define a group of interfaces for the + model evaluation methods. Metrics accumulate metric states between + consecutive minibatches, at every minibatch, use update + interface to add current minibatch value to global states. + Use eval to compute accumative metric value from last reset() + or from scratch on. + If you need to custom a new metric, please inherit from MetricBase and + custom implementation. Args: - name(str): The name of evaluator. such as, "accuracy". Used for generate - temporary variable name. - Interface: - Note(*) : the states is the attributes who not has _ prefix. - - get_config(): print current states and configuration - reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray), - Please override this method. - update(): update states at every minibatch - eval(): get metric evaluation in numpy type. + name(str): The name of metric instance. such as, "accuracy". + It needed if you want to distinct different metrics in a model. 
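Because the reworked MetricBase docstring asks users to inherit from it for custom metrics, a small sketch of such a subclass may help; MeanAbsoluteError below is hypothetical and only illustrates the reset/update/eval contract:

.. code-block:: python

    import numpy as np
    from paddle.fluid.metrics import MetricBase

    class MeanAbsoluteError(MetricBase):
        def __init__(self, name=None):
            super(MeanAbsoluteError, self).__init__(name)
            # No "_" prefix, so the default reset() clears these states.
            self.total = 0.0
            self.count = 0

        def update(self, preds, labels):
            self.total += float(np.abs(preds - labels).sum())
            self.count += labels.size

        def eval(self):
            return self.total / self.count if self.count else 0.0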
+ """ - def __init__(self, name, **kwargs): + def __init__(self, name): self._name = str(name) if name != None else self.__class__.__name__ - self._kwargs = kwargs if kwargs != None else dict() - self.reset() def __str__(self): return self._name def reset(self): """ - states is the attributes who not has _ prefix. - reset the states of metrics. + reset clear the states of metrics. By default, the states + are the members who do not has _ prefix, reset set them to inital states. + If you violate the implicit name rule, please also custom the reset + interface. """ states = { attr: value @@ -90,61 +93,231 @@ class MetricBase(object): setattr(self, attr, None) def get_config(self): + """ + Get the metric and current states. + The states are the members who do not has "_" prefix. + + Args: + None + + Returns: + dict: a dict of metric and states + """ states = { attr: value for attr, value in self.__dict__.iteritems() if not attr.startswith("_") } - config = copy.deepcopy(self._kwargs) + config = {} config.update({"name": self._name, "states": copy.deepcopy(states)}) return config - def update(self): - raise NotImplementedError() + def update(self, preds, labels): + """ + Updates the metric states at every minibatch. + One user can compute the minibatch metric via pure Python, or + via a c++ operator. + + Args: + preds(numpy.array): the predictions of current minibatch + labels(numpy.array): the labels of current minibatch, if the label is one-hot + or soft-label, should custom the corresponding update rule. + """ + raise NotImplementedError( + "Should not use it directly, please extend it.") def eval(self): - raise NotImplementedError() + """ + Evalute the current metrics based the accumulated states. + + Returns: + float|list(float)|numpy.array: the metrics via Python. + """ + raise NotImplementedError( + "Should not use it directly, please extend it.") class CompositeMetric(MetricBase): """ - Compute multiple metrics in each minibatch. + Composite multiple metrics in one instance. for example, merge F1, accuracy, recall into one Metric. + + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + comp = fluid.metrics.CompositeMetric() + acc = fluid.metrics.Precision() + recall = fluid.metrics.Recall() + comp.add_metric(acc) + comp.add_metric(recall) + for pass in range(PASSES): + comp.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + comp.update(preds=preds, labels=labels) + numpy_acc, numpy_recall = comp.eval() """ - def __init__(self, name=None, **kwargs): - super(CompositeMetric, self).__init__(name, kwargs) + def __init__(self, name=None): + super(CompositeMetric, self).__init__(name) self._metrics = [] def add_metric(self, metric): + """ + add one metric instance to CompositeMetric. + + Args: + metric: a instance of MetricBase. + """ if not isinstance(metric, MetricBase): raise ValueError("SubMetric should be inherit from MetricBase.") self._metrics.append(metric) + def update(self, preds, labels): + """ + Update every metrics in sequence. + + Args: + preds(numpy.array): the predictions of current minibatch + labels(numpy.array): the labels of current minibatch, if the label is one-hot + or soft-label, should custom the corresponding update rule. 
+ """ + for m in self._metrics: + ans.append(m.update(preds, labels)) + def eval(self): + """ + Evaluate every metrics in sequence. + + Returns: + list(float|numpy.array): a list of metrics value in Python. + """ ans = [] for m in self._metrics: ans.append(m.eval()) return ans +class Precision(MetricBase): + """ + Precision (also called positive predictive value) is the fraction of + relevant instances among the retrieved instances. + https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers + + Note Precision is different with Accuracy in binary classifiers. + accuracy = true positive / total instances + precision = true positive / all positive instance + + Examples: + .. code-block:: python + + metric = fluid.metrics.Precision() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_precision = metric.eval() + """ + + def __init__(self, name=None): + super(Precision, self).__init__(name) + self.tp = 0 # true positive + self.fp = 0 # false positive + + def update(self, preds, labels): + if not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray.") + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + sample_num = labels[0] + for i in range(sample_num): + pred = preds[i].astype("int32") + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + self.fp += 1 + + def eval(self): + ap = self.tp + self.fp + return float(self.tp) / ap if ap != 0 else .0 + + +class Recall(MetricBase): + """ + Recall (also known as sensitivity) is the fraction of + relevant instances that have been retrieved over the + total amount of relevant instances + + https://en.wikipedia.org/wiki/Precision_and_recall + + Examples: + .. code-block:: python + + metric = fluid.metrics.Recall() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_recall = metric.eval() + """ + + def __init__(self, name=None): + super(Recall, self).__init__(name) + self.tp = 0 # true positive + self.fn = 0 # false negtive + + def update(self, preds, labels): + if not _is_numpy_(preds): + raise ValueError("The 'preds' must be a numpy ndarray.") + if not _is_numpy_(labels): + raise ValueError("The 'labels' must be a numpy ndarray.") + sample_num = labels[0] + for i in range(sample_num): + pred = preds[i].astype("int32") + label = labels[i] + if label == 1: + if pred == label: + self.tp += 1 + else: + if pred != label: + self.fn += 1 + + def eval(self): + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else .0 + + class Accuracy(MetricBase): """ Accumulate the accuracy from minibatches and compute the average accuracy for every pass. + https://en.wikipedia.org/wiki/Accuracy_and_precision Args: name: the metrics name - Example: - minibatch_accuracy = fluid.layers.accuracy(pred, label) - accuracy_evaluator = fluid.metrics.Accuracy() - for epoch in PASS_NUM: - accuracy_evaluator.reset() - for data in batches: - loss = exe.run(fetch_list=[cost, minibatch_accuracy]) - accuracy_evaluator.update(value=minibatch_accuracy, weight=batches) - accuracy = accuracy_evaluator.eval() + Examples: + .. 
code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + minibatch_accuracy = fluid.layers.accuracy(pred, label) + accuracy_evaluator = fluid.metrics.Accuracy() + for pass in range(PASSES): + accuracy_evaluator.reset() + for data in train_reader(): + batch_size = data[0] + loss = exe.run(fetch_list=[cost, minibatch_accuracy]) + accuracy_evaluator.update(value=minibatch_accuracy, weight=batch_size) + numpy_acc = accuracy_evaluator.eval() """ def __init__(self, name=None): @@ -153,6 +326,13 @@ class Accuracy(MetricBase): self.weight = .0 def update(self, value, weight): + """ + Update minibatch states. + + Args: + value(float|numpy.array): accuracy of one minibatch. + weight(int|float): batch size. + """ if not _is_number_or_matrix_(value): raise ValueError( "The 'value' must be a number(int, float) or a numpy ndarray.") @@ -163,9 +343,8 @@ class Accuracy(MetricBase): def eval(self): if self.weight == 0: - raise ValueError( - "There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy." - ) + raise ValueError("There is no data in Accuracy Metrics. \ + Please check layers.accuracy output has added to Accuracy.") return self.value / self.weight @@ -174,6 +353,25 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. + For some basics of chunking, please refer to + 'Chunking with Support Vector Machines '. + ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, + and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. + + Examples: + .. code-block:: python + + labels = fluid.layers.data(name="data", shape=[1], dtype="int32") + data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32") + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval( + input=pred, + label=label) + metric = fluid.metrics.ChunkEvaluator() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks) + numpy_precision, numpy_recall, numpy_f1 = metric.eval() """ def __init__(self, name=None): @@ -183,9 +381,17 @@ class ChunkEvaluator(MetricBase): self.num_correct_chunks = 0 def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): + """ + Update the states based on the layers.chunk_eval() ouputs. + Args: + num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. + num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. + num_correct_chunks(int|float|numpy.array): The number of chunks both in Inference and Label on the + given mini-batch. + """ if not _is_number_or_matrix_(num_infer_chunks): raise ValueError( - "The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray." + "The 'num_infer_chunks' must be a number(int) or a numpy ndarray." 
) if not _is_number_or_matrix_(num_label_chunks): raise ValueError( @@ -212,21 +418,28 @@ class ChunkEvaluator(MetricBase): class EditDistance(MetricBase): """ + Edit distance is a way of quantifying how dissimilar two strings + (e.g., words) are to one another by counting the minimum number + of operations required to transform one string into the other. + Refer to https://en.wikipedia.org/wiki/Edit_distance + Accumulate edit distance sum and sequence number from mini-batches and compute the average edit_distance and instance error of all batches. Args: name: the metrics name - Example: - edit_distance_metrics = fluid.layers.edit_distance(input, label) - distance_evaluator = fluid.metrics.EditDistance() - for epoch in PASS_NUM: - distance_evaluator.reset() - for data in batches: - loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) - distance_evaluator.update(*edit_distance_metrics) - distance, instance_error = distance_evaluator.eval() + Examples: + .. code-block:: python + + distances, seq_num = fluid.layers.edit_distance(input, label) + distance_evaluator = fluid.metrics.EditDistance() + for epoch in PASS_NUM: + distance_evaluator.reset() + for data in batches: + loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) + distance_evaluator.update(distances, seq_num) + distance, instance_error = distance_evaluator.eval() In the above example: 'distance' is the average of the edit distance in a pass. @@ -264,16 +477,38 @@ class EditDistance(MetricBase): class DetectionMAP(MetricBase): """ Calculate the detection mean average precision (mAP). - - TODO (Dang Qingqing): update the following doc. - The general steps are as follows: - 1. calculate the true positive and false positive according to the input - of detection and labels. - 2. calculate mAP value, support two versions: '11 point' and 'integral'. - + mAP is the metric to measure the accuracy of object detectors + like Faster R-CNN, SSD, etc. + It is the average of the maximum precisions at different recall values. Please get more information from the following articles: https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 + + The general steps are as follows: + + 1. calculate the true positive and false positive according to the input + of detection and labels. + 2. calculate mAP value, support two versions: '11 point' and 'integral'. + + Examples: + .. code-block:: python + + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + batch_map = layers.detection_map( + input, + label, + class_num, + background_label, + overlap_threshold=overlap_threshold, + evaluate_difficult=evaluate_difficult, + ap_version=ap_version) + metric = fluid.metrics.DetectionMAP() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, batch_map]) + batch_size = data[0] + metric.update(value=batch_map, weight=batch_size) + numpy_map = metric.eval() """ def __init__(self, name=None): @@ -302,17 +537,18 @@ class DetectionMAP(MetricBase): class Auc(MetricBase): """ - Auc Metrics which adapts to binary classification. - Need to note that auc metrics compute the value via Python natively. + Auc metric adapts to the binary classification. + Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve + Need to note that auc metric compute the value via Python natively. If you concern the speed, please use the fluid.layers.auc instead. 
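The new Precision and Recall metrics above both reduce to counting true/false positives over a minibatch; a reference NumPy computation of the two quantities for binary 0/1 arrays (illustrative, not the fluid classes):

.. code-block:: python

    import numpy as np

    def precision_recall(preds, labels):
        preds = preds.astype('int32')
        tp = np.sum((preds == 1) & (labels == 1))
        fp = np.sum((preds == 1) & (labels == 0))
        fn = np.sum((preds == 0) & (labels == 1))
        precision = tp / float(tp + fp) if (tp + fp) else 0.0   # tp / predicted positives
        recall = tp / float(tp + fn) if (tp + fn) else 0.0      # tp / actual positives
        return precision, recall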
The `auc` function creates four local variables, `true_positives`, - `true_negatives`, `false_positives` and `false_negatives` that are used to - compute the AUC. To discretize the AUC curve, a linearly spaced set of - thresholds is used to compute pairs of recall and precision values. The area - under the ROC-curve is therefore computed using the height of the recall - values by the false positive rate, while the area under the PR-curve is the - computed using the height of the precision values by the recall. + `true_negatives`, `false_positives` and `false_negatives` that are used to + compute the AUC. To discretize the AUC curve, a linearly spaced set of + thresholds is used to compute pairs of recall and precision values. The area + under the ROC-curve is therefore computed using the height of the recall + values by the false positive rate, while the area under the PR-curve is the + computed using the height of the precision values by the recall. Args: name: metric name @@ -322,22 +558,32 @@ class Auc(MetricBase): curve. "NOTE: only implement the ROC curve type via Python now." + + Examples: + .. code-block:: python + + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + metric = fluid.metrics.Auc() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds, labels) + numpy_auc = metric.eval() """ def __init__(self, name, curve='ROC', num_thresholds=200): - super(MetricBase, self).__init__(name, curve, num_thresholds) + super(Auc, self).__init__(name=name) self._curve = curve self._num_thresholds = num_thresholds self._epsilon = 1e-6 - self.tp_list = np.ndarray((num_thresholds, )) - self.fn_list = np.ndarray((num_thresholds, )) - self.tn_list = np.ndarray((num_thresholds, )) - self.fp_list = np.ndarray((num_thresholds, )) + self.tp_list = np.zeros((num_thresholds, )) + self.fn_list = np.zeros((num_thresholds, )) + self.tn_list = np.zeros((num_thresholds, )) + self.fp_list = np.zeros((num_thresholds, )) - def update(self, labels, predictions, axis=1): + def update(self, preds, labels): if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") - if not _is_numpy_(predictions): + if not _is_numpy_(preds): raise ValueError("The 'predictions' must be a numpy ndarray.") kepsilon = 1e-7 # to account for floating point imprecisions @@ -350,12 +596,12 @@ class Auc(MetricBase): tp, fn, tn, fp = 0, 0, 0, 0 for i, lbl in enumerate(labels): if lbl: - if predictions[i, 0] >= thresh: + if preds[i, 1] >= thresh: tp += 1 else: fn += 1 else: - if predictions[i, 0] >= thresh: + if preds[i, 1] >= thresh: fp += 1 else: tn += 1 diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index bbedf6fde0872fd32d81c103bf5fe61449b7f57b..9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -26,16 +26,87 @@ def simple_img_conv_pool(input, filter_size, pool_size, pool_stride, - act, - param_attr=None, + pool_padding=0, pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + param_attr=None, + bias_attr=None, + act=None, use_cudnn=True, use_mkldnn=False): + """ + The simple_img_conv_pool is composed with one Convolution2d and one Pool2d. + + Args: + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + feature channel. + filter_size (int|list|tuple): The filter size. 
If filter_size is a list or + tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise, + the filter_size_H = filter_size_W = filter_size. + pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size + is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W). + Otherwise, the pool_size_H = pool_size_W = pool_size. + pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W). + Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. + pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or + tuple, it must contain two integers, (pool_padding_H, pool_padding_W). + Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0. + pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for + average-pooling. Default :math:`max`. + global_pooling (bool): Whether to use the global pooling. If global_pooling = true, + pool_size and pool_padding while be ignored. Default False + conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a + list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise, + the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1. + conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + a list or tuple, it must contain two integers, (conv_padding_H, conv_padding_W). + Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0. + conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is + a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W). + Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1. + conv_groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None + act (str): Activation type for Conv2d. Default: None + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. Default: False + + Return: + Variable: The result of input after Convolution2d and Pool2d. + + Examples: + .. 
code-block:: python + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + conv_pool = fluid.nets.simple_img_conv_pool(input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + """ conv_out = layers.conv2d( input=input, num_filters=num_filters, filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, param_attr=param_attr, + bias_attr=bias_attr, act=act, use_cudnn=use_cudnn, use_mkldnn=use_mkldnn) @@ -45,6 +116,8 @@ def simple_img_conv_pool(input, pool_size=pool_size, pool_type=pool_type, pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, use_cudnn=use_cudnn, use_mkldnn=use_mkldnn) return pool_out @@ -60,11 +133,65 @@ def img_conv_group(input, conv_with_batchnorm=False, conv_batchnorm_drop_rate=0.0, pool_stride=1, - pool_type=None, + pool_type="max", use_cudnn=True, use_mkldnn=False): """ - Image Convolution Group, Used for vgg net. + The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, + and Pool2d. According to the input arguments, img_conv_group will do serials of + computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last + result to Pool2d. + + Args: + input (Variable): The input image with [N, C, H, W] format. + conv_num_filter(list|tuple): Indicates the numbers of filter of this group. + pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size + is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W). + Otherwise, the pool_size_H = pool_size_W = pool_size. + conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + a list or tuple, its length must be equal to the length of conv_num_filter. + Otherwise the conv_padding of all Conv2d Layers are the same. Default 1. + conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or + tuple, its length must be equal to the length of conv_num_filter. + Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3. + conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm. + Default: None. + param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None + conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer. + If conv_with_batchnorm is a list, its length must be equal to the length of + conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the + Conv2d Layer follows a BatchNorm. Default False. + conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer + after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be + equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout + Layers is conv_batchnorm_drop_rate. Default 0.0. + pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + is a list or tuple, it must contain two integers, (pooling_stride_H, + pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. + Default 1. + pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for + average-pooling. Default :math:`max`. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled + with mkldnn library. 
Default: False + + Returns: + Variable: The final result after serial computation using Convolution2d, + BatchNorm, DropOut, and Pool2d. + + Examples: + .. code-block:: python + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + conv_pool = fluid.nets.img_conv_group(input=img, + num_channels=3, + conv_padding=1, + conv_num_filter=[3, 3], + conv_filter_size=3, + conv_act="relu", + pool_size=2, + pool_stride=2) """ tmp = input assert isinstance(conv_num_filter, list) or \ @@ -74,6 +201,7 @@ def img_conv_group(input, if not hasattr(obj, '__len__'): return [obj] * len(conv_num_filter) else: + assert len(obj) == len(conv_num_filter) return obj conv_padding = __extend_list__(conv_padding) @@ -119,6 +247,39 @@ def sequence_conv_pool(input, param_attr=None, act="sigmoid", pool_type="max"): + """ + The sequence_conv_pool is composed of Sequence Convolution and Pooling. + + Args: + input (Variable): The input of sequence_conv, which supports variable-time + length input sequence. The underlying of input is a matrix with shape + (T, N), where T is the total time steps in this mini-batch and N is + the input_hidden_size. + num_filters(int): The number of filters. + filter_size (int): The filter size. + param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None. + act (str): Activation type for Sequence_conv Layer. Default: "sigmoid". + pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for + average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. + Default :math:`max`. + + Returns: + Variable: The final result after Sequence Convolution and Pooling. + + Examples: + .. code-block:: python + + input_dim = len(word_dict) + emb_dim = 128 + hid_dim = 512 + data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1) + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True) + seq_conv = fluid.nets.sequence_conv_pool(input=emb, + num_filters=hid_dim, + filter_size=3, + act="tanh", + pool_type="sqrt") + """ conv_out = layers.sequence_conv( input=input, num_filters=num_filters, @@ -132,9 +293,9 @@ def sequence_conv_pool(input, def glu(input, dim=-1): """ - The gated linear unit composed by split, sigmoid activation and elementwise - multiplication. Specifically, Split the input into two equal sized parts - :math:`a` and :math:`b` along the given dimension and then compute as + The Gated Linear Units (GLU) is composed of split, sigmoid activation and element-wise + multiplication. Specifically, split the input into two equal-sized parts, + :math:`a` and :math:`b`, along the given dimension and then compute as following: .. math:: @@ -147,16 +308,16 @@ def glu(input, dim=-1): Args: input (Variable): The input variable which is a Tensor or LoDTensor. dim (int): The dimension along which to split. If :math:`dim < 0`, the - dimension to split along is :math:`rank(input) + dim`. + dimension to split along is :math:`rank(input) + dim`. Default -1. Returns: - Variable: The Tensor variable with half the size of input. + Variable: Variable with half the size of input. Examples: .. code-block:: python - # x is a Tensor variable with shape [3, 6, 9] - fluid.nets.glu(input=x, dim=1) # shape of output: [3, 3, 9] + data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32") + output = fluid.nets.glu(input=data, dim=1) # shape of output: [3, 3, 9] """ a, b = layers.split(input, num_or_sections=2, dim=dim) @@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries, `_.
Args: - queries (Variable): The input variable which should be a 3-D Tensor. keys (Variable): The input variable which should be a 3-D Tensor. values (Variable): The input variable which should be a 3-D Tensor. num_heads (int): Head number to compute the scaled dot product - attention. Default value is 1. + attention. Default: 1. dropout_rate (float): The dropout rate to drop the attention weight. - Default value is 0. + Default: 0.0. Returns: - - Variable: A 3-D Tensor computed by multi-head scaled dot product \ - attention. + Variable: A 3-D Tensor computed by multi-head scaled dot product\ + attention. Raises: - ValueError: If input queries, keys, values are not 3-D Tensors. - NOTE: + NOTES: 1. When num_heads > 1, three linear projections are learned respectively - to map input queries, keys and values into queries', keys' and values'. - queries', keys' and values' have the same shapes with queries, keys - and values. - - 1. When num_heads == 1, scaled_dot_product_attention has no learnable - parameters. + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. Examples: .. code-block:: python - # Suppose q, k, v are Tensors with the following shape: - # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] - - contexts = fluid.nets.scaled_dot_product_attention(q, k, v) + queries = fluid.layers.data(name="queries", + shape=[3, 5, 9], + dtype="float32", + append_batch_size=False) + queries.stop_gradient = False + keys = fluid.layers.data(name="keys", + shape=[3, 6, 9], + dtype="float32", + append_batch_size=False) + keys.stop_gradient = False + values = fluid.layers.data(name="values", + shape=[3, 6, 10], + dtype="float32", + append_batch_size=False) + values.stop_gradient = False + contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values) contexts.shape # [3, 5, 10] """ if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 115362c6bf33018342699a442c688e7356f3c206..75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -13,7 +13,7 @@ # limitations under the License. import re from collections import defaultdict -from paddle.fluid.framework import Program +from paddle.fluid.framework import Program, Variable import framework import layers from backward import append_backward @@ -26,10 +26,10 @@ from clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager __all__ = [ - 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', + 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer', - 'Adadelta', 'ModelAverage', 'Optimizer' + 'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer' ] @@ -41,7 +41,10 @@ class Optimizer(object): but need to use one of it's implementation. 
""" - def __init__(self, learning_rate, regularization=None): + def __init__(self, + learning_rate, + regularization=None, + LARS_weight_decay=0.0): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") @@ -61,6 +64,7 @@ class Optimizer(object): # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) self.helper = None + self._LARS_weight_decay = LARS_weight_decay def _create_global_learning_rate(self): lr = self.global_learning_rate() @@ -100,10 +104,15 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - if param_lr == 1.0: - return self.global_learning_rate() + if type(param_lr) == Variable: + # param learning rate has been updated (LARS) + print("returns updated param lr ", param_lr) + return param_lr else: - return self.global_learning_rate() * param_lr + if param_lr == 1.0: + return self.global_learning_rate() + else: + return self.global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -183,15 +192,15 @@ class Optimizer(object): """Add optimization operators to update gradients to variables. Args: - loss: the target that this optimization is for. - parameters_and_grads: a list of (variable, gradient) pair to update. + loss(Variable): the target that this optimization is for. + parameters_and_grads(list(tuple(Variable, Variable))): + a list of (variable, gradient) pair to update. Returns: return_op_list: a list of operators that will complete one step of optimization. This will include parameter update ops, global step update ops and any other custom ops required by subclasses to manage their internal state. - :param startup_program: """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that @@ -210,6 +219,10 @@ class Optimizer(object): self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) self._create_global_learning_rate() + if self._LARS_weight_decay > 0.0: + layers.append_LARS(parameters_and_grads, + self.global_learning_rate(), + self._LARS_weight_decay) optimize_ops = [] for param_and_grad in parameters_and_grads: @@ -255,7 +268,22 @@ class Optimizer(object): class SGDOptimizer(Optimizer): - """ Simple SGD optimizer without any state. + """ + Optimizer of the stochastic gradient descent algorithm. + + .. math:: + + param\_out = param - learning\_rate * grad + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + + Examples: + .. code-block:: python + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2) + sgd_optimizer.minimize(cost) """ def __init__(self, learning_rate, **kwargs): @@ -281,7 +309,37 @@ class SGDOptimizer(Optimizer): class MomentumOptimizer(Optimizer): - """Simple Momentum optimizer with velocity state + """ + + Simple Momentum optimizer with velocity state + + This optimizer has a flag for Nestrov Momentum. + + The update equations are as follows: + + .. 
math:: + + & velocity = mu * velocity + gradient + + & if (use\_nesterov): + + &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate + + & else: + + &\quad param = param - learning\_rate * velocity + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + momentum (float): momentum factor + use_nesterov (bool): enables Nesterov momentum + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(cost) """ _velocity_acc_str = "velocity" @@ -325,7 +383,32 @@ class MomentumOptimizer(Optimizer): class AdagradOptimizer(Optimizer): - """Simple Adagrad optimizer with moment state + """ + **Adaptive Gradient Algorithm (Adagrad)** + + The update is done as follows: + + .. math:: + + moment\_out &= moment + grad * grad + + param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + does not have the epsilon attribute. It is added here in our implementation + as also proposed here: http://cs231n.github.io/neural-networks-3/#ada + for numerical stability to avoid the division by zero error. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) """ _moment_acc_str = "moment" @@ -366,7 +449,40 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): - """Implements the Adam Optimizer + """ + This implements the Adam optimizer from Section 2 of the Adam + paper : https://arxiv.org/abs/1412.6980. + Adam is a first-order gradient-based optimization method based on + adaptive estimates of lower-order moments. + + Adam updates: + + .. math:: + + t & = t + 1 + + moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + + moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + + learning\_rate & = learning\_rate * \\ + \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + beta1 (float): The exponential decay rate for the 1st moment estimates. + beta2 (float): The exponential decay rate for the 2nd moment estimates. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adam(learning_rate=0.2) + optimizer.minimize(cost) + """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" @@ -471,7 +587,42 @@ class AdamOptimizer(Optimizer): class AdamaxOptimizer(Optimizer): - """Implements the Adamax Optimizer + """ + We implement the Adamax optimizer from Section 7 of the Adam + paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the + Adam algorithm based on the infinity norm. + + Adamax updates: + + .. 
math:: + + t & = t + 1 + + moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + + inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + + learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + + + The original paper does not have an epsilon attribute. + However, it is added here for numerical stability to prevent the + division by 0 error. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + beta1 (float): The exponential decay rate for the 1st moment estimates. + beta2 (float): The exponential decay rate for the 2nd moment estimates. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adamax(learning_rate=0.2) + optimizer.minimize(cost) """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" @@ -555,7 +706,34 @@ class AdamaxOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer): - """Simple Decayed Adagrad optimizer with moment state + """ + **Decayed Adagrad Optimizer** + + The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + + The update is done as follows: + + .. math:: + + moment\_out & = decay * moment + (1 - decay) * grad * grad + + param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + + The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) + does not have an epsilon attribute. It is added here for numerical + stability to avoid the division by zero error. + + Args: + learning_rate (float|Variable): the learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. + decay (float): decay rate. + epsilon (float): a small float value for numerical stability. + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) + optimizer.minimize(cost) """ _moment_acc_str = "moment" @@ -601,6 +779,7 @@ class DecayedAdagradOptimizer(Optimizer): class AdadeltaOptimizer(Optimizer): """ **Adadelta Optimizer** + Simple Adadelta optimizer with average squared grad state and average squared update state. The details of adadelta please refer to this @@ -615,7 +794,7 @@ class AdadeltaOptimizer(Optimizer): E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2 Args: - learning_rate(float): global leraning rate + learning_rate(float): global learning rate rho(float): rho in equation epsilon(float): epsilon in equation @@ -690,37 +869,37 @@ class RMSPropOptimizer(Optimizer): .. math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) The first equation calculates moving average of the squared gradient for - each weight. Then dividing the gradient by :math: `sqrt{v(w,t)}`. + each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. In some cases, adding a momentum term :math: `\\beta` is beneficial. In our implementation, Nesterov momentum is used: .. 
math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\ + r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w) w & = w - v(w, t) - where, :math: `\\rho` is a hyperparameter and typical values are 0.9, 0.95 + where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95 and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a smoothing term to avoid division by zero, usually set somewhere in range from 1e-4 to 1e-8. Args: - learning_rate(float): global leraning rate. + learning_rate(float): global learning rate. rho(float): rho is :math: `\\rho` in equation, set 0.95 by default. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, set 1e-6 by default. - momentum(float): :math: `\\beta` in equation is the momentum term, + momentum(float): :math:`\\beta` in equation is the momentum term, set 0.0 by default. Raises: @@ -797,6 +976,113 @@ class RMSPropOptimizer(Optimizer): return rmsprop_op +class FtrlOptimizer(Optimizer): + """ + FTRL (Follow The Regularized Leader) Optimizer. + + The paper that proposed Follow The Regularized Leader (FTRL): + (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) + + .. math:: + + &new\_accum = squared\_accum + grad^2 + + &if (lr\_power == -0.5): + + &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param} + + &else: + + &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param} + + + &x = l1 * sign(linear\_accum) - linear\_accum + + &if (lr\_power == -0.5): + + &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2) + + &\quad pre\_shrink = \\frac{x}{y} + + &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) + + &else: + + &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) + + &\quad pre\_shrink = \\frac{x}{y} + + &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) + + &squared\_accum += grad^2 + + Args: + learning_rate (float|Variable): global learning rate. + l1 (float): L1 regularization strength, 0.0 by default. + l2 (float): L2 regularization strength, 0.0 by default. + lr_power (float): learning rate power, -0.5 by default. + + Raises: + ValueError: If learning_rate is None. + + Examples: + ..
code-block:: python + + optimizer = fluid.optimizer.Ftrl(0.0001) + _, params_grads = optimizer.minimize(cost) + """ + + _squared_acc_str = "squared" + _linear_acc_str = "linear" + + def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs): + super(FtrlOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) + if learning_rate is None: + raise ValueError("learning_rate is not set.") + + self.type = "ftrl" + self._l1 = l1 + self._l2 = l2 + self._lr_power = lr_power + + def _create_accumulators(self, block, parameters): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + for p in parameters: + self._add_accumulator(self._squared_acc_str, p) + self._add_accumulator(self._linear_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + if not isinstance(block, framework.Block): + raise TypeError("block is not instance of framework.Block.") + + squared_acc = self._get_accumulator(self._squared_acc_str, + param_and_grad[0]) + linear_acc = self._get_accumulator(self._linear_acc_str, + param_and_grad[0]) + ftrl_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "SquaredAccumulator": squared_acc, + "LinearAccumulator": linear_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "SquaredAccumOut": squared_acc, + "LinearAccumOut": linear_acc + }, + attrs={"l1": self._l1, + "l2": self._l2, + "lr_power": self._lr_power}) + + return ftrl_op + + # We short the class name, since users will use the optimizer with the package # name. The sample code: # @@ -813,6 +1099,7 @@ Adamax = AdamaxOptimizer DecayedAdagrad = DecayedAdagradOptimizer Adadelta = AdadeltaOptimizer RMSProp = RMSPropOptimizer +Ftrl = FtrlOptimizer class ModelAverage(Optimizer): @@ -826,15 +1113,16 @@ class ModelAverage(Optimizer): Args: average_window_rate: The rate of average window. - params_grads: A list of parameter-grad variable pairs. min_average_window: The minimum size of average window. max_average_window: The maximum size of average window. Examples: - ... + + ..
code-block:: python + + optimizer = fluid.optimizer.Momentum() - _, params_grads = optimizer.minimize(cost) - model_average = fluid.optimizer.ModelAverage(params_grads, 0.15, + optimizer.minimize(cost) + model_average = fluid.optimizer.ModelAverage(0.15, min_average_window=10000, max_average_window=20000) for pass_id in range(args.pass_num): @@ -848,7 +1136,6 @@ class ModelAverage(Optimizer): def __init__(self, average_window_rate, - params_grads=None, min_average_window=10000, max_average_window=10000, **kwargs): @@ -857,21 +1144,16 @@ class ModelAverage(Optimizer): self.min_average_window = min_average_window self.max_average_window = max_average_window - self.params_grads = [] if params_grads is None else params_grads - params = {} - for param, grad in self.params_grads: - if param.do_model_average != False: - params[param.name] = (param, grad) + self.params_grads = [] for param in framework.default_main_program().global_block( ).all_parameters(): - if param.name not in params and param.do_model_average != False: + if param.do_model_average != False: grad = param.block.create_var( name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, persistable=False, stop_gradient=True) - params[param.name] = (param, grad) - self.params_grads = params.values() + self.params_grads.append((param, grad)) for param, grad in self.params_grads: self._append_average_accumulate_op(param) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 0fdc9a035292b3390cece6c5821a60b1b281e54d..6baf648198585022f992709c519038688af293e1 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -27,6 +27,40 @@ BuildStrategy = core.ParallelExecutor.BuildStrategy class ParallelExecutor(object): + """ + ParallelExecutor can run program in parallel. + + Args: + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name, which must be set during training. Default None. + main_program (Program): The program that needs to run; if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provided, it will share variables + from the specified ParallelExecutor. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple ranks of nodes; each node should have the same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int): Must be used together with num_trainers. trainer_id is the + "rank" of the current node and starts from 0. Default 0. + + Returns: + ParallelExecutor: The initialized ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) + """ + def __init__(self, use_cuda, loss_name=None, @@ -37,42 +71,6 @@ class ParallelExecutor(object): num_trainers=1, trainer_id=0, **kwargs): - """ - ParallelExecutor can run program in parallel. - - Args: - use_cuda(bool): Whether to use CUDA or not. - loss_name(str, default None): The loss name must set in training. - main_program(Program, default None): The program that need to run, - if not provided, then default_main_program will be used.
- share_vars_from(ParallelExecutor, default None): If provied, - it will share variables from the specified ParallelExecutor. - num_trainers(int, default 1): If greater than 1, NCCL will be - initialized with multpile rank of nodes, each node should have - same number of GPUs. Distributed training will be enabled then. - trainer_id(int, default 0): Must use together with num_trainers. - trainer_id is the "rank" of current node starts from 0. - - Returns: - A ParallelExecutor object. - - Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor - object. - - Examples: - .. code-block:: python - - train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor( - use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) - """ if len(kwargs) != 0: err_msg = "" for key in kwargs: @@ -131,10 +129,16 @@ class ParallelExecutor(object): main = main_program main = main if main else framework.default_main_program() scope = executor.global_scope() + # FIXME(Yancey1989): it's a temporary approach to determinate the distribute + # train program, call self.bcast_param() at the end of each mini-batch. + self.is_dist = True if "recv" in [ + op.type for op in main.global_block().ops + ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): raise TypeError("share_vars_from must be ParallelExecutor.") + local_scopes = share_vars_from.executor.local_scopes( ) if share_vars_from else [] @@ -156,7 +160,7 @@ class ParallelExecutor(object): build_strategy, num_trainers, trainer_id) self.scope = scope - def run(self, fetch_list, feed=None, feed_dict=None): + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ Run a parallel executor with fetch_list. @@ -166,12 +170,14 @@ class ParallelExecutor(object): element in the list will be copied to each device directly. For example, if the feed is a dict: + >>> exe = ParallelExecutor() >>> # the image will be splitted into devices. If there is two devices >>> # each device will process an image with shape (24, 1, 28, 28) >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) For example, if the feed is a list: + >>> exe = ParallelExecutor() >>> # each device will process each element in the list. >>> # the 1st device will process an image with shape (48, 1, 28, 28) @@ -182,18 +188,42 @@ class ParallelExecutor(object): >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, >>> ]) - Args: fetch_list(list): The fetched variable names feed(list|dict|None): The feed variables. If the feed is a dict, tensors in that dict will be splitted into each devices. If the feed is a list, each element of the list will be copied - to each device. + to each device. Default None. feed_dict: Alias for feed parameter, for backward compatibility. - This parameter is deprecated. + This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. + + Returns: + List: The fetched result list. + + Raises: + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. 
Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. - Returns: fetched result list. + Examples: + .. code-block:: python + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) """ if feed is None and feed_dict is not None: feed = feed_dict @@ -238,9 +268,20 @@ class ParallelExecutor(object): fetch_var_name = '@FETCHED_VAR_NAME@' self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if self.is_dist: + self.bcast_params() + + if return_numpy: + return executor.as_numpy(arr) + return [arr[i] for i in range(len(arr))] def bcast_params(self): + """ + Broadcast the parameters to other devices. It is used during + distributed training. + """ self.executor.bcast_params(set(self.persistable_vars)) @property diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 1c6970441bccdc1c1221503256c30c83502bd123..0a42b9fca8dba7a11b414990be6c04c93158864f 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -22,6 +22,35 @@ __all__ = [ class ParamAttr(object): + """ + Parameter attributes object. To fine-tuning network training process, user + can set parameter's attributes to control training details. Such as learning rate, + regularization, trainable, do_model_average and the method to initialize param. + + + Args: + name(str): The parameter's name. Default None. + initializer(Initializer): The method to initial this parameter. Default None. + learning_rate(float): The parameter's learning rate. The learning rate when + optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`. + Default 1.0. + regularizer(WeightDecayRegularizer): Regularization factor. Default None. + trainable(bool): Whether this parameter is trainable. Default True. + gradient_clip(BaseGradientClipAttr): The method to clip this parameter's + gradient. Default None. + do_model_average(bool): Whether this parameter should do model average. + Default False. + + Examples: + .. code-block:: python + + w_param_attrs = fluid.ParamAttr(name="fc_weight", + learning_rate=0.5, + regularizer=fluid.L2Decay(1.0), + trainable=True) + y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) + """ + def __init__(self, name=None, initializer=None, @@ -29,7 +58,7 @@ class ParamAttr(object): regularizer=None, trainable=True, gradient_clip=None, - do_model_average=None): + do_model_average=False): self.name = name self.initializer = initializer self.learning_rate = learning_rate @@ -39,6 +68,16 @@ class ParamAttr(object): self.model_average = do_model_average def set_default_initializer(self, initializer): + """ + Set the default initializer, the initializer should be Constant, + Uniform, Normal, Xavier, MSRA. + + Args: + initializer(Initializer): the initializer to set. + + Returns: + None + """ if initializer is None: if self.initializer is None: raise ValueError("ParamAttr.initializer is not set") @@ -50,13 +89,45 @@ class ParamAttr(object): self.initializer = initializer def set_default_param_initializer(self): + """ + Set the default initializer for the parameter with Xavier. + + Args: + None. + + Returns: + None. 
+ """ self.set_default_initializer(Xavier()) def set_default_bias_initializer(self): + """ + Set the default initializer for the bias with Constant(0.0). + + Args: + None. + + Returns: + None. + """ self.set_default_initializer(Constant(0.0)) @staticmethod def to_attr(arg): + """ + Create ParamAttr[s]. + + Args: + arg: Arguments to initialize ParamAttr[s]. arg's type can be + str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr, + bool, ParamAttr, or a list of above type. + + Returns: + ParamAttr[s]: ParamAttr[s] initialized with arg. + + Raises: + arg can not initialize a ParamAttr. + """ if arg is None: return ParamAttr() elif isinstance(arg, list) or isinstance(arg, tuple): @@ -75,6 +146,15 @@ class ParamAttr(object): raise TypeError("{0} cast to ParamAttr".format(type(arg))) def to_kwargs(self, with_initializer=False): + """ + Returns the attributes of this parameter. + + Args: + with_initializer(bool): Whether to add initializer attr. + + Returns: + Parameter attributes(map): The attributes of this parameter. + """ kwargs = { 'name': self.name, 'optimize_attr': { @@ -92,9 +172,27 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): """ - Used for weight normalization. Any field in ParamAttr can also be set here. - Besides, an extra field dim can be set to indicate the dimension except - which to normalize. + Used for weight Norm. Weight Norm is a reparameterization of the weight vectors + in a neural network that decouples the length of those weight vectors from + their direction. Weight Norm has been implemented as discussed in this + paper: `Weight Normalization: A Simple Reparameterization to Accelerate + Training of Deep Neural Networks + `_. + + Args: + dim(list): The parameter's name. Default None. + kwargs: Any field in ParamAttr. Default None. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, + size=1000, + param_attr=WeightNormParamAttr( + dim=None, + name='weight_norm_param')) + """ # List to record the parameters reparameterized by weight normalization. # If these parameters are treated as Variable rather than Parameter, diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e2bd1d4c9a1ea5ddc0dfd19c769dcb40bfd6d04c..6a321ae024dcb50452bc4d96d7e7e70f590a42c6 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -42,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None): counters/options for profiling by `config` argument. The default config is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d', 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. + Then users can use NVIDIA Visual Profiler + (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this + this output file to visualize results. Args: output_file (string) : The output file name, the result will be @@ -50,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None): Comma separated values format. It should be 'kvp' or 'csv'. config (list of string) : The profiler options and counters can refer to "Compute Command Line Profiler User Guide". + + Raises: + ValueError: If `output_mode` is not in ['kvp', 'csv']. + + Examples: + + .. 
code-block:: python + + import paddle.fluid as fluid + import paddle.fluid.profiler as profiler + import numpy as np + + epoc = 8 + dshape = [4, 3, 28, 28] + data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + # then use NVIDIA Visual Profiler (nvvp) to load this output file + # to visualize results. """ if output_mode is None: output_mode = 'csv' @@ -69,19 +99,52 @@ def reset_profiler(): - """The profiler clear interface. - reset_profiler will clear the previous time record. + """ + Clear the previous time record. This interface does not work for + `fluid.profiler.cuda_profiler`, it only works for + `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`, + and `fluid.profiler.profiler`. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + with profiler.profiler('CPU', 'total', '/tmp/profile'): + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # ... """ core.reset_profiler() def start_profiler(state): - """Enable the profiler. + """ + Enable the profiler. Users can use `fluid.profiler.start_profiler` and + `fluid.profiler.stop_profiler` to profile a code range directly, + instead of using the `fluid.profiler.profiler` interface. Args: state (string) : The profiling state, which should be 'CPU', 'GPU' or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling GPU as well. 'All' also generates timeline. + + Raises: + ValueError: If `state` is not in ['CPU', 'GPU', 'All']. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + + profiler.start_profiler('GPU') + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # run each iteration here + profiler.stop_profiler('total', '/tmp/profile') """ if core.is_profiler_enabled(): return @@ -97,7 +160,10 @@ def start_profiler(state): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): - """Stop the profiler. + """ + Stop the profiler. Users can use `fluid.profiler.start_profiler` and + `fluid.profiler.stop_profiler` to profile a code range directly, + instead of using the `fluid.profiler.profiler` interface. Args: sorted_key (string) : If None, the profiling results will be printed @@ -111,6 +177,23 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): The `ave` means sorting by the average execution time. profile_path (string) : If state == 'All', it will write a profile proto output file. + + Raises: + ValueError: If `sorted_key` is not in + ['calls', 'total', 'max', 'min', 'ave']. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + + profiler.start_profiler('GPU') + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + # run each iteration here + profiler.stop_profiler('total', '/tmp/profile') """ if not core.is_profiler_enabled(): return @@ -137,7 +220,12 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): Different from cuda_profiler, this profiler can be used to profile both CPU and GPU program.
By defalut, it records the CPU and GPU operator kernels, if you want to profile other program, you can refer the profiling tutorial - to add more records. + to add more records in C++ code. + + If the state == 'All', a profile proto file will be written to + `profile_path`. This file records timeline information during the execution. + Then users can visualize this file to see the timeline, please refer + https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md Args: state (string) : The profiling state, which should be 'CPU' or 'GPU', @@ -156,6 +244,25 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): The `ave` means sorting by the average execution time. profile_path (string) : If state == 'All', it will write a profile proto output file. + + Raises: + ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is + not in ['calls', 'total', 'max', 'min', 'ave']. + + Examples: + + .. code-block:: python + + import paddle.fluid.profiler as profiler + + with profiler.profiler('All', 'total', '/tmp/profile') as prof: + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[], + use_program_cache=True) + # ... """ start_profiler(state) yield diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 8d48e9abef0fb9861284c6302b30efb0e3994989..bd57772713057f12b876942de58ee43527e94834 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -36,6 +36,45 @@ def convert_reader_to_recordio_file( compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000, feed_order=None): + """ + Convert a Python Reader to a recordio file. + + Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for + details. + + Examples: + + >>> import paddle.fluid as fluid + >>> import paddle.dataset.mnist as mnist + >>> import paddle + >>> + >>> tmp_program = fluid.Program() + >>> with fluid.program_guard(tmp_program): + >>> img = fluid.layers.data(name='img', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace()) + >>> # mnist.recordio will be generated in current directory + >>> fluid.recordio_writer.convert_reader_to_recordio_file( + >>> filename="mnist.recordio", + >>> reader_creator=paddle.batch(mnist.train(), batch_size=32), + >>> feeder=feeder) + + Args: + filename(str): The recordio filename. + reader_creator(callable): The Python Reader Creator. See + :ref:`api_guide_python_reader`. + feeder(DataFeeder): The DataFeeder instance. Used to convert + :code:`reader_creator` to :code: `lod_tensor` + compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or + fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy` + by default. + max_num_records(int): Maximum number of records in one chuck. Each record + is each return value from reader function + feed_order(list): The order of variable names that the reader returns + + Returns: + int: the number of record that saved. + """ if feed_order is None: feed_order = feeder.feed_names counter = 0 @@ -58,6 +97,17 @@ def convert_reader_to_recordio_files( compressor=core.RecordIOWriter.Compressor.Snappy, max_num_records=1000, feed_order=None): + """ + convert a python reader to many recordio files. 
+ + This API is basically same as :code:`convert_reader_to_recordio_file`, + instead of it will create many recordio files. Each file contains at + most :code:`batch_per_file` records. + + Please reference + :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more + details. + """ if feed_order is None: feed_order = feeder.feed_names f_name, f_ext = os.path.splitext(filename) diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index c4d6829599616cb3ea7791a189e7070974de6ae3..dac474d5ee76590a75311d6bf2c4cb2fe85b6c40 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -16,8 +16,8 @@ import framework from . import core __all__ = [ - 'append_regularization_ops', 'WeightDecayRegularizer', 'L1Decay', 'L2Decay', - 'L1DecayRegularizer', 'L2DecayRegularizer' + 'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer', + 'L2DecayRegularizer' ] @@ -36,7 +36,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None): set. It will be applied with regularizer. Returns: - list of (parameters, gradients) pair with the regularized gradient + list[(Variable, Variable)]: list of (parameters, gradients) \ + pair with the regularized gradient Raises: Exception: Unknown regularization type @@ -100,6 +101,24 @@ class WeightDecayRegularizer(object): class L2DecayRegularizer(WeightDecayRegularizer): """Implements the L2 Weight Decay Regularization + + Small values of L2 can help prevent over fitting the training data. + + .. math:: + + L2WeightDecay = reg\_coeff * parameter + + Args: + regularization_coeff(float): regularization coeff + + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)) + optimizer.minimize(avg_cost) """ def __init__(self, regularization_coeff=0.0): @@ -154,6 +173,27 @@ class L2DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer): """Implements the L1 Weight Decay Regularization + + L1 regularization encourages sparsity. + + .. math:: + + L1WeightDecay = reg\_coeff * sign(parameter) + + Args: + regularization_coeff(float): regularization coeff + + Examples: + .. code-block:: python + + program = fluid.framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + regularizer=fluid.regularizer.L1DecayRegularizer(0.5)) """ def __init__(self, regularization_coeff=0.0): diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index 0ccb3a39e02ea0c24bdfe01c5eba73b92da88a04..67aa21e8c5699f1cb568dad23cd13f4cb51a6ec9 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -206,35 +206,35 @@ def infer(use_cuda, inference_program, params_dirname): inferencer = fluid.Inferencer( inference_program, param_path=params_dirname, place=place) - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of + # Setup input by creating LoDTensor to represent sequence of words. 
+ # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1) pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1) mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1) + recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1) results = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index c4b37df3a09f93fe965ae28ce783f06f5018020d..8becd2404b0201c44b587a28e88995958082cd28 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -127,9 +127,19 @@ def decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -138,10 
+148,14 @@ def decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores @@ -215,11 +229,13 @@ def decode_main(use_cuda, is_sparse): [1. for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, + place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( @@ -243,7 +259,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print result_ids.recursive_sequence_lengths() break diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index 090c11ce1e79201f0d65d3540527791ab2191d4a..c860f1641708d947fd2a8008d3d3ccd0a231f6c2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -209,13 +209,15 @@ def infer(use_cuda, inference_program, params_dirname): inference_program, param_path=params_dirname, place=place) # Use the first data from paddle.dataset.movielens.test() as input. - # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor, - # where `data` is a list of sequences of index numbers, `lod` is - # the level of detail (lod) info associated with `data`. + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail + # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, lod = [[3, 2]] contains one level of detail info, - # indicating that `data` consists of two sequences of length 3 and 2. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. 
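As a quick illustration of the length-based LoD convention described in the comment above, a minimal self-contained sketch follows; the toy data and the CPU place here are illustrative assumptions, not values used by this test.

.. code-block:: python

    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    # Two sequences of indexes, of length 3 and 2, as in the comment above.
    data = [[10, 2, 3], [2, 3]]
    recursive_seq_lens = [[3, 2]]  # one level of detail: lengths 3 and 2
    tensor = fluid.create_lod_tensor(data, recursive_seq_lens, place)
    # Expected to print [[3, 2]], mirroring the lengths passed in.
    print(tensor.recursive_sequence_lengths())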
user_id = fluid.create_lod_tensor([[1]], [[1]], place) gender_id = fluid.create_lod_tensor([[1]], [[1]], place) age_id = fluid.create_lod_tensor([[0]], [[1]], place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 9b61f7a00ce5e2a08c2105fb7f50e6868ef25df3..1668ae83d3581125b799508c8c3115a038e93d5a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -128,17 +128,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index aa7c567b4d66ba07c26d54436fb305011cfeccf2..8da89d82cb8e00853eebfd794602a0e1e1020e7c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -143,17 +143,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 8c74be0f08855c20f5aa3ecd75622a51e94a0304..74faa2e8aa734cd644dfcc38127fd12df1fb1092 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -138,17 +138,17 @@ def infer(use_cuda, inference_program, params_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=len(word_dict) - 1) + recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1) results = inferencer.infer({'words': tensor_words}) print("infer results: ", results) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index ba44f72d9b03c3a44560a8a30cba2253256314ef..02e65cf56c4d1bd262831320befd2edc735c0d1c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -124,21 +124,22 @@ def infer(use_cuda, inference_program, params_dirname=None): # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - lod = [[1]] + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of + # one word on this level. + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[1]] base_shape = [1] # The range of random integers is [low, high] first_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) second_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) third_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) fourth_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) result = inferencer.infer( { diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index c6687e8ad7fcc45c82d6dcb2256e9055a81cc61c..1df7b99aad6094a8b8ddfe783b9de35cef61c524 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -194,16 +194,16 @@ def train(word_dict, if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": @@ -238,17 +238,21 @@ def infer(word_dict, use_cuda, save_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] tensor_words = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. 
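For reference, a minimal sketch (not part of this patch) of the renamed length-based API used throughout these infer() functions: three sentences of lengths 3, 4 and 2 are described by recursive_seq_lens = [[3, 4, 2]], where the old offset-based lod for the same structure would have been [[0, 3, 7, 9]]. The vocabulary size below is a placeholder.

import paddle.fluid as fluid

place = fluid.CPUPlace()
# Length-based level of detail: three sequences of 3, 4 and 2 words (9 in total).
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]  # each word is a single index into the embedding table
word_dict_len = 5147  # placeholder vocabulary size
tensor_words = fluid.create_random_int_lodtensor(
    recursive_seq_lens, base_shape, place, low=0, high=word_dict_len - 1)
print(tensor_words.recursive_sequence_lengths())  # [[3, 4, 2]]
print(tensor_words.shape())  # [9, 1], one row per word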
@@ -257,7 +261,7 @@ def infer(word_dict, use_cuda, save_dirname=None): feed={feed_target_names[0]: tensor_words}, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) print("Inference results: ", np_data) diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index b1a6b524d33cae97c8982ffb8f780b1b07761a09..71bf5f8b3a9b17f24ce35220a9348bb871852623 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -69,16 +69,16 @@ def train(use_cuda, save_dirname, is_local): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": @@ -110,14 +110,23 @@ def infer(use_cuda, save_dirname=None): # The input's dimension should be 2-D and the second dim is 13 # The input data should be >= 0 batch_size = 10 - tensor_x = numpy.random.uniform(0, 10, - [batch_size, 13]).astype("float32") + + test_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=batch_size) + + test_data = test_reader().next() + test_feat = numpy.array( + [data[0] for data in test_data]).astype("float32") + test_label = numpy.array( + [data[1] for data in test_data]).astype("float32") + assert feed_target_names[0] == 'x' results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_x}, + feed={feed_target_names[0]: numpy.array(test_feat)}, fetch_list=fetch_targets) print("infer shape: ", results[0].shape) print("infer results: ", results[0]) + print("ground truth: ", test_label) def main(use_cuda, is_local=True): diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 0f3a4c9242a81a3c1fb90268245715a8e59a207a..a2fb186b86c9706ac1aff0de49defbfb06e2eb0f 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -178,16 +178,16 @@ def train(net_type, use_cuda, save_dirname, is_local): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
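The same set of renamed environment variables (PADDLE_PSERVER_PORT, PADDLE_PSERVER_IPS, PADDLE_TRAINERS, PADDLE_TRAINER_ID, PADDLE_TRAINING_ROLE) recurs in every book test in this patch. A sketch of the endpoint string they produce, with placeholder addresses:

import os

# Placeholder values for illustration only.
os.environ.update({
    "PADDLE_PSERVER_PORT": "6174",
    "PADDLE_PSERVER_IPS": "192.168.0.2,192.168.0.3",
    "PADDLE_TRAINERS": "2",
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_TRAINING_ROLE": "TRAINER",
})

port = os.getenv("PADDLE_PSERVER_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
eplist = [':'.join([ip, port]) for ip in pserver_ips.split(",")]
print(",".join(eplist))  # 192.168.0.2:6174,192.168.0.3:6174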
- trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index 99d51ae0076178aca50e36c2c187257a8ba1cbf2..d489feae9c568ec1d9e3a230766d10d1ced0200a 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -209,16 +209,16 @@ def train(use_cuda, save_dirname=None, is_local=True): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": @@ -247,35 +247,67 @@ def infer(use_cuda, save_dirname=None): [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) - # Setup inputs by creating LoDTensors to represent sequences of words. - # Here each word is the basic element of these LoDTensors and the shape of + # Setup input by creating LoDTensor to represent sequence of words. + # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[3, 4, 2]], - # which has only one lod level. Then the created LoDTensors will have only + # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for three sentences of # length 3, 4 and 2, respectively. - # Note that lod info should be a list of lists. - lod = [[3, 4, 2]] + # Note that recursive_sequence_lengths should be a list of lists. 
+ recursive_seq_lens = [[3, 4, 2]] base_shape = [1] # The range of random integers is [low, high] word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) pred = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=pred_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=pred_dict_len - 1) ctx_n2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_n1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_0 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_p1 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) ctx_p2 = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=word_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=word_dict_len - 1) mark = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=mark_dict_len - 1) + recursive_seq_lens, + base_shape, + place, + low=0, + high=mark_dict_len - 1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -301,7 +333,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 23e5900f127a7a3253c551f8f7fbceba08382209..90c301a66105d8d872ee531556c5060b5d727515 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse): pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) - # expand the lod of pre_state to be the same with pre_score + # expand the recursive_sequence_lengths of pre_state to be the same with pre_score pre_state_expanded = pd.sequence_expand(pre_state, pre_score) pre_ids_emb = pd.embedding( @@ -126,9 +126,19 @@ def decoder_decode(context, is_sparse): current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=50) + topk_scores, topk_indices = pd.topk(current_score, k=beam_size) + # calculate accumulated scores after topk to reduce computation cost + accu_scores = pd.elementwise_add( + x=pd.log(topk_scores), y=pd.reshape( + pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + pre_ids, + pre_score, + topk_indices, + accu_scores, + beam_size, + end_id=10, + level=0) pd.increment(x=counter, value=1, in_place=True) @@ -137,10 +147,14 @@ def decoder_decode(context, is_sparse): pd.array_write(selected_ids, array=ids_array, i=counter) 
pd.array_write(selected_scores, array=scores_array, i=counter) - pd.less_than(x=counter, y=array_len, cond=cond) + # update the break condition: up to the max length or all candidates of + # source sentences have ended. + length_cond = pd.less_than(x=counter, y=array_len) + finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) + ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores @@ -200,16 +214,16 @@ def train_main(use_cuda, is_sparse, is_local=True): if is_local: train_loop(framework.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": @@ -238,11 +252,13 @@ def decode_main(use_cuda, is_sparse): [1. for _ in range(batch_size)], dtype='float32') init_ids_data = init_ids_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens] - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, + place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) train_data = paddle.batch( paddle.reader.shuffle( @@ -266,7 +282,7 @@ def decode_main(use_cuda, is_sparse): feed=feed_dict, fetch_list=[translation_ids, translation_scores], return_numpy=False) - print result_ids.lod() + print result_ids.recursive_sequence_lengths() break diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 578b1162fbd7e3a1b1c0cc934406818f2e07e019..5f5c8544bbdb87421f129b201a0ebaf4cb8602a1 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -94,7 +94,7 @@ def train(nn_type, test_program = fluid.default_main_program().clone(for_test=True) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3) optimizer.minimize(avg_loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -151,16 +151,16 @@ def train(nn_type, if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... 
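A rough numpy sketch (an illustration only, not framework code) of the accumulated-score step the machine-translation decoder above now performs before beam_search: each candidate's log-probability is added to the score already accumulated by its prefix, which is what elementwise_add with axis=0 does to the reshaped pre_score.

import numpy as np

pre_score = np.array([-0.4, -0.9], dtype='float32')        # one score per prefix
topk_scores = np.array([[0.6, 0.3],                        # top-k probabilities
                        [0.5, 0.2]], dtype='float32')      # for each prefix
accu_scores = np.log(topk_scores) + pre_score.reshape([-1, 1])
print(accu_scores)  # the values handed to beam_search as accu_scores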
+ port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 65d6552acc9b3d31a97a45290e4613a633fffa3c..6548766ef5d0162b50d4dd072e8e91dd95dc5d2b 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -220,16 +220,16 @@ def train(use_cuda, save_dirname, is_local=True): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... - trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": @@ -260,13 +260,15 @@ def infer(use_cuda, save_dirname=None): # Use the first data from paddle.dataset.movielens.test() as input assert feed_target_names[0] == "user_id" - # Use create_lod_tensor(data, lod, place) API to generate LoD Tensor - # where `data` is a list of sequences of index numbers, `lod` is - # the level of detail (lod) info associated with `data`. + # Use create_lod_tensor(data, recursive_sequence_lengths, place) API + # to generate LoD Tensor where `data` is a list of sequences of index + # numbers, `recursive_sequence_lengths` is the length-based level of detail + # (lod) info associated with `data`. # For example, data = [[10, 2, 3], [2, 3]] means that it contains # two sequences of indexes, of length 3 and 2, respectively. - # Correspondingly, lod = [[3, 2]] contains one level of detail info, - # indicating that `data` consists of two sequences of length 3 and 2. + # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one + # level of detail info, indicating that `data` consists of two sequences + # of length 3 and 2, respectively. 
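To make the recommender-system comment above concrete, a small sketch (not part of this patch) that builds the two-sequence example it describes:

import paddle.fluid as fluid

place = fluid.CPUPlace()
data = [[10, 2, 3], [2, 3]]     # two sequences of indexes, lengths 3 and 2
recursive_seq_lens = [[3, 2]]   # one level of length-based detail
tensor = fluid.create_lod_tensor(data, recursive_seq_lens, place)
print(tensor.recursive_sequence_lengths())  # [[3, 2]]
print(tensor.shape())                       # expected [5, 1]: five indexes in one column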
user_id = fluid.create_lod_tensor([[1]], [[1]], place) assert feed_target_names[1] == "gender_id" diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 7ada57def6bfedb113ea1a56f9677116b80488ea..467282624154086a874b0e73736ed5b1358915ff 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -216,19 +216,19 @@ def infer(use_cuda, save_dirname=None): # Here each word is the basic element of the LoDTensor and the shape of # each word (base_shape) should be [1] since it is simply an index to # look up for the corresponding word vector. - # Suppose the length_based level of detail (lod) info is set to [[4, 6]], - # which has only one lod level. Then the created LoDTensor will have only + # Suppose the recursive_sequence_lengths info is set to [[4, 6]], + # which has only one level of detail. Then the created LoDTensor will have only # one higher level structure (sequence of words, or sentence) than the basic # element (word). Hence the LoDTensor will hold data for two sentences of # length 4 and 6, respectively. - # Note that lod info should be a list of lists. - lod = [[4, 6]] + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[4, 6]] base_shape = [1] # The range of random integers is [low, high] word_data = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=1) + recursive_seq_lens, base_shape, place, low=0, high=1) trg_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=1) + recursive_seq_lens, base_shape, place, low=0, high=1) # Construct feed as a dictionary of {feed_target_name: feed_target_data} # and results will contain a list of data corresponding to fetch_targets. @@ -241,7 +241,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference shape: ", np_data.shape) print("Inference results: ", np_data) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 3118d88701e5f64ae50f7ee774ea8174aa7758eb..49bd72c7a53c0ae740bdbabe15b1d37340699d41 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -125,16 +125,16 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): if is_local: train_loop(fluid.default_main_program()) else: - port = os.getenv("PADDLE_INIT_PORT", "6174") - pserver_ips = os.getenv("PADDLE_INIT_PSERVERS") # ip,ip... + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS") # ip,ip... eplist = [] for ip in pserver_ips.split(","): eplist.append(':'.join([ip, port])) pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
- trainers = int(os.getenv("TRAINERS")) + trainers = int(os.getenv("PADDLE_TRAINERS")) current_endpoint = os.getenv("POD_IP") + ":" + port - trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) - training_role = os.getenv("TRAINING_ROLE", "TRAINER") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") t = fluid.DistributeTranspiler() t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": @@ -168,21 +168,22 @@ def infer(use_cuda, save_dirname=None): # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word # is simply an index to look up for the corresponding word vector and hence - # the shape of word (base_shape) should be [1]. The length-based level of - # detail (lod) info of each LoDtensor should be [[1]] meaning there is only - # one lod_level and there is only one sequence of one word on this level. - # Note that lod info should be a list of lists. - lod = [[1]] + # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, + # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] + # meaning there is only one level of detail and there is only one sequence of + # one word on this level. + # Note that recursive_sequence_lengths should be a list of lists. + recursive_seq_lens = [[1]] base_shape = [1] # The range of random integers is [low, high] first_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) second_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) third_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) fourth_word = fluid.create_random_int_lodtensor( - lod, base_shape, place, low=0, high=dict_size - 1) + recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1) assert feed_target_names[0] == 'firstw' assert feed_target_names[1] == 'secondw' @@ -200,7 +201,7 @@ def infer(use_cuda, save_dirname=None): }, fetch_list=fetch_targets, return_numpy=False) - print(results[0].lod()) + print(results[0].recursive_sequence_lengths()) np_data = np.array(results[0]) print("Inference Shape: ", np_data.shape) diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py index ce3ba3ebc50d7b015f379b5e80b179463a7b231a..30b7a634a2b978df85d6432854ef12285460be44 100644 --- a/python/paddle/fluid/tests/test_data_feeder.py +++ b/python/paddle/fluid/tests/test_data_feeder.py @@ -22,12 +22,11 @@ class TestDataFeeder(unittest.TestCase): label = fluid.layers.data(name='label', shape=[1], dtype='int64') feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])]) - print(result) self.assertEqual(result['image'].shape(), [2, 1, 28, 28]) self.assertEqual(result['label'].shape(), [2, 1]) - self.assertEqual(result['image'].lod(), []) - self.assertEqual(result['label'].lod(), []) + self.assertEqual(result['image'].recursive_sequence_lengths(), []) + self.assertEqual(result['label'].recursive_sequence_lengths(), []) def test_lod_level_1_converter(self): # lod_level = 1 @@ -42,12 +41,12 @@ class TestDataFeeder(unittest.TestCase): # label = [1] * len(data) result = feeder.feed( [([1, 2, 3], [1]), ([4, 5], 
[1]), ([6, 7, 8, 9], [1])]) - print(result) self.assertEqual(result['sentences'].shape(), [9, 1]) self.assertEqual(result['label'].shape(), [3, 1]) - self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]]) - self.assertEqual(result['label'].lod(), []) + self.assertEqual(result['sentences'].recursive_sequence_lengths(), + [[3, 2, 4]]) + self.assertEqual(result['label'].recursive_sequence_lengths(), []) def test_lod_level_2_converter(self): # lod_level = 2 @@ -62,12 +61,12 @@ class TestDataFeeder(unittest.TestCase): # label = [1] * len(data) result = feeder.feed( [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])]) - print(result) self.assertEqual(result['paragraphs'].shape(), [9, 1]) self.assertEqual(result['label'].shape(), [2, 1]) - self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]]) - self.assertEqual(result['label'].lod(), []) + self.assertEqual(result['paragraphs'].recursive_sequence_lengths(), + [[2, 1], [3, 2, 4]]) + self.assertEqual(result['label'].recursive_sequence_lengths(), []) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 8569d838bdd414eb84c6c87674990a25a2fdcdf9..2d70c986b1b6c42ff709e9cf3b4234cf4fc26836 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -127,6 +127,24 @@ class TestPriorBox(unittest.TestCase): assert box.shape[3] == 4 +class TestAnchorGenerator(unittest.TestCase): + def test_anchor_generator(self): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + anchor, var = fluid.layers.anchor_generator( + input=conv1, + anchor_sizes=[64, 128, 256, 512], + aspect_ratios=[0.5, 1.0, 2.0], + variance=[0.1, 0.1, 0.2, 0.2], + stride=[16.0, 16.0], + offset=0.5) + assert len(anchor.shape) == 4 + assert anchor.shape == var.shape + assert anchor.shape[3] == 4 + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py new file mode 100644 index 0000000000000000000000000000000000000000..1b58925599de62510ea9048f5210bb0b7e49f933 --- /dev/null +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -0,0 +1,208 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
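The DataFeeder assertions above switch from offset-based lod values to length-based recursive_sequence_lengths; both encodings describe the same structure. A small conversion helper, given purely as an illustration and not part of this patch:

def offsets_to_lengths(offset_lod):
    # e.g. [[0, 2, 3], [0, 3, 5, 9]] -> [[2, 1], [3, 2, 4]]
    return [[level[i + 1] - level[i] for i in range(len(level) - 1)]
            for level in offset_lod]


def lengths_to_offsets(length_lod):
    # e.g. [[2, 1], [3, 2, 4]] -> [[0, 2, 3], [0, 3, 5, 9]]
    offset_lod = []
    for level in length_lod:
        offsets = [0]
        for length in level:
            offsets.append(offsets[-1] + length)
        offset_lod.append(offsets)
    return offset_lod


assert offsets_to_lengths([[0, 2, 3], [0, 3, 5, 9]]) == [[2, 1], [3, 2, 4]]
assert lengths_to_offsets([[2, 1], [3, 2, 4]]) == [[0, 2, 3], [0, 3, 5, 9]]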
+ +import paddle +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard +from paddle.fluid.executor import Executor +from paddle.fluid.optimizer import MomentumOptimizer +import paddle.fluid.core as core +import paddle.fluid as fluid +import unittest +import numpy as np + + +class TestMNISTIfElseOp(unittest.TestCase): + def test_raw_api(self): + prog = Program() + startup_prog = Program() + with program_guard(prog, startup_prog): + image = layers.data(name='x', shape=[784], dtype='float32') + + label = layers.data(name='y', shape=[1], dtype='int64') + + limit = layers.fill_constant(shape=[1], dtype='int64', value=5) + cond = layers.less_than(x=label, y=limit) + true_image, false_image = layers.split_lod_tensor( + input=image, mask=cond) + + true_out = layers.create_tensor(dtype='float32') + true_cond = layers.ConditionalBlock([cond]) + + with true_cond.block(): + hidden = layers.fc(input=true_image, size=100, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + layers.assign(input=prob, output=true_out) + + false_out = layers.create_tensor(dtype='float32') + false_cond = layers.ConditionalBlock([cond]) + + with false_cond.block(): + hidden = layers.fc(input=false_image, size=200, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + layers.assign(input=prob, output=false_out) + + prob = layers.merge_lod_tensor( + in_true=true_out, in_false=false_out, mask=cond, x=image) + loss = layers.cross_entropy(input=prob, label=label) + avg_loss = layers.mean(loss) + + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer.minimize(avg_loss, startup_prog) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=10) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(startup_prog) + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for data in train_reader(): + x_data = np.array(map(lambda x: x[0], data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = np.expand_dims(y_data, axis=1) + + outs = exe.run(prog, + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) + print outs[0] + if outs[0] < 1.0: + return + self.assertFalse(True) + + def test_ifelse(self): + prog = Program() + startup_prog = Program() + with program_guard(prog, startup_prog): + image = layers.data(name='x', shape=[784], dtype='float32') + + label = layers.data(name='y', shape=[1], dtype='int64') + + limit = layers.fill_constant(shape=[1], dtype='int64', value=5) + cond = layers.less_than(x=label, y=limit) + ie = layers.IfElse(cond) + + with ie.true_block(): + true_image = ie.input(image) + hidden = layers.fc(input=true_image, size=100, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + + with ie.false_block(): + false_image = ie.input(image) + hidden = layers.fc(input=false_image, size=200, act='tanh') + prob = layers.fc(input=hidden, size=10, act='softmax') + ie.output(prob) + + prob = ie() + loss = layers.cross_entropy(input=prob[0], label=label) + avg_loss = layers.mean(loss) + + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer.minimize(avg_loss, startup_prog) + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=200) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(startup_prog) + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for data in train_reader(): + x_data = 
np.array(map(lambda x: x[0], data)).astype("float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape((y_data.shape[0], 1)) + + outs = exe.run(prog, + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) + print outs[0] + if outs[0] < 1.0: + return + self.assertFalse(True) + + +class TestIfElse(unittest.TestCase): + def set_test_case(self): + # condiction is: self.data < self.cond_value + self.cond_value = 0.5 + self.data = np.random.rand(25, 1).astype(np.float32) + + def compare_ifelse_op_and_numpy(self, place): + self.set_test_case() + + prog = Program() + startup_prog = Program() + with program_guard(prog, startup_prog): + src = layers.data(name='data', shape=[1], dtype='float32') + cond = layers.fill_constant( + [1], dtype='float32', value=self.cond_value) + ifcond = layers.less_than(x=src, y=cond) + ie = layers.IfElse(ifcond) + with ie.true_block(): + true_target = ie.input(src) + ie.output(true_target) + + with ie.false_block(): + false_target = ie.input(src) + ie.output(false_target) + if_out = ie() + out = layers.reduce_sum(if_out) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + fetch_list = [out] + o1, = exe.run(fluid.default_main_program(), + feed={'data': self.data}, + fetch_list=[out]) + o2 = np.sum(self.data) + self.assertTrue( + np.allclose( + o1, o2, atol=1e-8), + "IfElse result : " + str(o1) + "\n Numpy result :" + str(o2)) + + def test_cpu(self): + self.compare_ifelse_op_and_numpy(fluid.CPUPlace()) + + def test_cuda(self): + if not core.is_compiled_with_cuda(): + return + self.compare_ifelse_op_and_numpy(fluid.CUDAPlace(0)) + + +class TestIfElseTrueBranch(TestIfElse): + def set_test_case(self): + # condiction is: self.data < self.cond_value + self.cond_value = 10. + self.data = np.random.rand(25, 1).astype(np.float32) + + +class TestIfElseFalseBranch(TestIfElse): + def set_test_case(self): + # condiction is: self.data < self.cond_value + self.cond_value = -10. + self.data = np.random.rand(25, 1).astype(np.float32) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index 013d72f418cf7ac11eb31fd221052039e896e203..f7a9dd4129027417a06a6c25ff9a801fff259c5e 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -13,77 +13,85 @@ # limitations under the License. 
import paddle.fluid as fluid -from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod -import numpy +from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor +import numpy as np import unittest class TestLoDTensor(unittest.TestCase): - def test_validate_lod(self): - lod = (1, 2, 1) - self.assertRaises(AssertionError, _validate_lod, lod, -1) - lod = [[1, 2], (2, 3)] - self.assertRaises(AssertionError, _validate_lod, lod, -1) - lod = [1, 2, 3] - self.assertRaises(AssertionError, _validate_lod, lod, -1) + def test_pybind_recursive_seq_lens(self): + tensor = fluid.LoDTensor() + recursive_seq_lens = [] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + recursive_seq_lens = [[], [1], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, + recursive_seq_lens) + recursive_seq_lens = [[0], [2], [3]] + self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, + recursive_seq_lens) - lod = [] - self.assertTrue(_validate_lod(lod, -1)) - lod = [[], [1], [3]] - self.assertFalse(_validate_lod(lod, -1)) - lod = [[0], [-1], [3]] - self.assertFalse(_validate_lod(lod, -1)) + recursive_seq_lens = [[1, 2, 3]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) + tensor.set(np.random.random([6, 1]), fluid.CPUPlace()) + self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) + tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) + self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) # Each level's sum should be equal to the number of items in the next level # Moreover, last level's sum should be equal to the tensor height - lod = [[2, 3], [1, 3, 1, 2, 1]] - self.assertTrue(_validate_lod(lod, tensor_height=8)) - lod = [[1, 3], [2, 1, 3]] - self.assertFalse(_validate_lod(lod, tensor_height=6)) - lod = [[1, 3], [2, 1, 3, 4]] - self.assertFalse(_validate_lod(lod, tensor_height=5)) - - def test_convert_lod(self): - lod = [[1, 2, 3]] - converted_lod = [[0, 1, 3, 6]] - self.assertEqual(_convert_lod(lod), converted_lod) - - lod = [[2, 3], [1, 3, 1, 2, 1]] - converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]] - self.assertEqual(_convert_lod(lod), converted_lod) + recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) + tensor.set(np.random.random([8, 1]), fluid.CPUPlace()) + self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) + recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]] + tensor.set_recursive_sequence_lengths(recursive_seq_lens) + self.assertTrue(tensor.has_valid_recursive_sequence_lengths()) + tensor.set(np.random.random([9, 1]), fluid.CPUPlace()) + self.assertFalse(tensor.has_valid_recursive_sequence_lengths()) def test_create_lod_tensor(self): # Create LoDTensor from a list data = [[1, 2, 3], [3, 4]] - wrong_lod = [[2, 2]] - correct_lod = [[3, 2]] - self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod, - fluid.CPUPlace()) - tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace()) - self.assertEqual(tensor.lod(), [[0, 3, 5]]) + wrong_recursive_seq_lens = [[2, 2]] + correct_recursive_seq_lens = [[3, 2]] + self.assertRaises(AssertionError, create_lod_tensor, data, + wrong_recursive_seq_lens, fluid.CPUPlace()) + tensor = create_lod_tensor(data, correct_recursive_seq_lens, + fluid.CPUPlace()) + 
self.assertEqual(tensor.recursive_sequence_lengths(), + correct_recursive_seq_lens) # Create LoDTensor from numpy array - data = numpy.random.random([10, 1]) - lod = [[2, 1], [3, 3, 4]] - tensor = create_lod_tensor(data, lod, fluid.CPUPlace()) - self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]]) + data = np.random.random([10, 1]) + recursive_seq_lens = [[2, 1], [3, 3, 4]] + tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) # Create LoDTensor from another LoDTensor, they are differnt instances - new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]] - new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace()) - self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]]) - self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]]) + new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]] + new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens, + fluid.CPUPlace()) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) + self.assertEqual(new_tensor.recursive_sequence_lengths(), + new_recursive_seq_lens) def test_create_random_int_lodtensor(self): # The shape of a word, commonly used in speech and NLP problem, is [1] shape = [1] - lod = [[2, 3, 5]] + recursive_seq_lens = [[2, 3, 5]] dict_size = 10000 low = 0 high = dict_size - 1 - tensor = create_random_int_lodtensor(lod, shape, + tensor = create_random_int_lodtensor(recursive_seq_lens, shape, fluid.CPUPlace(), low, high) - self.assertEqual(tensor.lod(), [[0, 2, 5, 10]]) + self.assertEqual(tensor.recursive_sequence_lengths(), + recursive_seq_lens) self.assertEqual(tensor.shape(), [10, 1]) diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_mnist_if_else_op.py deleted file mode 100644 index d34f52db5ffc889f17513d034ad2c99f696b0cdf..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/test_mnist_if_else_op.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
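The validity rule exercised by test_pybind_recursive_seq_lens above, restated as a standalone sketch (not part of this patch): every level must sum to the number of entries in the next level, and the last level must sum to the tensor height.

import numpy as np
import paddle.fluid as fluid

tensor = fluid.LoDTensor()
# The first level sums to 5, matching the 5 entries of the second level;
# the second level sums to 8, matching the 8 rows of data set below.
tensor.set_recursive_sequence_lengths([[2, 3], [1, 3, 1, 2, 1]])
tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
print(tensor.has_valid_recursive_sequence_lengths())  # True
# With 9 rows the last level no longer matches the tensor height.
tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
print(tensor.has_valid_recursive_sequence_lengths())  # False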
- -import paddle -import paddle.fluid.layers as layers -from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program -from paddle.fluid.executor import Executor -from paddle.fluid.optimizer import MomentumOptimizer -import paddle.fluid.core as core -import unittest -import numpy as np - - -class TestMNISTIfElseOp(unittest.TestCase): - def test_raw_api(self): - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): - image = layers.data(name='x', shape=[784], dtype='float32') - - label = layers.data(name='y', shape=[1], dtype='int64') - - limit = layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0) - cond = layers.less_than(x=label, y=limit) - true_image, false_image = layers.split_lod_tensor( - input=image, mask=cond) - - true_out = layers.create_tensor(dtype='float32') - true_cond = layers.ConditionalBlock([true_image]) - - with true_cond.block(): - hidden = layers.fc(input=true_image, size=100, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - layers.assign(input=prob, output=true_out) - - false_out = layers.create_tensor(dtype='float32') - false_cond = layers.ConditionalBlock([false_image]) - - with false_cond.block(): - hidden = layers.fc(input=false_image, size=200, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - layers.assign(input=prob, output=false_out) - - prob = layers.merge_lod_tensor( - in_true=true_out, in_false=false_out, mask=cond, x=image) - loss = layers.cross_entropy(input=prob, label=label) - avg_loss = layers.mean(loss) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, startup_prog) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=200) - - place = core.CPUPlace() - exe = Executor(place) - - exe.run(startup_prog) - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - outs = exe.run(prog, - feed={'x': x_data, - 'y': y_data}, - fetch_list=[avg_loss]) - print outs[0] - if outs[0] < 1.0: - return - self.assertFalse(True) - - def test_ifelse(self): - prog = Program() - startup_prog = Program() - with program_guard(prog, startup_prog): - image = layers.data(name='x', shape=[784], dtype='float32') - - label = layers.data(name='y', shape=[1], dtype='int64') - - limit = layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0) - cond = layers.less_than(x=label, y=limit) - ie = layers.IfElse(cond) - - with ie.true_block(): - true_image = ie.input(image) - hidden = layers.fc(input=true_image, size=100, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - ie.output(prob) - - with ie.false_block(): - false_image = ie.input(image) - hidden = layers.fc(input=false_image, size=200, act='tanh') - prob = layers.fc(input=hidden, size=10, act='softmax') - ie.output(prob) - - prob = ie() - loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = layers.mean(loss) - - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(avg_loss, startup_prog) - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=200) - - place = core.CPUPlace() - exe = Executor(place) - - 
exe.run(kwargs['startup_program']) - PASS_NUM = 100 - for pass_id in range(PASS_NUM): - for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape((y_data.shape[0], 1)) - - outs = exe.run(kwargs['main_program'], - feed={'x': x_data, - 'y': y_data}, - fetch_list=[avg_loss]) - print outs[0] - if outs[0] < 1.0: - return - self.assertFalse(True) - - -if __name__ == '__main__': - # temp disable if else unittest since it could be buggy. - exit(0) diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore index 3538a9c2009bb133609153427981fb66974377fa..b1e8fda03aa42f5f7528eafb46c16d55b868bae5 100644 --- a/python/paddle/fluid/tests/unittests/.gitignore +++ b/python/paddle/fluid/tests/unittests/.gitignore @@ -4,3 +4,5 @@ mnist_1.recordio mnist_2.recordio flowers.recordio wmt16.recordio +data_balance_test.recordio +data_balance_with_lod_test.recordio diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 21182393bd68db4a379fc3ecf83fc85d27ca9490..f6c8dcabcbc592024188f4742e6c532a704d2289 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -15,7 +15,7 @@ if(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 -list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 +list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 @@ -43,8 +43,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) -# TODO(wuyi): this test hungs on CI, will add it back later -list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -52,3 +50,6 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$ py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) +set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) +set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 307caae4b0cf4869c1abb755215aa97795d47e15..e056ef9952a519d6c4d580b27f1118a3a91f13af 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -162,7 +162,7 @@ class OpTest(unittest.TestCase): tensor = core.LoDTensor() if isinstance(np_value, tuple): tensor.set(np_value[0], place) - 
tensor.set_lod(np_value[1]) + tensor.set_recursive_sequence_lengths(np_value[1]) else: tensor.set(np_value, place) feed_map[name] = tensor @@ -170,7 +170,8 @@ class OpTest(unittest.TestCase): tensor = core.LoDTensor() if isinstance(self.inputs[var_name], tuple): tensor.set(self.inputs[var_name][0], place) - tensor.set_lod(self.inputs[var_name][1]) + tensor.set_recursive_sequence_lengths(self.inputs[var_name][ + 1]) else: tensor.set(self.inputs[var_name], place) feed_map[var_name] = tensor @@ -293,7 +294,8 @@ class OpTest(unittest.TestCase): str(place)) if isinstance(expect, tuple): self.assertListEqual( - actual.lod(), expect[1], "Output (" + sub_out_name + + actual.recursive_sequence_lengths(), expect[1], + "Output (" + sub_out_name + ") has different lod at " + str(place)) else: idx = find_actual(out_name, fetch_list) @@ -307,8 +309,8 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has diff at " + str(place) + str(actual_t) + "\n" + str(expect_t)) if isinstance(expect, tuple): - self.assertListEqual(actual.lod(), expect[1], - "Output (" + out_name + + self.assertListEqual(actual.recursive_sequence_lengths(), + expect[1], "Output (" + out_name + ") has different lod at " + str(place)) def _get_places(self): @@ -408,7 +410,7 @@ class OpTest(unittest.TestCase): tensor = core.LoDTensor() tensor.set(np_value, place) if lod is not None: - tensor.set_lod(lod) + tensor.set_recursive_sequence_lengths(lod) return tensor @staticmethod diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 829c5a1a5fd099543e9e98b9587d4f316a91b587..cddf00765f4894126988c794763c34629449e8e6 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -18,6 +18,8 @@ import unittest import paddle.fluid as fluid import time import numpy as np +import math +import sys __all__ = ['TestParallelExecutorBase'] @@ -81,7 +83,6 @@ class TestParallelExecutorBase(unittest.TestCase): begin = time.time() first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) - first_loss = np.array(first_loss) for i in xrange(iter): run_executor(exe=exe, feed=feed_dict, fetch_list=[]) @@ -94,7 +95,11 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) - last_loss = np.array(last_loss) + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9c7d5d41f0c512a9fb609dce304c1eed929d28b5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://w_idxw.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios, + variances, stride, offset): + num_anchors = len(aspect_ratios) * len(anchor_sizes) + layer_h = input_feat.shape[2] + layer_w = input_feat.shape[3] + out_dim = (layer_h, layer_w, num_anchors, 4) + out_anchors = np.zeros(out_dim).astype('float32') + + for h_idx in range(layer_h): + for w_idx in range(layer_w): + x_ctr = (w_idx * stride[0]) + offset * (stride[0] - 1) + y_ctr = (h_idx * stride[1]) + offset * (stride[1] - 1) + idx = 0 + for r in range(len(aspect_ratios)): + ar = aspect_ratios[r] + for s in range(len(anchor_sizes)): + anchor_size = anchor_sizes[s] + area = stride[0] * stride[1] + area_ratios = area / ar + base_w = np.round(np.sqrt(area_ratios)) + base_h = np.round(base_w * ar) + scale_w = anchor_size / stride[0] + scale_h = anchor_size / stride[1] + w = scale_w * base_w + h = scale_h * base_h + out_anchors[h_idx, w_idx, idx, :] = [ + (x_ctr - 0.5 * (w - 1)), (y_ctr - 0.5 * (h - 1)), + (x_ctr + 0.5 * (w - 1)), (y_ctr + 0.5 * (h - 1)) + ] + idx += 1 + + # set the variance. + out_var = np.tile(variances, (layer_h, layer_w, num_anchors, 1)) + out_anchors = out_anchors.astype('float32') + out_var = out_var.astype('float32') + return out_anchors, out_var + + +class TestAnchorGeneratorOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input} + + self.attrs = { + 'anchor_sizes': self.anchor_sizes, + 'aspect_ratios': self.aspect_ratios, + 'stride': self.stride, + 'offset': self.offset, + 'variances': self.variances, + } + + self.outputs = {'Anchors': self.out_anchors, 'Variances': self.out_var} + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "anchor_generator" + self.set_data() + + def init_test_params(self): + self.batch_size = 1 + self.input_channels = 2 + self.layer_h = 2 + self.layer_w = 2 + + self.anchor_sizes = [64., 128., 256., 512.] + self.aspect_ratios = [0.5, 1., 2.] + self.stride = [16., 16.] + + self.offset = 0.5 + + self.variances = [0.1, 0.1, 0.2, 0.2] + + def init_test_input(self): + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_h, + self.layer_w)).astype('float32') + + def init_test_output(self): + self.out_anchors, self.out_var = anchor_generator_in_python( + self.input, self.anchor_sizes, self.aspect_ratios, self.variances, + self.stride, self.offset) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b29a102a3880406156481fdac54ca7043d3415db --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
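To make the arithmetic in anchor_generator_in_python above concrete, a worked example (not part of this patch) for a single anchor under the test's default parameters: stride = [16, 16], offset = 0.5, aspect_ratio = 0.5, anchor_size = 64, at feature-map cell (0, 0).

import numpy as np

stride, offset, ar, anchor_size = [16., 16.], 0.5, 0.5, 64.
x_ctr = 0 * stride[0] + offset * (stride[0] - 1)   # 7.5
y_ctr = 0 * stride[1] + offset * (stride[1] - 1)   # 7.5
area = stride[0] * stride[1]                       # 256.0
base_w = np.round(np.sqrt(area / ar))              # 23.0
base_h = np.round(base_w * ar)                     # 12.0
w = (anchor_size / stride[0]) * base_w             # 92.0
h = (anchor_size / stride[1]) * base_h             # 48.0
print([x_ctr - 0.5 * (w - 1), y_ctr - 0.5 * (h - 1),
       x_ctr + 0.5 * (w - 1), y_ctr + 0.5 * (h - 1)])
# [-38.0, -16.0, 53.0, 31.0]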
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestArgsortOp(OpTest): + def setUp(self): + self.init_axis() + x = np.random.random((2, 3, 4, 5, 10)).astype("float32") + if self.axis < 0: + self.axis = self.axis + len(x.shape) + self.indices = np.argsort(x, kind='quicksort', axis=self.axis) + self.out = np.sort(x, kind='quicksort', axis=self.axis) + self.op_type = "argsort" + self.inputs = {'X': x} + self.attrs = {'axis': self.axis} + self.outputs = {'Indices': self.indices, 'Out': self.out} + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output() + + +class TestArgsortOpAxis0(TestArgsortOp): + def init_axis(self): + self.axis = 0 + + +class TestArgsortOpAxis1(TestArgsortOp): + def init_axis(self): + self.axis = 1 + + +class TestArgsortOpAxisNeg2(TestArgsortOp): + def init_axis(self): + self.axis = -2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py index f6097d4b846e8da1c4ee3cc49b31f9873660056d..18fa5461590134d2032a29e40699109c12092c6d 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py @@ -52,5 +52,17 @@ class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference): self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) +class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference): + def init_kernel_type(self): + self.use_mkldnn = True + self.fuse_with_relu = True + + def test_check_output(self): + place = core.CPUPlace() + data_format = "NCHW" + + self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 4216d83653b27ec7f18034e576fbedbecc3f1cfe..a62ee9596d0f6c58135b4a13249b638e84e63c3c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -128,7 +128,7 @@ def create_or_get_tensor(scope, var_name, var, place): tensor = scope.var(var_name).get_tensor() if var is not None: assert isinstance(var, np.ndarray) - tensor.set_lod([[]]) + tensor.set_recursive_sequence_lengths([]) tensor.set_dims(var.shape) tensor.set(var, place) return tensor @@ -159,6 +159,7 @@ class TestBatchNormOpInference(unittest.TestCase): def setUp(self): self.dtype = np.float32 self.use_mkldnn = False + self.fuse_with_relu = False self.init_kernel_type() def __assert_close(self, tensor, np_array, msg, atol=1e-4): @@ -180,6 +181,8 @@ class TestBatchNormOpInference(unittest.TestCase): scale_shape = [c] x_val = np.random.random_sample(x_shape).astype(dtype) + # generate some negative values to test case with relu fused + x_val = x_val - 0.5 scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32) @@ -188,6 +191,8 @@ class 
TestBatchNormOpInference(unittest.TestCase): y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance, epsilon, data_layout).astype(dtype) + if self.fuse_with_relu: + y_out = np.maximum(y_out, 0) scope = core.Scope() @@ -233,6 +238,7 @@ class TestBatchNormOpInference(unittest.TestCase): is_test=True, data_layout=data_layout, use_mkldnn=self.use_mkldnn, + fuse_with_relu=self.fuse_with_relu, epsilon=epsilon) batch_norm_op.run(scope, place) @@ -265,6 +271,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): def setUp(self): self.dtype = np.float16 self.use_mkldnn = False + self.fuse_with_relu = False self.init_kernel_type() def test_check_output(self): @@ -284,6 +291,7 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.use_mkldnn = False + self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] self.init_kernel_type() @@ -367,7 +375,8 @@ class TestBatchNormOpTraining(unittest.TestCase): "epsilon": epsilon, "is_test": False, "data_layout": data_layout, - "use_mkldnn": self.use_mkldnn + "use_mkldnn": self.use_mkldnn, + "fuse_with_relu": self.fuse_with_relu }) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index 7976dd7c3f14390fb00bc8ab39121b6a686e3039..db5771f7b0ad74c73b81d502209c17dce3ce8457 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -20,6 +20,8 @@ from paddle.fluid.op import Operator class TestBeamSearchDecodeOp(unittest.TestCase): + """unittest of beam_search_decode_op""" + def setUp(self): self.scope = core.Scope() self.place = core.CPUPlace() @@ -32,32 +34,44 @@ class TestBeamSearchDecodeOp(unittest.TestCase): def test_get_set(self): ids = self.scope.var("ids").get_lod_tensor_array() - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="int64")) - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="int64")) - self.append_lod_tensor( - ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], - np.array( - [0, 1, 2, 3, 4], dtype="int64")) - scores = self.scope.var("scores").get_lod_tensor_array() - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]], - np.array( - [1, 2, 3, 4, 5, 6], dtype="float64")) - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]], - np.array( - [0, 1, 2, 3, 4, 5], dtype="float64")) - self.append_lod_tensor( - scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]], - np.array( - [0, 1, 2, 3, 4], dtype="float64")) + # Construct sample data with 5 steps and 2 source sentences + # beam_size = 2, end_id = 1 + # start with start_id + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 1, 2]], np.array( + [0, 0], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 1, 2], [0, 2, 4]], + np.array( + [2, 3, 4, 5], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 2, 2, 4, 4]], + np.array( + [3, 1, 5, 4], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 1, 2, 3, 4]], + np.array( + [1, 1, 3, 5], dtype=dtype)) + for array, dtype in ((ids, 
"int64"), (scores, "float32")) + ] + [ + self.append_lod_tensor( + array, [[0, 2, 4], [0, 0, 0, 2, 2]], + np.array( + [5, 1], dtype=dtype)) + for array, dtype in ((ids, "int64"), (scores, "float32")) + ] sentence_ids = self.scope.var("sentence_ids").get_tensor() sentence_scores = self.scope.var("sentence_scores").get_tensor() @@ -69,16 +83,18 @@ class TestBeamSearchDecodeOp(unittest.TestCase): Scores="scores", # outputs SentenceIds="sentence_ids", - SentenceScores="sentence_scores") + SentenceScores="sentence_scores", + beam_size=2, + end_id=1, ) beam_search_decode_op.run(self.scope, self.place) - expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]] + expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]] self.assertEqual(sentence_ids.lod(), expected_lod) self.assertEqual(sentence_scores.lod(), expected_lod) expected_data = np.array( - [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64") + [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) self.assertTrue( np.array_equal(np.array(sentence_scores), expected_data)) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index bc708f3aff54f54d290684d68afa503a50a32dac..167451edd8c46c006c8019678a304a38f18cb946 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data): class BeamSearchOpTester(unittest.TestCase): + """unittest of beam_search_op""" + def setUp(self): self.scope = core.Scope() self._create_ids() + self._create_pre_scores() self._create_scores() self._create_pre_ids() self.scope.var('selected_ids') @@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase): def test_run(self): op = Operator( 'beam_search', - pre_ids="pre_ids", + pre_ids='pre_ids', + pre_scores='pre_scores', ids='ids', scores='scores', selected_ids='selected_ids', @@ -47,15 +51,27 @@ class BeamSearchOpTester(unittest.TestCase): end_id=0, ) op.run(self.scope, core.CPUPlace()) selected_ids = self.scope.find_var("selected_ids").get_tensor() - print 'selected_ids', np.array(selected_ids) - print 'lod', selected_ids.lod() + selected_scores = self.scope.find_var("selected_scores").get_tensor() + self.assertTrue( + np.allclose( + np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis])) + self.assertTrue( + np.allclose( + np.array(selected_scores), + np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis])) + self.assertEqual(selected_ids.lod(), + [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]]) def _create_pre_ids(self): np_data = np.array([[1, 2, 3, 4]], dtype='int64') - tensor = create_tensor(self.scope, "pre_ids", np_data) + tensor = create_tensor(self.scope, 'pre_ids', np_data) + + def _create_pre_scores(self): + np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32') + tensor = create_tensor(self.scope, 'pre_scores', np_data) def _create_ids(self): - self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]] + self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]] np_data = np.array( [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64') tensor = create_tensor(self.scope, "ids", np_data) diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index 87c11e7880e73b911f21dda77c1cc2b4850b3591..b04f25ef874cc6204211a4f5f5991a0ec8c473dd 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core def bilinear_interp_np(input, out_h, out_w, out_size): @@ -45,9 +46,9 @@ def bilinear_interp_np(input, out_h, out_w, out_size): out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + w1lambda*input[:, :, h, w+wid]) + \ - h1lambda*(w2lambda*input[:, :, h+hid, w] + - w1lambda*input[:, :, h+hid, w+wid]) - return out.astype("float32") + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + return out.astype(input.dtype) class TestBilinearInterpOp(OpTest): @@ -122,5 +123,44 @@ class TestCase6(TestBilinearInterpOp): self.out_size = np.array([65, 129]).astype("int32") +class TestBilinearInterpOpUint8(OpTest): + def setUp(self): + self.out_size = None + self.init_test_case() + self.op_type = "bilinear_interp" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + output_np = bilinear_interp_np(input_np, self.out_h, self.out_w, + self.out_size) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + self.attrs = {'out_h': self.out_h, 'out_w': self.out_w} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(place=core.CPUPlace(), atol=1) + + def init_test_case(self): + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + + +class TestCase1Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.input_shape = [2, 3, 128, 64] + self.out_h = 120 + self.out_w = 50 + + +class TestCase2Uint8(TestBilinearInterpOpUint8): + def init_test_case(self): + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.out_size = np.array([6, 15]).astype("int32") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index f7461ee6dab699064153332116449c8e20a0bac0..d5bd726c4a82ee839703c69a933100bb056cb736 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -65,23 +65,25 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): distance (numpy.array) : The distance of two entries with shape [M, N]. lod (list of int): The offsets of each input in this batch. 
""" - n = len(lod) - 1 + n = len(lod) m = distance.shape[1] match_indices = -1 * np.ones((n, m), dtype=np.int) match_dist = np.zeros((n, m), dtype=np.float32) - for i in range(len(lod) - 1): - bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], - match_dist[i, :]) + cur_offset = 0 + for i in range(n): + bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :], + match_indices[i, :], match_dist[i, :]) if match_type == 'per_prediction': - argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :], - match_dist[i, :], dist_threshold) + argmax_match(distance[cur_offset:(cur_offset + lod[i]), :], + match_indices[i, :], match_dist[i, :], dist_threshold) + cur_offset += lod[i] return match_indices, match_dist class TestBipartiteMatchOpWithLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' - lod = [[0, 5, 11, 23]] + lod = [[5, 6, 12]] dist = np.random.random((23, 217)).astype('float32') match_indices, match_dist = batch_bipartite_match(dist, lod[0]) @@ -98,7 +100,7 @@ class TestBipartiteMatchOpWithLoD(OpTest): class TestBipartiteMatchOpWithoutLoD(OpTest): def setUp(self): self.op_type = 'bipartite_match' - lod = [[0, 8]] + lod = [[8]] dist = np.random.random((8, 17)).astype('float32') match_indices, match_dist = batch_bipartite_match(dist, lod[0]) @@ -112,10 +114,27 @@ class TestBipartiteMatchOpWithoutLoD(OpTest): self.check_output() +class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest): + def setUp(self): + self.op_type = 'bipartite_match' + lod = [[300]] + dist = np.random.random((300, 17)).astype('float32') + match_indices, match_dist = batch_bipartite_match(dist, lod[0]) + + self.inputs = {'DistMat': dist} + self.outputs = { + 'ColToRowMatchIndices': match_indices, + 'ColToRowMatchDist': match_dist, + } + + def test_check_output(self): + self.check_output() + + class TestBipartiteMatchOpWithPerPredictionType(OpTest): def setUp(self): self.op_type = 'bipartite_match' - lod = [[0, 5, 11, 23]] + lod = [[5, 6, 12]] dist = np.random.random((23, 237)).astype('float32') match_indices, match_dist = batch_bipartite_match(dist, lod[0], 'per_prediction', 0.5) diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index b4c48d85f2c564d877c0a29e64dd2944d2b26ea3..4ce9a4783e2332b6882164a70e1462c6a6d31bef 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -81,15 +81,19 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type, n = target_box.shape[0] m = prior_box.shape[0] output_box = np.zeros((n, m, 4), dtype=np.float32) - for i in range(len(lod) - 1): + cur_offset = 0 + for i in range(len(lod)): if (code_type == "EncodeCenterSize"): - box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, - prior_box_var, output_box[lod[i]:lod[i + 1], :, :], + box_coder(target_box[cur_offset:(cur_offset + lod[i]), :], + prior_box, prior_box_var, + output_box[cur_offset:(cur_offset + lod[i]), :, :], code_type, box_normalized) elif (code_type == "DecodeCenterSize"): - box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box, - prior_box_var, output_box[lod[i]:lod[i + 1], :, :], + box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :], + prior_box, prior_box_var, + output_box[cur_offset:(cur_offset + lod[i]), :, :], code_type, box_normalized) + cur_offset += lod[i] return output_box @@ -99,7 +103,7 @@ class TestBoxCoderOp(OpTest): def setUp(self): self.op_type = "box_coder" - lod = [[0, 1, 2, 3, 4, 
5]] + lod = [[1, 1, 1, 1, 1]] prior_box = np.random.random((10, 4)).astype('float32') prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((5, 10, 4)).astype('float32') @@ -152,7 +156,7 @@ class TestBoxCoderOpWithLoD(OpTest): def setUp(self): self.op_type = "box_coder" - lod = [[0, 4, 12, 20]] + lod = [[4, 8, 8]] prior_box = np.random.random((10, 4)).astype('float32') prior_box_var = np.random.random((10, 4)).astype('float32') target_box = np.random.random((20, 4)).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py index 050df2801c98e8f4167cdd1b4dde858c9f9f07dd..23932194f0ca97954ec9ade3fdcaebd7a32749a0 100644 --- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py +++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py @@ -144,10 +144,10 @@ class TestChunkEvalOp(OpTest): starts = sorted(starts) self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks( infer, label, starts) - self.inputs = { - 'Inference': (infer, [starts]), - 'Label': (label, [starts]) - } + lod = [] + for i in range(len(starts) - 1): + lod.append(starts[i + 1] - starts[i]) + self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])} precision = float( self.num_correct_chunks ) / self.num_infer_chunks if self.num_infer_chunks else 0 diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 1e00d67d5480bfa77a60e1aed52cafac6e8242ca..e9f3c45dc40b3333fe7304f8e4313d156bd5374c 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -43,7 +43,7 @@ class TestConcatOp(OpTest): self.axis = 1 -class TestConcatOp2(OpTest): +class TestConcatOp2(TestConcatOp): def init_test_data(self): self.x0 = np.random.random((2, 3, 4, 5)).astype('float32') self.x1 = np.random.random((2, 3, 4, 5)).astype('float32') @@ -51,5 +51,16 @@ class TestConcatOp2(OpTest): self.axis = 1 +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype('float32') + self.x1 = np.random.random((1, 128, 170, 256)).astype('float32') + self.x2 = np.random.random((1, 128, 170, 256)).astype('float32') + self.axis = 1 + + def test_check_grad(self): + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index ded2f130288a4a959a1c859b2cc8ccf0912efb12..07545e7feb46c85a4b80f9b846be27d36cbfb59a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -242,6 +242,19 @@ class TestCUDNNWithGroups(TestWithGroups): self.op_type = "conv2d_transpose" +class TestDepthwiseConvTranspose(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 8, 16, 16] # NCHW + self.groups = 8 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] / self.groups + self.filter_size = [self.input_size[1], f_c, 4, 4] + self.op_type = "depthwise_conv2d_transpose" + + # Please Don't remove the following code. # Currently, CI use cudnn V5.0 which not support dilation conv. 
# class TestCUDNNWithDilation(TestWithDilation): diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py index f397f542bb07519886d75618e2a915c2dbf61fce..122b076c2d3e3a69f52a2c335e2bc89707b4fa9b 100644 --- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py +++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py @@ -22,9 +22,9 @@ from op_test import OpTest class CRFDecoding(object): def __init__(self, emission_weights, transition_weights, seq_start_positions): - assert (emission_weights.shape[0] == seq_start_positions[-1]) + assert (emission_weights.shape[0] == sum(seq_start_positions)) self.tag_num = emission_weights.shape[1] - self.seq_num = len(seq_start_positions) - 1 + self.seq_num = len(seq_start_positions) self.seq_start_positions = seq_start_positions self.x = emission_weights @@ -34,9 +34,9 @@ class CRFDecoding(object): self.w = transition_weights[2:, :] self.track = np.zeros( - (seq_start_positions[-1], self.tag_num), dtype="int64") + (sum(seq_start_positions), self.tag_num), dtype="int64") self.decoded_path = np.zeros( - (seq_start_positions[-1], 1), dtype="int64") + (sum(seq_start_positions), 1), dtype="int64") def _decode_one_sequence(self, decoded_path, x): seq_len, tag_num = x.shape @@ -71,9 +71,11 @@ class CRFDecoding(object): decoded_path[i - 1] = max_idx = track[i, max_idx] def decode(self): + cur_pos = 0 for i in range(self.seq_num): - start = self.seq_start_positions[i] - end = self.seq_start_positions[i + 1] + start = cur_pos + cur_pos += self.seq_start_positions[i] + end = cur_pos self._decode_one_sequence(self.decoded_path[start:end, :], self.x[start:end, :]) return self.decoded_path @@ -90,11 +92,13 @@ class TestCRFDecodingOp1(OpTest): TAG_NUM = 17 MAX_SEQ_LEN = 10 - lod = [[0]] + lod = [[]] + total_len = 0 for i in range(SEQ_NUM): - lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN)) + lod[-1].append(random.randint(1, MAX_SEQ_LEN)) + total_len += lod[-1][-1] emission = np.random.uniform(-1, 1, - [lod[-1][-1], TAG_NUM]).astype("float64") + [total_len, TAG_NUM]).astype("float64") transition = np.random.uniform(-0.5, 0.5, [TAG_NUM + 2, TAG_NUM]).astype("float64") @@ -126,7 +130,8 @@ class TestCRFDecodingOp2(OpTest): self.op_type = "crf_decoding" TAG_NUM = 5 - lod = [[0, 1, 3, 6, 10]] + lod = [[1, 2, 3, 4]] + total_len = sum(lod[-1]) transition = np.repeat( np.arange( TAG_NUM, dtype="float64").reshape(1, TAG_NUM), @@ -135,13 +140,13 @@ class TestCRFDecodingOp2(OpTest): emission = np.repeat( np.arange( TAG_NUM, dtype="float64").reshape(1, TAG_NUM), - lod[-1][-1], + total_len, axis=0) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") + low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64") predicted_labels = np.ones( - (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1) + (total_len, 1), dtype="int64") * (TAG_NUM - 1) expected_output = (labels == predicted_labels).astype("int64") self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py index f166031a1cbbaa5e312f5c7919b39648d0dad013..131b4076f45ae25b45bb3f64da07a5c3aacc43d5 100644 --- a/python/paddle/fluid/tests/unittests/test_ctc_align.py +++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py @@ -22,14 +22,16 @@ from test_softmax_op import stable_softmax def CTCAlign(input, lod, blank, merge_repeated): lod0 = lod[0] result = [] - for i in range(len(lod0) - 1): + cur_offset = 0 + for i in 
range(len(lod0)): prev_token = -1 - for j in range(lod0[i], lod0[i + 1]): + for j in range(cur_offset, cur_offset + lod0[i]): token = input[j][0] if (token != blank) and not (merge_repeated and token == prev_token): result.append(token) prev_token = token + cur_offset += lod0[i] result = np.array(result).reshape([len(result), 1]).astype("int32") if len(result) == 0: result = np.array([-1]) @@ -39,7 +41,7 @@ def CTCAlign(input, lod, blank, merge_repeated): class TestCTCAlignOp(OpTest): def config(self): self.op_type = "ctc_align" - self.input_lod = [[0, 11, 18]] + self.input_lod = [[11, 7]] self.blank = 0 self.merge_repeated = False self.input = np.array( @@ -66,7 +68,7 @@ class TestCTCAlignOp(OpTest): class TestCTCAlignOpCase1(TestCTCAlignOp): def config(self): self.op_type = "ctc_align" - self.input_lod = [[0, 11, 19]] + self.input_lod = [[11, 8]] self.blank = 0 self.merge_repeated = True self.input = np.array( @@ -77,7 +79,7 @@ class TestCTCAlignOpCase1(TestCTCAlignOp): class TestCTCAlignOpCase2(TestCTCAlignOp): def config(self): self.op_type = "ctc_align" - self.input_lod = [[0, 4]] + self.input_lod = [[4]] self.blank = 0 self.merge_repeated = True self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32") diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py new file mode 100644 index 0000000000000000000000000000000000000000..6d810920d55ccf069ff408c553069e8f5e590271 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -0,0 +1,192 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid +import paddle.v2 as paddle +import numpy as np + + +class TestDataBalance(unittest.TestCase): + def prepare_data(self): + def fake_data_generator(): + for n in xrange(self.total_ins_num): + yield np.ones((3, 4)) * n, n + + # Prepare data + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch( + fake_data_generator, batch_size=self.batch_size) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name='image', shape=[3, 4], dtype='float32'), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( + self.data_file_name, reader, feeder) + + def prepare_lod_data(self): + def fake_data_generator(): + for n in xrange(1, self.total_ins_num + 1): + d1 = (np.ones((n, 3)) * n).astype('float32') + d2 = (np.array(n).reshape((1, 1))).astype('int32') + yield d1, d2 + + # Prepare lod data + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.recordio_writer.create_recordio_writer( + filename=self.lod_data_file_name) as writer: + eof = False + generator = fake_data_generator() + while (not eof): + data_batch = [ + np.array([]).reshape((0, 3)), np.array([]).reshape( + (0, 1)) + ] + lod = [0] + for _ in xrange(self.batch_size): + try: + ins = generator.next() + except StopIteration: + eof = True + break + for i, d in enumerate(ins): + data_batch[i] = np.concatenate( + (data_batch[i], d), axis=0) + lod.append(lod[-1] + ins[0].shape[0]) + if data_batch[0].shape[0] > 0: + for i, d in enumerate(data_batch): + t = fluid.LoDTensor() + t.set(data_batch[i], fluid.CPUPlace()) + if i == 0: + t.set_lod([lod]) + writer.append_tensor(t) + writer.complete_append_tensor() + + def setUp(self): + self.use_cuda = fluid.core.is_compiled_with_cuda() + self.data_file_name = './data_balance_test.recordio' + self.lod_data_file_name = './data_balance_with_lod_test.recordio' + self.total_ins_num = 50 + self.batch_size = 10 + self.prepare_data() + self.prepare_lod_data() + + def main(self): + main_prog = fluid.Program() + startup_prog = fluid.Program() + with fluid.program_guard(main_prog, startup_prog): + data_reader = fluid.layers.io.open_files( + filenames=[self.data_file_name], + shapes=[[-1, 3, 4], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + if self.use_cuda: + data_reader = fluid.layers.double_buffer(data_reader) + image, label = fluid.layers.read_file(data_reader) + + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + build_strategy = fluid.BuildStrategy() + build_strategy.enable_data_balance = True + parallel_exe = fluid.ParallelExecutor( + use_cuda=self.use_cuda, + main_program=main_prog, + build_strategy=build_strategy) + + if (parallel_exe.device_count > self.batch_size): + print("WARNING: Unittest TestDataBalance skipped. 
\ + For the result is not correct when device count \ + is larger than batch size.") + exit(0) + fetch_list = [image.name, label.name] + + data_appeared = [False] * self.total_ins_num + while (True): + try: + image_val, label_val = parallel_exe.run(fetch_list, + return_numpy=True) + except fluid.core.EOFException: + break + ins_num = image_val.shape[0] + broadcasted_label = np.ones( + (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1)) + self.assertEqual(image_val.all(), broadcasted_label.all()) + for l in label_val: + self.assertFalse(data_appeared[l[0]]) + data_appeared[l[0]] = True + for i in data_appeared: + self.assertTrue(i) + + def main_lod(self): + main_prog = fluid.Program() + startup_prog = fluid.Program() + with fluid.program_guard(main_prog, startup_prog): + data_reader = fluid.layers.io.open_files( + filenames=[self.lod_data_file_name], + shapes=[[-1, 3], [-1, 1]], + lod_levels=[1, 0], + dtypes=['float32', 'int32'], + thread_num=1) + ins, label = fluid.layers.read_file(data_reader) + + place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + build_strategy = fluid.BuildStrategy() + build_strategy.enable_data_balance = True + parallel_exe = fluid.ParallelExecutor( + use_cuda=self.use_cuda, + main_program=main_prog, + build_strategy=build_strategy) + + if (parallel_exe.device_count > self.batch_size): + print("WARNING: Unittest TestDataBalance skipped. \ + For the result is not correct when device count \ + is larger than batch size.") + exit(0) + fetch_list = [ins.name, label.name] + + data_appeared = [False] * self.total_ins_num + while (True): + try: + ins_tensor, label_tensor = parallel_exe.run( + fetch_list, return_numpy=False) + except fluid.core.EOFException: + break + + ins_val = np.array(ins_tensor) + label_val = np.array(label_tensor) + ins_lod = ins_tensor.lod()[0] + self.assertEqual(ins_val.shape[1], 3) + self.assertEqual(label_val.shape[1], 1) + self.assertEqual(len(ins_lod) - 1, label_val.shape[0]) + for i in range(0, len(ins_lod) - 1): + ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:] + label_elem = label_val[i][0] + self.assertEqual(ins_elem.all(), label_elem.all()) + self.assertFalse(data_appeared[int(label_elem - 1)]) + data_appeared[int(label_elem - 1)] = True + + for i in data_appeared: + self.assertTrue(i) + + def test_all(self): + self.main() + self.main_lod() diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f545ad155ccd28c2d34e424d307eed49b37f20fb..05d3367ad8ec2bc3df794015a7c25e943a26c68c 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -74,13 +74,13 @@ class TestDetectionMAPOp(OpTest): self.evaluate_difficult = True self.ap_type = "integral" - self.label_lod = [[0, 2, 4]] + self.label_lod = [[2, 2]] # label difficult xmin ymin xmax ymax self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8], [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]] # label score xmin ymin xmax ymax difficult - self.detect_lod = [[0, 3, 7]] + self.detect_lod = [[3, 4]] self.detect = [ [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3], [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4], @@ -89,7 +89,7 @@ class TestDetectionMAPOp(OpTest): ] # label score true_pos false_pos - self.tf_pos_lod = [[0, 3, 7]] + self.tf_pos_lod = [[3, 4]] self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1], 
[1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]] @@ -112,15 +112,19 @@ class TestDetectionMAPOp(OpTest): for i, count in enumerate(class_pos_count): class_pos_count_dict[i] = count - for i in range(len(true_pos_lod[0]) - 1): - start = true_pos_lod[0][i] - end = true_pos_lod[0][i + 1] + cur_pos = 0 + for i in range(len(true_pos_lod[0])): + start = cur_pos + cur_pos += true_pos_lod[0][i] + end = cur_pos for j in range(start, end): true_pos_dict[i].append(true_pos[j]) - for i in range(len(false_pos_lod[0]) - 1): - start = false_pos_lod[0][i] - end = false_pos_lod[0][i + 1] + cur_pos = 0 + for i in range(len(false_pos_lod[0])): + start = cur_pos + cur_pos += false_pos_lod[0][i] + end = cur_pos for j in range(start, end): false_pos_dict[i].append(false_pos[j]) @@ -130,19 +134,19 @@ class TestDetectionMAPOp(OpTest): label_number = self.class_num out_class_pos_count = [] - out_true_pos_lod = [0] + out_true_pos_lod = [] out_true_pos = [] - out_false_pos_lod = [0] + out_false_pos_lod = [] out_false_pos = [] for i in range(label_number): out_class_pos_count.append([label_count[i]]) true_pos_list = true_pos[i] out_true_pos += true_pos_list - out_true_pos_lod.append(len(out_true_pos)) + out_true_pos_lod.append(len(true_pos_list)) false_pos_list = false_pos[i] out_false_pos += false_pos_list - out_false_pos_lod.append(len(out_false_pos)) + out_false_pos_lod.append(len(false_pos_list)) return out_class_pos_count, out_true_pos, [ out_true_pos_lod @@ -241,7 +245,7 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp): self.evaluate_difficult = False - self.tf_pos_lod = [[0, 2, 6]] + self.tf_pos_lod = [[2, 4]] # label score true_pos false_pos self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]] @@ -267,9 +271,9 @@ class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() self.class_pos_count = [0, 2, 1] - self.true_pos_lod = [[0, 0, 3, 5]] + self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] - self.false_pos_lod = [[0, 0, 3, 5]] + self.false_pos_lod = [[0, 3, 2]] self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2d57f7c5f127be87e963508e1dd150fdd30225 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -0,0 +1,210 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal + +SEED = 1 +DTYPE = "float32" +paddle.dataset.mnist.fetch() + + +# random seed must set before configuring the network. +# fluid.default_startup_program().random_seed = SEED +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + # TODO(dzhwinter) : refine the initializer and random seed settting + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + return predict + + +def get_model(batch_size): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + # Optimization + opt = fluid.optimizer.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + opt.minimize(avg_cost) + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def run_pserver(pserver_endpoints, trainers, current_endpoint): + get_model(batch_size=20) + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + exe.run(pserver_prog) + + +class TestDistMnist(unittest.TestCase): + def setUp(self): + self._trainers = 1 + self._pservers = 1 + self._ps_endpoints = "127.0.0.1:9123" + + def start_pserver(self, endpoint): + p = Process( + target=run_pserver, + args=(self._ps_endpoints, self._trainers, endpoint)) + p.start() + return p.pid + + def _wait_ps_ready(self, pid): + retry_times = 5 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(1) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def stop_pserver(self, pid): + os.kill(pid, signal.SIGTERM) + + def test_with_place(self): + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + pserver_pid = self.start_pserver(self._ps_endpoints) + self._wait_ps_ready(pserver_pid) + + self.run_trainer(p, 0) + + self.stop_pserver(pserver_pid) + + def run_trainer(self, place, trainer_id): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( + batch_size=20) + t = get_transpiler(trainer_id, + fluid.default_main_program(), self._ps_endpoints, + self._trainers) + + trainer_prog = t.get_trainer_program() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in xrange(10): + for batch_id, data in enumerate(train_reader()): + exe.run(trainer_prog, feed=feeder.feed(data)) + + if (batch_id + 1) % 10 == 0: + acc_set = [] + avg_loss_set = [] + for test_data in test_reader(): + acc_np, avg_loss_np = exe.run( + program=test_program, + feed=feeder.feed(test_data), + fetch_list=[batch_acc, avg_cost]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val = np.array(acc_set).mean() + avg_loss_val = np.array(avg_loss_set).mean() + if float(acc_val + ) > 0.8: # Smaller value to increase CI speed + return + else: + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_val), float(acc_val))) + if math.isnan(float(avg_loss_val)): + assert ("got Nan loss, training failed.") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py index 2314bb2ed8a4eeb34752fd5d040f8a8476798aa6..562e66b0625083fe840d64967249f0215cfda1f9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_train.py +++ b/python/paddle/fluid/tests/unittests/test_dist_train.py @@ -16,6 +16,7 @@ import os import time import unittest from multiprocessing import Process +import signal import numpy @@ -24,9 +25,6 @@ import paddle.fluid.layers as layers class TestSendOp(unittest.TestCase): - @unittest.skip( - "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest." - ) def test_send(self): # Run init_serv in a thread place = fluid.CPUPlace() @@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase): p.daemon = True p.start() - time.sleep(10) + self.ps_timeout = 5 + self._wait_ps_ready(p.pid) + with open("/tmp/paddle.%d.port" % p.pid, "r") as fn: selected_port = int(fn.readlines()[0]) self.init_client(place, selected_port) @@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase): self.assertTrue(numpy.allclose(self.local_out, self.dist_out)) # FIXME(typhoonzero): find a way to gracefully shutdown the server. - os.system("kill -9 %d" % p.pid) + os.kill(p.pid, signal.SIGKILL) p.join() + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + def init_serv(self, place): main = fluid.Program() @@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase): dtype="float32", persistable=False, shape=[32, 32]) - o = layers.Send("127.0.0.1:%d" % port, [x], [get_var]) + fluid.initializer.Constant(value=2.3)(get_var, main.global_block()) + layers.Send("127.0.0.1:%d" % port, [x]) + o = layers.Recv("127.0.0.1:%d" % port, [get_var]) + exe = fluid.Executor(place) self.dist_out = exe.run(main, fetch_list=o) # o is a list diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b4379ad447e01683325dfcbb6a5b322f0b8eac3d..75b4b4e50da04521021dcb1e97cfe495f2619433 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -15,51 +15,248 @@ import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops +import traceback -from transpiler_test import TranspilerTest - -class TestDistTranspiler(TranspilerTest): +class TranspilerTest(unittest.TestCase): def setUp(self): - self.current_pserver_ep = "127.0.0.1:6174" + self.trainer_id = 0 + self.trainers = 2 + self.pservers = 2 + # NOTE: we do not actually bind this port + self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" + self.pserver1_ep = "127.0.0.1:6174" + self.pserver2_ep = "127.0.0.1:6175" + self.slice_var_up = True + self.sync_mode = True + self.transpiler = None + + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + sgd_optimizer.minimize(avg_cost) + return + + def get_main_program(self): + main = fluid.Program() + with fluid.program_guard(main): + self.net_conf() + self.origin_prog = main.clone() + return main + + def get_trainer(self): + t = self._transpiler_instance() + return t.get_trainer_program() + + def get_pserver(self, ep): + t = self._transpiler_instance() + pserver = t.get_pserver_program(ep) + startup = t.get_startup_program(ep, pserver) + return pserver, startup + + def _transpiler_instance(self): + if not self.transpiler: + main = self.get_main_program() + self.transpiler = fluid.DistributeTranspiler() + self.transpiler.transpile( + self.trainer_id, + program=main, + pservers=self.pserver_eps, + trainers=self.trainers, + slice_var_up=self.slice_var_up, + sync_mode=self.sync_mode) + return self.transpiler + +class TestBasicModel(TranspilerTest): def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + pserver2, startup2 = self.get_pserver(self.pserver2_ep) + trainer = self.get_trainer() - pserver, startup = self.get_pserver(self.current_pserver_ep) - self.assertEqual([op.type for op in trainer.global_block().ops], - self.get_expect_trainer_ops()) + + self.assertEqual([op.type for op in trainer.global_block().ops], [ + 'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean', + 'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send', + 'send_barrier', 'recv', 
'recv', 'fetch_barrier', 'concat' + ]) self.assertEqual(len(pserver.blocks), 3) # block0: listen_and_serv self.assertEqual([op.type for op in pserver.blocks[0].ops], ["listen_and_serv"]) - # block2: optimize pass + # block1~2: optimize pass self.assertEqual([op.type for op in pserver.blocks[1].ops], ["sum", "scale", "sgd"]) - # confirm startup program - - self.assertEqual([op.type for op in startup.global_block().ops], [ - "fill_constant", "fill_constant", "uniform_random", "uniform_random" - ]) - + self.assertEqual([op.type for op in startup.global_block().ops], + ["fill_constant", "fill_constant", "uniform_random"]) # the variable #fc_w will be split into two blocks fc_w_var = startup.global_block().var("fc_w.block1") self.assertEqual(fc_w_var.shape, (500, 1000)) + # all parameters should be optimized on pserver + + pserver_params = [] + for prog in [pserver, pserver2]: + for blk in prog.blocks: + for op in blk.ops: + if "Param" in op.input_names: + param_name = op.input("Param")[0] + is_block_idx = param_name.find(".block") + if is_block_idx != -1: + origin_param_name = param_name[:is_block_idx] + else: + origin_param_name = param_name + pserver_params.append(origin_param_name) + trainer_params = [] + for op in self.origin_prog.global_block().ops: + if "Param" in op.input_names: + trainer_params.append(op.input("Param")[0]) + self.assertEqual(set(pserver_params), set(trainer_params)) + + +class TestNoSliceVar(TranspilerTest): + def setUp(self): + super(TestNoSliceVar, self).setUp() + self.slice_var_up = False + + def test_transpiler(self): + _, startup = self.get_pserver(self.pserver1_ep) + _, startup2 = self.get_pserver(self.pserver2_ep) + + if startup.global_block().vars.has_key("fc_w"): + fc_w_var = startup.global_block().vars["fc_w"] + elif startup2.global_block().vars.has_key("fc_w"): + fc_w_var = startup2.global_block().vars["fc_w"] + + self.assertEqual(fc_w_var.shape, (1000, 1000)) - def get_expect_trainer_ops(self): - trainer = fluid.Program() - with fluid.program_guard(trainer): - optimize_ops, params_grads = self.net_conf() +class TestLRDecay(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=1.0, + decay_steps=2100, + decay_rate=0.1, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + self.assertEqual(len(pserver.blocks), 4) + lr_decay_ops = [op.type for op in pserver.blocks[1].ops] + self.assertEqual(lr_decay_ops, [ + "increment", "cast", "fill_constant", "elementwise_div", "floor", + "fill_constant", "elementwise_pow", "fill_constant", + "elementwise_mul" + ]) + + +class TestLRDecayConditional(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = 
fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.piecewise_decay([10000, 20000], + [1.0, 0.5, 1.0])) + sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + serv_op = pserver.blocks[0].ops[0] + sub_blocks = [] + optimize_blocks = [] + for b in serv_op.attrs["optimize_blocks"]: + optimize_blocks.append(b.idx) + for b in pserver.blocks: + if b.idx not in optimize_blocks: + sub_blocks.append(b.idx) + + self.assertEqual(len(pserver.blocks), 7) + lr_decay_ops = [op.type for op in pserver.blocks[1].ops] + self.assertEqual(lr_decay_ops, [ + "increment", "cast", "fill_constant", "fill_constant", "less_than", + "logical_not", "conditional_block", "fill_constant", + "fill_constant", "less_than", "logical_not", "logical_and", + "logical_and", "conditional_block", "fill_constant", + "conditional_block" + ]) + # test the condition blocks + for b in sub_blocks: + if b == 0: + continue + block = pserver.blocks[b] + self.assertEqual([op.type for op in block.ops], ["assign"]) + + +class TestL2Decay(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc( + input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr( + name='fc_w', + regularizer=fluid.regularizer.L2Decay(), + gradient_clip=fluid.clip.GradientClipByValue(0.1)), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + sgd_optimizer.minimize(avg_cost) + return + + def test_transpiler(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer = self.get_trainer() + + self.assertEqual(len(pserver.blocks), 3) + self.assertEqual([op.type for op in pserver.blocks[1].ops], + ["sum", "scale", "clip", "sgd"]) + self.assertEqual( + [op.type for op in pserver.blocks[2].ops], + ["sum", "scale", "clip", "scale", "elementwise_add", "sgd"]) + # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer + - delete_ops(trainer.global_block(), optimize_ops) - ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send", "send_barrier", "recv", "recv", - "fetch_barrier", "concat" - ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send") - return ops + # FIXME(typhoonzero): need to add test for async case: + # see https://github.com/PaddlePaddle/Paddle/issues/11691 +class TestAsyncSGD(TranspilerTest): + pass if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..712fd5849d80b1915ae3b2ae5108bedee8d88a2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -0,0 +1,203 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal + +IS_SPARSE = True +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 +BATCH_SIZE = 32 +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy + + +def get_model(): + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(cost) + return avg_cost, predict_word + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + avg_cost, predict_word = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + return inference_program, avg_cost, train_reader, test_reader, predict_word + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +def run_pserver(pserver_endpoints, trainers, current_endpoint): + get_model() + t = get_transpiler(0, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + + exe.run(pserver_prog) + + +class TestDistMnist(unittest.TestCase): + def setUp(self): + self._trainers = 1 + self._pservers = 1 + self._ps_endpoints = "127.0.0.1:9123" + + def start_pserver(self, endpoint): + p = Process( + target=run_pserver, + args=(self._ps_endpoints, self._trainers, endpoint)) + p.start() + return p.pid + + def 
_wait_ps_ready(self, pid): + retry_times = 5 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(1) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def stop_pserver(self, pid): + os.kill(pid, signal.SIGKILL) + + def test_with_place(self): + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + pserver_pid = self.start_pserver(self._ps_endpoints) + self._wait_ps_ready(pserver_pid) + + self.run_trainer(p, 0) + + self.stop_pserver(pserver_pid) + + def run_trainer(self, place, trainer_id): + test_program, avg_cost, train_reader, test_reader, predict = get_model() + t = get_transpiler(trainer_id, + fluid.default_main_program(), self._ps_endpoints, + self._trainers) + + trainer_prog = t.get_trainer_program() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + use_gpu = True if core.is_compiled_with_cuda() else False + + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_gpu + train_exe = fluid.ParallelExecutor( + use_cuda=use_gpu, + main_program=trainer_prog, + loss_name=avg_cost.name, + exec_strategy=exec_strategy) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + for pass_id in xrange(10): + for batch_id, data in enumerate(train_reader()): + avg_loss_np = train_exe.run(feed=feeder.feed(data), + fetch_list=[avg_cost.name]) + loss = np.array(avg_loss_np).mean() + if float(loss) < 5.0: + return + if math.isnan(loss): + assert ("Got Nan loss, training failed") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 95af51f1b2f8cd9492baa9cb14fe31ffa586f2fc..0f289af284773caf8515f9cbdd38e0d4481e4e44 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -136,16 +136,16 @@ class BaseRNN(object): feed_dict = dict() for iname in self.inputs: - lod = [0] + lod = [] np_flatten = [] for seq_id in xrange(len(self.inputs[iname])): seq_len = len(self.inputs[iname][seq_id]) - lod.append(lod[-1] + seq_len) + lod.append(seq_len) np_flatten.extend(self.inputs[iname][seq_id]) t = fluid.Tensor() t.set(numpy.array(np_flatten), place) - t.set_lod([lod]) + t.set_recursive_sequence_lengths([lod]) feed_dict[iname] = t for pname in self.params: diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index d3f63ee2c414a71309be8f0af6d3e5912078ecdb..92e718662dfd7998be3ede2994f160059679fa8a 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -39,20 +39,20 @@ class TestDyRnnStaticInput(unittest.TestCase): def prepare_x_tensor(self): self.x_tensor_dim = 10 - lod = [[0, 2, 3, 6]] - shape = [lod[0][-1], self.x_tensor_dim] + lod = [[2, 1, 3]] + shape = [sum(lod[0]), self.x_tensor_dim] self.x_tensor_data = np.random.random(shape).astype('float32') self.x_tensor = core.LoDTensor() - self.x_tensor.set_lod(lod) + self.x_tensor.set_recursive_sequence_lengths(lod) self.x_tensor.set(self.x_tensor_data, self.place) 
def prepare_static_input_tensor(self): self.static_input_tensor_dim = 4 - lod = [[0, 1, 3, 6]] - shape = [lod[0][-1], self.static_input_tensor_dim] + lod = [[1, 2, 3]] + shape = [sum(lod[0]), self.static_input_tensor_dim] self.static_input_data = np.random.random(shape).astype('float32') self.static_input_tensor = core.LoDTensor() - self.static_input_tensor.set_lod(lod) + self.static_input_tensor.set_recursive_sequence_lengths(lod) self.static_input_tensor.set(self.static_input_data, self.place) def fetch_value(self, var): @@ -69,7 +69,7 @@ class TestDyRnnStaticInput(unittest.TestCase): ndarray = np.zeros(shape=dims).astype('float32') for i in xrange(np.product(dims)): ndarray.ravel()[i] = lod_tensor.get_float_element(i) - return ndarray, lod_tensor.lod() + return ndarray, lod_tensor.recursive_sequence_lengths() def build_graph(self, only_forward=False): x_tensor = fluid.layers.data( @@ -131,21 +131,20 @@ class TestDyRnnStaticInput(unittest.TestCase): framework.grad_var_name('static_input_tensor')) return static_input_grad, loss - def get_seq_len_from_lod(self, lod): - return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)] - def get_expected_static_step_outs(self): - x_lod = self.x_tensor.lod() - x_seq_len = self.get_seq_len_from_lod(x_lod) + x_lod = self.x_tensor.recursive_sequence_lengths() + x_seq_len = x_lod[0] x_seq_len_sorted = sorted(x_seq_len) x_sorted_indices = np.argsort(x_seq_len)[::-1] - static_lod = self.static_input_tensor.lod() - static_sliced = [ - self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]] - for i in xrange(len(static_lod[0]) - 1) - ] - static_seq_len = self.get_seq_len_from_lod(static_lod) + static_lod = self.static_input_tensor.recursive_sequence_lengths() + static_sliced = [] + cur_offset = 0 + for i in xrange(len(static_lod[0])): + static_sliced.append(self.static_input_data[cur_offset:( + cur_offset + static_lod[0][i])]) + cur_offset += static_lod[0][i] + static_seq_len = static_lod[0] static_reordered = [] for i in xrange(len(x_sorted_indices)): static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist()) @@ -159,11 +158,13 @@ class TestDyRnnStaticInput(unittest.TestCase): for i in xrange(self._max_sequence_len): end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1) - lod = [0] + lod = [] + total_len = 0 for i in xrange(end): - lod.append(static_seq_len_reordered[i] + lod[-1]) + lod.append(static_seq_len_reordered[i]) + total_len += lod[-1] static_step_lods.append([lod]) - end = lod[-1] + end = total_len static_step_outs.append( np.array(static_reordered[:end]).astype('float32')) @@ -199,7 +200,9 @@ class TestDyRnnStaticInput(unittest.TestCase): self.static_input_tensor.set_float_element(i, origin) numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2 self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001)) - self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod())) + self.assertTrue( + np.allclose(actual_lod, + self.static_input_tensor.recursive_sequence_lengths())) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py index 2957fb50586c8bce74bbf8066e0e9bf24d79cb7d..816562621b4fc749f3c6b0eca8ee3c5850ef1ba9 100644 --- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py +++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py @@ -52,23 +52,29 @@ class TestEditDistanceOp(OpTest): def setUp(self): self.op_type = "edit_distance" normalized = False - 
x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64") - x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64") + x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64") + x2 = np.array([[12, 4, 7, 8]]).astype("int64") x1 = np.transpose(x1) x2 = np.transpose(x2) - x1_lod = [0, 1, 5] - x2_lod = [0, 3, 4] + x1_lod = [1, 4] + x2_lod = [3, 1] - num_strs = len(x1_lod) - 1 + num_strs = len(x1_lod) distance = np.zeros((num_strs, 1)).astype("float32") sequence_num = np.array(2).astype("int64") + + x1_offset = 0 + x2_offset = 0 for i in range(0, num_strs): distance[i] = Levenshtein( - hyp=x1[x1_lod[i]:x1_lod[i + 1]], - ref=x2[x2_lod[i]:x2_lod[i + 1]]) + hyp=x1[x1_offset:(x1_offset + x1_lod[i])], + ref=x2[x2_offset:(x2_offset + x2_lod[i])]) + x1_offset += x1_lod[i] + x2_offset += x2_lod[i] if normalized is True: - len_ref = x2_lod[i + 1] - x2_lod[i] + len_ref = x2_lod[i] distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} self.outputs = {'Out': distance, 'SequenceNum': sequence_num} @@ -81,23 +87,29 @@ class TestEditDistanceOpNormalized(OpTest): def setUp(self): self.op_type = "edit_distance" normalized = True - x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64") - x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64") + x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64") + x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64") x1 = np.transpose(x1) x2 = np.transpose(x2) - x1_lod = [0, 1, 3, 6] - x2_lod = [0, 2, 3, 5] + x1_lod = [1, 2, 3] + x2_lod = [2, 1, 2] - num_strs = len(x1_lod) - 1 + num_strs = len(x1_lod) distance = np.zeros((num_strs, 1)).astype("float32") sequence_num = np.array(3).astype("int64") + + x1_offset = 0 + x2_offset = 0 for i in range(0, num_strs): distance[i] = Levenshtein( - hyp=x1[x1_lod[i]:x1_lod[i + 1]], - ref=x2[x2_lod[i]:x2_lod[i + 1]]) + hyp=x1[x1_offset:(x1_offset + x1_lod[i])], + ref=x2[x2_offset:(x2_offset + x2_lod[i])]) + x1_offset += x1_lod[i] + x2_offset += x2_lod[i] if normalized is True: - len_ref = x2_lod[i + 1] - x2_lod[i] + len_ref = x2_lod[i] distance[i] = distance[i] / len_ref + self.attrs = {'normalized': normalized} self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])} self.outputs = {'Out': distance, 'SequenceNum': sequence_num} diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..bcdbfc8e527d0dc9a95eddaf040f8035207b6c20 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py @@ -0,0 +1,130 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
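# The edit-distance tests above slice hypotheses and references by
# per-sequence lengths and, when 'normalized' is set, divide by the reference
# length, which is now simply x2_lod[i]. For context, one standard
# dynamic-programming form of the edit distance being checked (an
# illustrative sketch, not the Levenshtein helper defined in that test file):
import numpy as np

def levenshtein(hyp, ref):
    m, n = len(hyp), len(ref)
    dist = np.zeros((m + 1, n + 1), dtype='float32')
    dist[:, 0] = np.arange(m + 1)
    dist[0, :] = np.arange(n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
            dist[i, j] = min(dist[i - 1, j] + 1,         # deletion
                             dist[i, j - 1] + 1,         # insertion
                             dist[i - 1, j - 1] + cost)  # substitution
    return dist[m, n]

assert levenshtein([12, 4, 7, 8], [12, 3, 5, 8, 2]) == 3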
+import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from test_elementwise_add_op import * +''' +Some tests differ from the tests defined in test_elementwise_add_op.py +because MKLDNN does not support tensors of number of dimensions 3. +Such dimensions cause exceptions in MKLDNN reorder primitive. +''' + + +class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(2).astype(self.dtype) + self.out = self.x + self.y.reshape(2, 1, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 3, 1, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2): + def init_input_output(self): + self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype) + self.y = np.random.rand(4).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 1, 4) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_rowwise_add_0( + TestElementwiseAddOp_rowwise_add_0): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype) + self.y = np.random.rand(3, 4).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 3, 4, 1) + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_rowwise_add_1( + TestElementwiseAddOp_rowwise_add_1): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestMKLDNNElementwiseAddOp_channelwise_add( + TestElementwiseAddOp_channelwise_add): + def init_input_output(self): + self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype) + self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git 
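# The new MKLDNN test file above reuses all reference computations from
# test_elementwise_add_op.py and only changes kernel selection: the base test
# gains an init_kernel_type() hook (see the next hunk) and every MKLDNN
# variant overrides it so that use_mkldnn ends up in the op attributes.
# A stripped-down sketch of the pattern (class names are illustrative):
class BaseAddTest(object):
    def init_kernel_type(self):
        self.use_mkldnn = False

    def setup_attrs(self):
        self.init_kernel_type()
        self.attrs = {'axis': -1, 'use_mkldnn': self.use_mkldnn}

class MKLDNNAddTest(BaseAddTest):
    def init_kernel_type(self):
        self.use_mkldnn = True

t = MKLDNNAddTest()
t.setup_attrs()
assert t.attrs == {'axis': -1, 'use_mkldnn': True}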
a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index 96d47906a0606bba4b1d2207f7da85b058e42a2b..fb9a496126f0b6efcad73590c78efe5a47b88cd6 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -18,19 +18,23 @@ from op_test import OpTest class TestElementwiseAddOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + def setUp(self): self.op_type = "elementwise_add" self.dtype = np.float32 self.axis = -1 self.init_dtype() self.init_input_output() + self.init_kernel_type() self.init_axis() self.inputs = { 'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {'axis': self.axis} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': self.out} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 281068e945e76a42635868d19573498f79fde1f3..026ac2112b2d78644b3315b9cab8019ca27e9714 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -40,7 +40,6 @@ class TestFakeDequantizeMaxAbsOp(OpTest): self.op_type = "fake_dequantize_max_abs" x = np.random.randn(31, 65).astype("float32") yq, scale = quantize_max_abs(x, self.num_bits) - print 'scale ', scale ydq = dequantize_max_abs(yq, self.num_bits, scale) self.inputs = {'X': yq} diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py index 9d724a6479f061996359b1efcc5f61f0564331c7..8b9da843115409c65055927d317867d1290c8f0e 100644 --- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py +++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py @@ -24,17 +24,16 @@ class TestFeedFetch(unittest.TestCase): input_array = np.ones((4, 4, 6)).astype("float32") input_array[0, 0, 0] = 3 input_array[3, 3, 5] = 10 - input_tensor = core.LoDTensor([[0, 2, 4]]) + input_tensor = core.LoDTensor([[2, 2]]) input_tensor.set(input_array, place) core.set_feed_variable(scope, input_tensor, "feed", 0) output_tensor = core.get_fetch_variable(scope, "feed", 0) - output_lod = output_tensor.lod() - self.assertEqual(0, output_lod[0][0]) + output_lod = output_tensor.recursive_sequence_lengths() + self.assertEqual(2, output_lod[0][0]) self.assertEqual(2, output_lod[0][1]) - self.assertEqual(4, output_lod[0][2]) output_array = np.array(output_tensor) self.assertEqual(3, output_array[0, 0, 0]) diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py index 533d8ccfac82a2e298af16181ab16bf7aa3db282..0c75cf33f5f208d11081a6802910c25553b8c4ec 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py @@ -55,7 +55,7 @@ class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest): self.op_type = "fill_constant_batch_size_like" self.inputs = { 'Input': (np.random.random((31, 28)).astype("float32"), - [[0, 9, 23, 31]]) + [[9, 14, 8]]) } self.attrs = { 'value': 3.5, diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py 
b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae877a60818744f852d3af9a02ffebf5e2affc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_gaussian_random_op import TestGaussianRandomOp + + +class TestMKLDNN(TestGaussianRandomOp): + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 272caceaf38699438ccae41691bf26b2eb4d2a22..8481500fd78f0ccf34f09c66bec27e195b9aada3 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -25,7 +25,15 @@ class TestGaussianRandomOp(unittest.TestCase): def setUp(self): self.op_type = "gaussian_random" self.inputs = {} - self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10} + self.use_mkldnn = False + self.init_kernel_type() + self.attrs = { + "shape": [1000, 784], + "mean": .0, + "std": 1., + "seed": 10, + "use_mkldnn": self.use_mkldnn + } self.outputs = ["Out"] @@ -58,6 +66,9 @@ class TestGaussianRandomOp(unittest.TestCase): self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) + def init_kernel_type(self): + pass + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 3a13eb872a8646cede126b667864dfc3784ebd0b..8fbf1560859aa295fc40b36129d0f0d07d55dd9f 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -20,8 +20,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu class TestGRUOp(OpTest): - lod = [[0, 2, 6, 9]] - batch_size = lod[0][-1] + lod = [[2, 4, 3]] + batch_size = sum(lod[0]) frame_size = 5 activate = { 'identity': identity, @@ -33,10 +33,10 @@ class TestGRUOp(OpTest): @staticmethod def seq_to_batch(lod, is_reverse): idx_in_seq_list = [] - seq_starts = lod[0] - seq_lens = [] - for i in range(len(seq_starts) - 1): - seq_lens.append(seq_starts[i + 1] - seq_starts[i]) + seq_lens = lod[0] + seq_starts = [0] + for i in range(len(seq_lens)): + seq_starts.append(seq_starts[-1] + seq_lens[i]) sorted_seqs = sorted( range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x]) num_batch = seq_lens[sorted_seqs[0]] diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 587e2025e1045f63a5825f884d4dcad8b4685e62..15a72cb605911dfe957fb927763174521a30a085 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ 
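# The GRU test above now derives sequence start offsets from the length-based
# lod, and seq_to_batch sorts sequences by decreasing length before gathering,
# for each time step, the rows of every sequence that is still active.
# A minimal standalone sketch of that reordering:
lod = [[2, 4, 3]]
seq_lens = lod[0]
seq_starts = [0]
for length in seq_lens:
    seq_starts.append(seq_starts[-1] + length)
order = sorted(range(len(seq_lens)), key=lambda i: -seq_lens[i])
batches = []
for t in range(seq_lens[order[0]]):   # the longest sequence sets the step count
    batches.append([seq_starts[i] + t for i in order if seq_lens[i] > t])
assert batches == [[2, 6, 0], [3, 7, 1], [4, 8], [5]]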
b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -364,5 +364,22 @@ class TestMSRAInitializer(unittest.TestCase): self.assertEqual(init_op.attr('seed'), 134) +class TestMSRAInitializer(unittest.TestCase): + def test_bilinear_initializer(self): + """Test the bilinear initializer with supplied arguments + """ + program = framework.Program() + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[8, 1, 3, 3], + lod_level=0, + name="param", + initializer=initializer.BilinearInitializer()) + self.assertEqual(len(block.ops), 1) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'assign_value') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py index 8f62ac20a5c13257a1519128292e2abc4962bf84..eff4212d91e609a7ef531280bbd3cf3671a59830 100644 --- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py +++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py @@ -58,8 +58,8 @@ class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp): def setUp(self): super(TestIOUSimilarityOpWithLoD, self).setUp() - self.boxes1_lod = [[0, 1, 2]] - self.output_lod = [[0, 1, 2]] + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} self.outputs = {'Out': (self.output, self.output_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f5b305a025a9b678c64a307255afd5303a65563f..becf73e3b577af6a275fa1a755ac4cf043a26a00 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -411,6 +411,33 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_crop(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 5], dtype="float32") + y = layers.data(name='y', shape=[2, 3], dtype="float32") + output = layers.crop(x, shape=y) + self.assertIsNotNone(output) + print(str(program)) + + def test_mean_iou(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[16], dtype='float32') + y = layers.data(name='label', shape=[1], dtype='int64') + iou = layers.mean_iou(x, y, 2) + self.assertIsNotNone(iou) + print(str(program)) + + def test_argsort(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[2, 3, 3], dtype="float32") + out, ids = layers.argsort(input=data, axis=1) + self.assertIsNotNone(out) + self.assertIsNotNone(ids) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py index f49f7635f76c9feb5b5593438cb445df9488c69b..696d0ab4fa81a409a2bf0d6f6f23779ec26eb6d2 100644 --- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py @@ -105,11 +105,13 @@ class TestLinearChainCrfOp(OpTest): MAX_SEQ_LEN = 5 # the linear_chain_crf operator only supports sequence (LoD level = 1) - lod = [[0]] + lod = [[]] + seq_start_pos = [0] for i in range(SEQ_NUM): - lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN)) - emission = np.random.uniform(-1, 1, - [lod[-1][-1], TAG_NUM]).astype("float64") + lod[-1].append(random.randint(1, MAX_SEQ_LEN)) + 
seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1]) + emission = np.random.uniform( + -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64") emission_row_max = np.amax(emission, axis=1, keepdims=True) emission_exps = np.exp(emission - emission_row_max) @@ -118,14 +120,14 @@ class TestLinearChainCrfOp(OpTest): transition_exps = np.exp(transition) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") + low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64") self.inputs = { "Emission": (emission, lod), "Transition": transition, "Label": (labels, lod) } - crf = LinearChainCrfForward(lod[0], emission, emission_row_max, + crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max, emission_exps, transition, transition_exps, labels) alpha, log_likelihood = crf.crf_forward_compute() diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index d1d709551c77908db88be6fda7ac74d4e922138e..1cdc69501043d120b9e3cc8ccda3a1212d205886 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest): def setUp(self): self.ps_timeout = 5 self.ip = "127.0.0.1" - self.port = "6173" + self.port = "0" self.trainers = 1 - self.trainer_id = 1 + self.trainer_id = 0 def _start_pserver(self, use_cuda, sync_mode): p = Process( target=run_pserver, args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, self.trainer_id)) + p.daemon = True p.start() - return p.pid + return p def _wait_ps_ready(self, pid): start_left_time = self.ps_timeout @@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest): def test_handle_signal_in_serv_op(self): # run pserver on CPU in sync mode - pid = self._start_pserver(False, True) - self._wait_ps_ready(pid) + p1 = self._start_pserver(False, True) + self._wait_ps_ready(p1.pid) # raise SIGTERM to pserver - os.kill(pid, signal.SIGTERM) + os.kill(p1.pid, signal.SIGINT) + p1.join() # run pserver on CPU in async mode - pid = self._start_pserver(False, False) - self._wait_ps_ready(pid) + p2 = self._start_pserver(False, False) + self._wait_ps_ready(p2.pid) # raise SIGTERM to pserver - os.kill(pid, signal.SIGTERM) + os.kill(p2.pid, signal.SIGTERM) + p2.join() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py index 093eecb8370b8ae7e4c43ce7ca6f50f5d302bd60..bac5e502318397b43e9867d5fc9e4e8cd33394b8 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py +++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py @@ -30,7 +30,8 @@ class TestLoDRankTable(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.random.random(size=(17, 100)), cpu) - tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) + tensor.set_recursive_sequence_lengths( + [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]]) exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py index 6b6d4c824aeae319dacf224408ce96a0d9c5bb35..77905c4b96499c855fd5c5e704b8051ccdb7a323 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py +++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py @@ -21,11 +21,15 @@ class 
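# The listen_and_serv test above now keeps the multiprocessing.Process object
# rather than a bare pid, marks it as a daemon so it cannot outlive the test,
# signals it, and joins it. A minimal sketch of that lifecycle (_serve is an
# illustrative stand-in for run_pserver; Unix only):
import os
import signal
from multiprocessing import Process

def _serve():
    signal.pause()                 # block until a signal arrives

p = Process(target=_serve)
p.daemon = True                    # never outlive the parent test process
p.start()
os.kill(p.pid, signal.SIGTERM)     # ask the server process to shut down
p.join()                           # reap the child so the test exits cleanly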
TestLodResetOpByAttr(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] - target_lod_0 = [0, 7, 10] + lod = [[3, 2, 5]] + # target_offset_lod and target_lod are the same lod info represented + # in offset-based format and length-based format, respectively. + target_offset_lod = [0, 7, 10] + target_lod = [7, 3] self.inputs = {'X': (x, lod)} - self.attrs = {'target_lod': target_lod_0} - self.outputs = {'Out': (x, [target_lod_0])} + # The `target_lod` attribute is still based on offset + self.attrs = {'target_lod': target_offset_lod} + self.outputs = {'Out': (x, [target_lod])} def test_check_output(self): self.check_output() @@ -38,13 +42,16 @@ class TestLodResetOpByInput(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] - target_lod_0 = [0, 4, 7, 10] + lod = [[3, 2, 5]] + # target_offset_lod and target_lod are the same lod info represented + # in offset-based format and length-based format, respectively. + target_offset_lod = [0, 4, 7, 10] + target_lod = [4, 3, 3] self.inputs = { 'X': (x, lod), - 'Y': np.array([target_lod_0]).astype('int32') + 'Y': np.array([target_offset_lod]).astype('int32') } - self.outputs = {'Out': (x, [target_lod_0])} + self.outputs = {'Out': (x, [target_lod])} def test_check_output(self): self.check_output() @@ -57,15 +64,16 @@ class TestLodResetOpBoth(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] - target_lod_0_attr = [0, 7, 10] - target_lod_0_in = [0, 4, 7, 10] + lod = [[3, 2, 5]] + target_offset_lod_attr = [0, 7, 10] + target_offset_lod_in = [0, 4, 7, 10] + target_lod_in = [4, 3, 3] self.inputs = { 'X': (x, lod), - 'Y': np.array(target_lod_0_in).astype('int32') + 'Y': np.array(target_offset_lod_in).astype('int32') } - self.attrs = {'target_lod': target_lod_0_attr} - self.outputs = {'Out': (x, [target_lod_0_in])} + self.attrs = {'target_lod': target_offset_lod_attr} + self.outputs = {'Out': (x, [target_lod_in])} def test_check_output(self): self.check_output() @@ -78,11 +86,11 @@ class TestLodResetOpYIsLoDTensor(OpTest): def setUp(self): self.op_type = "lod_reset" x = np.random.random((10, 20)).astype("float32") - lod = [[0, 3, 5, 10]] + lod = [[3, 2, 5]] y = np.random.random((10, 10)).astype("float32") - target_lod_0 = [[0, 4, 7, 10]] - self.inputs = {'X': (x, lod), 'Y': (y, target_lod_0)} - self.outputs = {'Out': (x, target_lod_0)} + target_lod = [[4, 3, 3]] + self.inputs = {'X': (x, lod), 'Y': (y, target_lod)} + self.outputs = {'Out': (x, target_lod)} def test_check_output(self): self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py index 63b17a5ccd62ed79b3d611e039c2b2705a133272..118c22fbb1ff6be5859ae9e4aed6218b0c77deec 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py @@ -27,7 +27,7 @@ class TestLoDTensorArray(unittest.TestCase): for i in xrange(10): t = core.LoDTensor() t.set(numpy.array([i], dtype='float32'), cpu) - t.set_lod([[0, 1]]) + t.set_recursive_sequence_lengths([[1]]) tensor_array.append(t) self.assertEqual(10, len(tensor_array)) @@ -35,17 +35,17 @@ class TestLoDTensorArray(unittest.TestCase): for i in xrange(10): t = tensor_array[i] self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32')) - self.assertEqual([[0, 1]], 
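# As the comments in the lod_reset tests above note, the 'target_lod'
# attribute is still offset-based while input and output LoD are now
# length-based; converting one into the other is a one-liner
# (values taken from the test above):
target_offset_lod = [0, 4, 7, 10]
target_lod = [target_offset_lod[i + 1] - target_offset_lod[i]
              for i in range(len(target_offset_lod) - 1)]
assert target_lod == [4, 3, 3]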
t.lod()) + self.assertEqual([[1]], t.recursive_sequence_lengths()) t = core.LoDTensor() t.set(numpy.array([i + 10], dtype='float32'), cpu) - t.set_lod([[0, 2]]) + t.set_recursive_sequence_lengths([[1]]) tensor_array[i] = t t = tensor_array[i] self.assertEqual( numpy.array(t), numpy.array( [i + 10], dtype='float32')) - self.assertEqual([[0, 2]], t.lod()) + self.assertEqual([[1]], t.recursive_sequence_lengths()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index 66a03640c148d769787593f41a44cd4d1aaa10b1..cebe6997bb4152519dabbabfc0404d6036bc4e65 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -29,7 +29,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) self.main( @@ -42,7 +42,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(10).reshape(10, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 9, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) self.main( @@ -55,7 +55,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(20).reshape(20, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]]) + tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]]) expect = [ numpy.array( @@ -65,7 +65,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): [17, 18, 19], dtype='int32') ] - lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]] + lod = [[[2, 3]], [[6, 6]], [[3]]] self.main( tensor=tensor, expect_array=expect, @@ -77,8 +77,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set( numpy.arange(31).reshape(31, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 5, 9, 11], - [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]]) + tensor.set_recursive_sequence_lengths( + [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]]) expect = [ numpy.array( @@ -88,7 +88,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]] ] - lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]] + lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]] self.main( tensor=tensor, expect_array=expect, @@ -99,8 +99,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(50).reshape(50, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13], - [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]]) + tensor.set_recursive_sequence_lengths( + [[2, 3, 1], [2, 3, 1, 4, 2, 1], + [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]]) expect = [ numpy.array( @@ -108,8 +109,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range( 22, 39) + range(7, 21), range(39, 46)] ] - lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]], - [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]] + lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]], + 
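# In the multi-level cases above, each entry of an upper LoD level counts
# sub-sequences in the level below it, and the last level counts rows of the
# tensor. A small consistency check makes that invariant explicit (values are
# taken from the test above; the helper name is illustrative):
def check_length_lod(lengths_lod, num_rows):
    for upper, lower in zip(lengths_lod, lengths_lod[1:]):
        assert sum(upper) == len(lower)      # entries expand into the next level
    assert sum(lengths_lod[-1]) == num_rows  # the last level covers every row

check_length_lod([[2, 3], [3, 6, 2, 6, 3]], 20)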
[[2], [6, 1]]] self.main( tensor=tensor, expect_array=expect, @@ -120,8 +121,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor = core.LoDTensor() tensor.set( numpy.arange(50).reshape(50, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13], - [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]]) + tensor.set_recursive_sequence_lengths( + [[2, 3, 1], [2, 3, 1, 4, 2, 1], + [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]]) self.main( tensor=tensor, expect_array=None, @@ -162,12 +164,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): exp_tensor, exp_lod = exp exp_tensor = numpy.expand_dims(exp_tensor, axis=1) self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i]))) - self.assertEqual(exp_lod, array[i].lod()) + self.assertEqual(exp_lod, array[i].recursive_sequence_lengths()) def check_tensor_same(self, actual, expect): self.assertTrue( numpy.allclose(numpy.array(actual), numpy.array(expect))) - self.assertEqual(actual.lod(), expect.lod()) + self.assertEqual(actual.recursive_sequence_lengths(), + expect.recursive_sequence_lengths()) class TestCPULoDTensorArrayOpGrad(unittest.TestCase): @@ -188,7 +191,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) g_vars = program.global_block().var(x.name + "@GRAD") diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py index e726f99d49877a1bc464090092ec80b97ab15d0c..705a24bd8f39a55e0a352944d961f8d33aaf96ff 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py @@ -84,15 +84,17 @@ def lstm( h = g_o * act_cell(c) return h, c - def _reverse(x, lod): + def _reverse(x, offset): y = np.zeros_like(x) - for i in range(len(lod) - 1): - b, e = lod[i], lod[i + 1] + for i in range(len(offset) - 1): + b, e = offset[i], offset[i + 1] y[b:e, :] = np.flip(x[b:e, :], 0) return y - offset = lod[0] - batch_size = len(offset) - 1 + offset = [0] + for l in lod[0]: + offset.append(offset[-1] + l) + batch_size = len(lod[0]) hidden = [] cell = [] input = _reverse(input, offset) if is_reverse else input @@ -100,7 +102,7 @@ def lstm( input = input + np.tile(w_b, (offset[-1], 1)) for i in range(batch_size): # compute one sequence - seq_len = offset[i + 1] - offset[i] + seq_len = lod[0][i] x = input[offset[i]:offset[i + 1], :] h_pre = h0[i] # 1 x D c_pre = c0[i] # 1 x D @@ -124,7 +126,7 @@ def lstm( class TestLstmOp(OpTest): def set_argument(self): - self.lod = [[0, 2, 5, 7]] + self.lod = [[2, 3, 2]] self.D = 16 self.act_gate = 'sigmoid' @@ -139,8 +141,8 @@ class TestLstmOp(OpTest): self.set_argument() self.op_type = 'lstm' - T = self.lod[0][-1] - N = len(self.lod[0]) - 1 + T = sum(self.lod[0]) + N = len(self.lod[0]) x = np.random.normal(size=(T, 4 * self.D)).astype('float64') if self.has_initial_state: @@ -186,7 +188,7 @@ class TestLstmOp(OpTest): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. 
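# The lstm (and, below, lstmp) reference implementations now rebuild offsets
# from the length-based lod before slicing, and _reverse flips each sequence's
# rows within its own span. A standalone sketch of that per-sequence reversal:
import numpy as np

def reverse_by_lengths(x, lengths):
    y = np.copy(x)
    start = 0
    for length in lengths:
        y[start:start + length] = np.flip(x[start:start + length], 0)
        start += length
    return y

x = np.arange(7).reshape(7, 1)
assert reverse_by_lengths(x, [2, 3, 2]).ravel().tolist() == [1, 0, 4, 3, 2, 6, 5]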
- N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchCellPreAct'] = np.zeros( (N, self.D)).astype('float64') @@ -196,7 +198,7 @@ class TestLstmOp(OpTest): # class TestLstmOpHasInitial(TestLstmOp): # def set_argument(self): -# self.lod = [[0, 2, 5, 7]] +# self.lod = [[2, 3, 2]] # self.D = 16 # self.act_gate = 'sigmoid' @@ -209,7 +211,7 @@ class TestLstmOp(OpTest): # def test_check_grad(self): # # TODO(qingqing) remove folowing lines after the check_grad is refined. -# N = len(self.lod[0]) - 1 +# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( # (N, self.D)).astype('float64') @@ -218,7 +220,7 @@ class TestLstmOp(OpTest): # max_relative_error=5e-4) # def test_check_grad_ingore_bias(self): -# N = len(self.lod[0]) - 1 +# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( # (N, self.D)).astype('float64') @@ -228,7 +230,7 @@ class TestLstmOp(OpTest): # no_grad_set=set('Bias')) # def test_check_grad_ingore_weight(self): -# N = len(self.lod[0]) - 1 +# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( # (N, self.D)).astype('float64') @@ -238,7 +240,7 @@ class TestLstmOp(OpTest): # no_grad_set=set('Weight')) # def test_check_grad_ingore_input(self): -# N = len(self.lod[0]) - 1 +# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( # (N, self.D)).astype('float64') @@ -248,7 +250,7 @@ class TestLstmOp(OpTest): # no_grad_set=set('Input')) # def test_check_grad_ingore_h0(self): -# N = len(self.lod[0]) - 1 +# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( # (N, self.D)).astype('float64') @@ -258,7 +260,7 @@ class TestLstmOp(OpTest): # no_grad_set=set('H0')) # def test_check_grad_ingore_c0(self): -# N = len(self.lod[0]) - 1 +# N = len(self.lod[0]) # self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') # self.outputs['BatchCellPreAct'] = np.zeros( # (N, self.D)).astype('float64') @@ -269,7 +271,7 @@ class TestLstmOp(OpTest): # class TestLstmOpRerverse(TestLstmOp): # def set_argument(self): -# self.lod = [[0, 2, 5, 7]] +# self.lod = [[2, 3, 2]] # self.D = 16 # self.act_gate = 'sigmoid' @@ -282,7 +284,7 @@ class TestLstmOp(OpTest): # class TestLstmOpNotUsePeepholes(TestLstmOp): # def set_argument(self): -# self.lod = [[0, 2, 5, 7]] +# self.lod = [[2, 3, 2]] # self.D = 16 # self.act_gate = 'sigmoid' diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py index afff133f6c6cfe45d1aca4014dc8b92e6562e6b8..ed2262da4bc727657c2e65d69cb1922891e17b09 100644 --- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py @@ -64,15 +64,17 @@ def lstmp( r = act_proj(r) return r, c - def _reverse(x, lod): + def _reverse(x, offset): y = np.zeros_like(x) - for i in range(len(lod) - 1): - b, e = lod[i], lod[i + 1] + for i in range(len(offset) - 1): + b, e = offset[i], offset[i + 1] y[b:e, :] = np.flip(x[b:e, :], 0) return y - offset = lod[0] - batch_size = len(offset) - 1 + offset = [0] + for l in lod[0]: + offset.append(offset[-1] + l) + batch_size = 
len(lod[0]) # recurrent projection state projection = [] cell = [] @@ -81,7 +83,7 @@ def lstmp( input = input + np.tile(w_b, (offset[-1], 1)) for i in range(batch_size): # compute one sequence - seq_len = offset[i + 1] - offset[i] + seq_len = lod[0][i] x = input[offset[i]:offset[i + 1], :] r_pre = np.dot(h0[i], w_rh) # 1 x P r_pre = act_proj(r_pre) @@ -117,8 +119,8 @@ class TestLstmpOp(LstmTest.TestLstmOp): self.reset_argument() self.op_type = 'lstmp' - T = self.lod[0][-1] - N = len(self.lod[0]) - 1 + T = sum(self.lod[0]) + N = len(self.lod[0]) x = np.random.normal(size=(T, 4 * self.D)).astype('float64') if self.has_initial_state: @@ -166,7 +168,7 @@ class TestLstmpOp(LstmTest.TestLstmOp): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -183,7 +185,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): def test_check_grad(self): # TODO(qingqing) remove folowing lines after the check_grad is refined. - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -195,7 +197,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): max_relative_error=1e-2) def test_check_grad_ingore_bias(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -207,7 +209,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('Bias')) def test_check_grad_ingore_weight(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -219,7 +221,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('Weight')) def test_check_grad_ingore_proj_weight(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -231,7 +233,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('ProjWeight')) def test_check_grad_ingore_input(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -243,7 +245,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('Input')) def test_check_grad_ingore_h0(self): - N = len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') @@ -255,7 +257,7 @@ class TestLstmpOpHasInitial(TestLstmpOp): no_grad_set=set('H0')) def test_check_grad_ingore_c0(self): - N = 
len(self.lod[0]) - 1 + N = len(self.lod[0]) self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py index c27573c3d69037bc48e0b6a90636b3f027f15a41..54ee85c1a7a539fe9517f32adb35ab99b5ae2a07 100644 --- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py +++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py @@ -70,7 +70,7 @@ class TestMineHardExamplesOp(OpTest): self.updated_match_indices = self.match_indices - self.neg_indices_lod = [[0, 1, 2]] + self.neg_indices_lod = [[1, 1]] self.neg_indices = np.array([[1], [0]]).astype('int32') @@ -92,7 +92,7 @@ class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp): self.updated_match_indices = np.array([[0, -1, -1], [-1, -1, -1]]).astype('int32') - self.neg_indices_lod = [[0, 1, 3]] + self.neg_indices_lod = [[1, 2]] self.neg_indices = np.array([[2], [0], [2]]).astype('int32') diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py index 3f940203b9393d266d75b50c9cbf62e89c36cbdf..dbd510e64ffdd6f3b78b22bb0d37d9a7ba3fd9b5 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py @@ -64,8 +64,7 @@ class TestMultipleReader(unittest.TestCase): while True: try: img_val, = exe.run(fetch_list=[img]) - except fluid.core.EnforceNotMet as ex: - self.assertIn("There is no next data.", ex.message) + except fluid.core.EOFException: break batch_count += 1 self.assertLessEqual(img_val.shape[0], self.batch_size) diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py index 52e7cc1ffbba40a63ce3cec645c7c0a7a499c1bf..7fc9f550440d3d0e1a8182a69f5692b3df0aa258 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py @@ -59,8 +59,7 @@ class TestMultipleReader(unittest.TestCase): while True: try: img_val, = exe.run(fetch_list=[img]) - except fluid.core.EnforceNotMet as ex: - self.assertIn("There is no next data.", ex.message) + except fluid.core.EOFException: break batch_count += 1 self.assertLessEqual(img_val.shape[0], self.batch_size) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index 6459913c0162374e17d0249627e7107a195babf8..aacd8ae45af10a2b19d2903ab121e9bb4f9de7ff 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -135,12 +135,12 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, batch_size = scores.shape[0] det_outs = [] - lod = [0] + lod = [] for n in range(batch_size): nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background, score_threshold, nms_threshold, nms_top_k, keep_top_k) - lod.append(lod[-1] + nmsed_num) + lod.append(nmsed_num) if nmsed_num == 0: continue for c, indices in nmsed_outs.iteritems(): diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py index 
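# The reader tests above now stop on the dedicated fluid.core.EOFException
# instead of string-matching an EnforceNotMet message. A typical drain loop,
# written as an illustrative helper (not code from the patch):
import paddle.fluid as fluid

def drain_reader(exe, fetch_var):
    """Run until the underlying file reader is exhausted."""
    batches = 0
    while True:
        try:
            exe.run(fetch_list=[fetch_var])
        except fluid.core.EOFException:   # raised when there is no next data
            break
        batches += 1
    return batches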
cd78cce8729ab2b5a0bb4817cf3022e53932283a..d13f2b3afde10f9b4e632094fa216d8729069afa 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py @@ -27,9 +27,9 @@ class TestOneHotOp(OpTest): self.op_type = 'one_hot' depth = 10 dimension = 12 - x_lod = [[0, 4, 5, 8, 11]] - x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])] - x = np.array(x).astype('int').reshape([x_lod[0][-1], 1]) + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') @@ -50,9 +50,9 @@ class TestOneHotOp_default_dtype(OpTest): self.op_type = 'one_hot' depth = 10 dimension = 12 - x_lod = [[0, 4, 5, 8, 11]] - x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])] - x = np.array(x).astype('int').reshape([x_lod[0][-1], 1]) + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))] + x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1]) out = np.zeros(shape=(np.product(x.shape[:-1]), depth)).astype('float32') @@ -75,11 +75,11 @@ class TestOneHotOp_exception(OpTest): self.place = core.CPUPlace() self.dimension = 12 self.x = core.LoDTensor() - x_lod = [[0, 4, 5, 8, 11]] - data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])] - data = np.array(data).astype('int').reshape([x_lod[0][-1], 1]) + x_lod = [[4, 1, 3, 3]] + data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))] + data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) self.x.set(data, self.place) - self.x.set_lod(x_lod) + self.x.set_recursive_sequence_lengths(x_lod) def test_check_output(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index e775db1d10f4561b6fb90631757a25c9f74cb777..7286c7c450108c4b5ad7136041bc4e989894a2ba 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) +class TestFtrlOptimizer(unittest.TestCase): + class MockFtrl(optimizer.FtrlOptimizer): + def get_accumulators(self): + return self._accumulators + + def get_squared_str(self): + return self._squared_acc_str + + def get_linear_str(self): + return self._linear_acc_str + + def test_ftrl_optimizer(self): + init_program = framework.Program() + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="mul.x", + optimize_attr={'learning_rate': 1.1}) + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + mean_out = block.create_var( + dtype="float32", shape=[1], lod_level=0, name="mean.out") + block.append_op( + type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) + learning_rate = 0.01 + ftrl_optimizer = self.MockFtrl( + learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5) + params_grads = append_backward(mean_out) + self.assertEqual(len(params_grads), 1) + 
self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) + opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out, + init_program) + self.assertEqual(len(opts), 3) + self.assertEqual([op.type for op in opts], + ["fill_constant", "elementwise_mul", "ftrl"]) + + # Check accumulators + accumulators = ftrl_optimizer.get_accumulators() + self.assertEqual(len(accumulators), 2) + self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators) + self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators) + squared_acc = accumulators[ftrl_optimizer.get_squared_str()] + linear_acc = accumulators[ftrl_optimizer.get_linear_str()] + self.assertEqual(len(squared_acc), 1) + self.assertEqual(len(linear_acc), 1) + self.assertTrue(mul_x.name in squared_acc) + self.assertTrue(mul_x.name in linear_acc) + + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 3) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 163975555ec2cea5c169cc1da3c4324d91ba3616..63fb58c6927fa387b3b19147b9dc9d24bb8e5132 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -169,10 +169,10 @@ class TestCRFModel(unittest.TestCase): data = train_data() for i in xrange(10): cur_batch = next(data) - print map(np.array, - pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name]))[0] + print pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])[0] + @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_all_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce @@ -181,6 +181,7 @@ class TestCRFModel(unittest.TestCase): self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=False) + @unittest.skip(reason="CI hangs") def test_update_dense_parameter_all_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce @@ -189,6 +190,7 @@ class TestCRFModel(unittest.TestCase): self.check_network_convergence( is_sparse=False, build_strategy=build_strategy, use_cuda=False) + @unittest.skip(reason="CI hangs") def test_update_sparse_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce @@ -197,6 +199,7 @@ class TestCRFModel(unittest.TestCase): self.check_network_convergence( is_sparse=True, build_strategy=build_strategy, use_cuda=False) + @unittest.skip(reason="CI hangs") def test_update_dense_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 79702475cca86ca22107d4b1824fda277dd83157..1f5d2f16773efb7537de85abec88344f8e0daa9f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -75,7 +75,9 @@ class TestFetchOp(unittest.TestCase): fetch_list.append(k) for data in 
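# The FtrlOptimizer test above checks that one "squared" and one "linear"
# accumulator are created per parameter. For context, a sketch of the textbook
# FTRL-Proximal update those accumulators correspond to (lr_power = -0.5 case,
# beta term omitted); an illustration only, not the ftrl operator's exact kernel:
import numpy as np

def ftrl_step(w, g, squared, linear, lr=0.01, l1=0.0, l2=0.0):
    new_squared = squared + g * g                       # "squared" accumulator
    sigma = (np.sqrt(new_squared) - np.sqrt(squared)) / lr
    new_linear = linear + g - sigma * w                 # "linear" accumulator
    new_w = np.where(
        np.abs(new_linear) <= l1, 0.0,
        -(new_linear - np.sign(new_linear) * l1) /
        (np.sqrt(new_squared) / lr + l2))
    return new_w, new_squared, new_linear

w = np.zeros(3)
g = np.array([0.1, -0.2, 0.3])
w, squared, linear = ftrl_step(w, g, np.zeros(3), np.zeros(3))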
train_inputs: - ret = pe.run(fetch_list, feed=feeder.feed(data)) + ret = pe.run(fetch_list, + feed=feeder.feed(data), + return_numpy=True) for i in range(len(fetch_list)): assert not math.isnan(np.sum(ret[i])) and \ not math.isinf(np.sum(ret[i])) @@ -128,7 +130,7 @@ class TestFeedParallel(unittest.TestCase): use_cuda=use_cuda, loss_name=loss.name, main_program=main) for batch_id, data in enumerate(reader()): - loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + loss_np = pe.run(feed=data, fetch_list=[loss.name])[0] print batch_id, loss_np if batch_id == 2: break diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 31ba8c1d6096c9c89e0695c8eca8e16a5e303a61..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -16,6 +16,8 @@ import paddle.fluid as fluid import numpy as np import unittest import os +import sys +import math def simple_fc_net(): @@ -70,10 +72,17 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): for i in xrange(5): test_loss, = test_exe.run([loss.name], feed=feed_dict) - test_loss = np.array(test_loss) train_loss, = train_exe.run([loss.name], feed=feed_dict) - train_loss = np.array(train_loss) + + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + self.assertTrue( np.allclose( train_loss, test_loss, atol=1e-8), diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index 79bea148f9398152a02d70946cdc5fff1f47ba6b..9ba5f988f317a515b77c0b428da236626419a2c3 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -113,7 +113,9 @@ class BaseParallelForTest(unittest.TestCase): generator = callback() # Automatically insert parallel do if use_parallel = True if use_parallel: - places = fluid.layers.get_places() + thread_num = fluid.core.get_cuda_device_count( + ) if use_gpu else 8 + places = fluid.layers.get_places(thread_num) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) data = next(generator) diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py index c75080fbb96d472810e5d6a1d02a77c456006f66..e01af42a58b86042fd0282928d1a78d9c3239fe3 100644 --- a/python/paddle/fluid/tests/unittests/test_print_op.py +++ b/python/paddle/fluid/tests/unittests/test_print_op.py @@ -28,7 +28,7 @@ class TestPrintOpCPU(unittest.TestCase): self.x_tensor = core.LoDTensor() tensor_np = np.random.random(size=(2, 3)).astype('float32') self.x_tensor.set(tensor_np, self.place) - self.x_tensor.set_lod([[0, 1, 1]]) + self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) def build_network(self, only_forward, **kargs): x = layers.data('x', shape=[3], dtype='float32', lod_level=1) @@ -62,7 +62,7 @@ class TestPrintOpGPU(TestPrintOpCPU): self.x_tensor = core.LoDTensor() tensor_np = np.random.random(size=(2, 3)).astype('float32') self.x_tensor.set(tensor_np, self.place) - self.x_tensor.set_lod([[0, 1, 1]]) + self.x_tensor.set_recursive_sequence_lengths([[1, 1]]) 
if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index f32050014d7ace5aee4aca75a47bfc6a75ff91c2..69a522e273db017ac55b408276b4a28f5f907c42 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -68,8 +68,7 @@ class TestRecordIO(unittest.TestCase): while True: try: tmp, = exe.run(fetch_list=[avg_loss]) - except fluid.core.EnforceNotMet as ex: - self.assertIn("There is no next data.", ex.message) + except fluid.core.EOFException: break avg_loss_np.append(tmp) diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index 76d0d2f2fe80e409dc1b7fa858d43fbc6ad960ef..a70321bd800bf25eeb9e5d197ea7e08626b9aede 100644 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -70,11 +70,10 @@ class TestReorderLoDTensor(unittest.TestCase): lod_level_i = numpy.random.randint( low=1, high=5, - size=self.num_seq if i == 0 else lod_level_i[-1]) - lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + size=self.num_seq if i == 0 else sum(lod_level_i)).tolist() data_lod.append(lod_level_i) data_value = numpy.random.random( - size=[data_lod[-1][-1] if data_lod else self.num_seq + size=[sum(data_lod[-1]) if data_lod else self.num_seq ] + data_shape).astype('float32') self.data[data_name] = (data_value, data_lod) @@ -84,29 +83,36 @@ class TestReorderLoDTensor(unittest.TestCase): tensor = fluid.Tensor() tensor.set(self.data[desc[0]][0], place) if self.data[desc[0]][1]: - tensor.set_lod(self.data[desc[0]][1]) + tensor.set_recursive_sequence_lengths(self.data[desc[0]][1]) self.inputs[desc[0]] = tensor def reorder(self): - level = 0 + def convert_to_offset(lod): + offset_lod = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset_lod[i].append(offset_lod[i][-1] + seq_len) + return offset_lod + level = 0 # compute the rank_table according to ref_lod ref_lod = self.data[self.data_desc[1][0]][1][level] rank_table = [] # list of (index, length) - for i in range(len(ref_lod) - 1): - rank_table.append((i, ref_lod[i + 1] - ref_lod[i])) + for i in range(len(ref_lod)): + rank_table.append((i, ref_lod[i])) rank_table = sorted(rank_table, lambda x, y: y[1] - x[1]) # compute the input sequence info according to input_lod input_value, input_lod = self.data[self.data_desc[0][0]] + offset_lod = convert_to_offset(input_lod) input_table = [] # list of (offset, length, sub_lod) - if input_lod: - for i in range(len(input_lod[level]) - 1): + if offset_lod: + for i in range(len(offset_lod[level]) - 1): start_idx = i end_idx = i + 1 sub_lod = [] - for lod_level_i in input_lod[level:]: + for lod_level_i in offset_lod[level:]: sub_lod_i = [] for idx in range(start_idx, end_idx): sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[ @@ -132,10 +138,9 @@ class TestReorderLoDTensor(unittest.TestCase): input_seq_sub_lod = input_table[index][2] if len(output_lod) == 0: - output_lod = [[0] for i in input_seq_sub_lod] - for i, sub_lod_i in enumerate(input_seq_sub_lod): - for idx_sub in sub_lod_i: - output_lod[i].append(output_lod[i][-1] + idx_sub) + output_lod = [[] for i in input_seq_sub_lod] + for i, level in enumerate(input_seq_sub_lod): + output_lod[i].extend(level) return output_value, output_lod def test_reorder_lod_tensor(self): @@ -148,7 +153,8 
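# The reorder test above builds a "rank table" from the reference input's
# length-based LoD: one (index, length) pair per sequence, sorted by
# decreasing length, which fixes the order in which sequences are visited.
# Standalone sketch (the lengths are illustrative; the test randomizes them):
ref_lod = [3, 1, 2]
rank_table = sorted(enumerate(ref_lod), key=lambda item: -item[1])
assert rank_table == [(0, 3), (2, 2), (1, 1)]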
@@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_output), expect_output, atol=0.001)) - self.assertEqual(expect_output_lod, actual_output.lod()) + self.assertEqual(expect_output_lod, + actual_output.recursive_sequence_lengths()) # check gradient expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) expect_grad_lod = self.data[self.data_desc[0][0]][1] @@ -156,7 +162,8 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_grad), expect_grad, atol=0.001)) - self.assertEqual(expect_grad_lod, actual_grad.lod()) + self.assertEqual(expect_grad_lod, + actual_grad.recursive_sequence_lengths()) def test_reorder_tensor(self): self.data_desc[0][-1] = 0 # input is tensor @@ -168,7 +175,8 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_output), expect_output, atol=0.001)) - self.assertEqual(expect_output_lod, actual_output.lod()) + self.assertEqual(expect_output_lod, + actual_output.recursive_sequence_lengths()) # check gradient expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0]) expect_grad_lod = self.data[self.data_desc[0][0]][1] @@ -176,14 +184,14 @@ class TestReorderLoDTensor(unittest.TestCase): self.assertTrue( numpy.allclose( numpy.array(actual_grad), expect_grad, atol=0.001)) - self.assertEqual(expect_grad_lod, actual_grad.lod()) + self.assertEqual(expect_grad_lod, + actual_grad.recursive_sequence_lengths()) # compare outputs between LodTensors with explicit and implicit lod # use the same data but set the input lod explicitly - input_lod = [[ - i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1) - ]] - self.inputs[self.data_desc[0][0]].set_lod(input_lod) + input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])] + self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths( + input_lod) # preserve the output of LodTensor with implicit lod to compare expect_output = [ numpy.array(actual_output) for actual_output in self.actual_outputs diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py index 3d754aff3a73e7168e2123483b26e5e3a3585a4e..df5684ab173a4889dd7b693f9246bafd12e0345f 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py @@ -107,7 +107,7 @@ class TestROIPoolOp(OpTest): rois = [] self.rois_lod = [[]] for bno in range(self.batch_size): - self.rois_lod[0].append(len(rois)) + self.rois_lod[0].append(bno + 1) for i in range(bno + 1): x1 = np.random.random_integers( 0, self.width / self.spatial_scale - self.pooled_width) @@ -121,7 +121,6 @@ class TestROIPoolOp(OpTest): roi = [bno, x1, y1, x2, y2] rois.append(roi) - self.rois_lod[0].append(len(rois)) self.rois_num = len(rois) self.rois = np.array(rois).astype("int64") diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py index 30f1efbcbcb11332c85c9d5489f22c17b06c2b36..07dcd108689ae6069e30fe22029258d192215549 100644 --- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py @@ -19,8 +19,10 @@ from op_test import OpTest def row_conv_forward(x, lod, wt): out = np.zeros_like(x) - seq_info = lod[0] - num_sequences = len(seq_info) - 1 + num_sequences = len(lod[0]) + seq_info = [0] + for seq_len in lod[0]: + seq_info.append(seq_info[-1] + seq_len) context_length = wt.shape[0] 
for i in range(num_sequences): # loop over number of sequences @@ -32,7 +34,6 @@ def row_conv_forward(x, lod, wt): cur_timesteps = end - start for j in range(cur_timesteps): # loop over different timesteps for k in range(context_length): - if j + k >= cur_timesteps: continue curoutput[j, :] += curinput[j + k, :] * wt[k, :] @@ -44,8 +45,8 @@ class TestRowConvOp1(OpTest): def setUp(self): self.op_type = "row_conv" - lod = [[0, 2, 5, 7]] - T = lod[0][-1] + lod = [[2, 3, 2]] + T = sum(lod[0]) D = 16 context_length = 2 @@ -75,8 +76,8 @@ class TestRowConvOp2(OpTest): def setUp(self): self.op_type = "row_conv" - lod = [[0, 20, 50, 100]] - T = lod[0][-1] + lod = [[20, 30, 50]] + T = sum(lod[0]) D = 35 context_length = 35 diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py index 10592d127fafdf202c65fcfa91b5c464cc60e96c..11ffa761a690eb1f9f6dc50c45128a99301741db 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py @@ -18,14 +18,19 @@ import sys from op_test import OpTest -def to_abs_lod(lod): - if len(lod) == 0 or len(lod) == 1: - return lod +def to_abs_offset_lod(lod): + offset_lod = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset_lod[i].append(offset_lod[i][-1] + seq_len) + + if len(offset_lod) == 0 or len(offset_lod) == 1: + return offset_lod import copy - new_lod = copy.deepcopy(lod) - for idx, val in enumerate(lod[0]): - new_lod[0][idx] = lod[1][val] - return new_lod + new_offset_lod = copy.deepcopy(offset_lod) + for idx, val in enumerate(offset_lod[0]): + new_offset_lod[0][idx] = offset_lod[1][val] + return new_offset_lod def seq_concat(inputs, level): @@ -35,11 +40,11 @@ def seq_concat(inputs, level): x1 = inputs['X'][1][1][0] level_idx = len(lod0) - level - 1 outs = [] - for i in range(len(lod0[level_idx]) - 1): - sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][ - i + 1], :] - sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][ - i + 1], :] + for i in range(len(lod0[level_idx])): + sub_x0 = x0[to_abs_offset_lod(lod0)[level_idx][i]:to_abs_offset_lod( + lod0)[level_idx][i + 1], :] + sub_x1 = x1[to_abs_offset_lod(lod1)[level_idx][i]:to_abs_offset_lod( + lod1)[level_idx][i + 1], :] outs.append(np.concatenate((sub_x0, sub_x1), axis=0)) return np.concatenate(outs, axis=0) @@ -48,9 +53,9 @@ class TestSeqConcatOp(OpTest): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod0 = [[2, 2], [1, 1, 1, 1]] x1 = np.random.random((4, 8, 3)).astype('float32') - lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod1 = [[2, 2], [1, 1, 1, 1]] axis = 1 level = 1 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} @@ -72,14 +77,14 @@ class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod0 = [[2, 2], [1, 1, 1, 1]] x1 = np.random.random((7, 6, 3)).astype('float32') - lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]] + lod1 = [[2, 2], [1, 2, 2, 2]] axis = 0 level = 0 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]] + out_lod = [[2, 2], [2, 3, 3, 3]] self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} @@ -87,14 +92,14 @@ class 
TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 6, 3)).astype('float32') - lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]] + lod0 = [[2, 2], [1, 1, 1, 1]] x1 = np.random.random((7, 6, 3)).astype('float32') - lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]] + lod1 = [[3, 1], [1, 2, 2, 2]] axis = 0 level = 1 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]] + out_lod = [[5, 3], [1, 1, 1, 2, 2, 1, 1, 2]] self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} @@ -102,14 +107,14 @@ class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp): def set_data(self): # two level, batch size is 3 x0 = np.random.random((4, 3, 4)).astype('float32') - lod0 = [[0, 1, 2, 3, 4]] + lod0 = [[1, 1, 1, 1]] x1 = np.random.random((7, 3, 4)).astype('float32') - lod1 = [[0, 1, 3, 5, 7]] + lod1 = [[1, 2, 2, 2]] axis = 0 level = 0 self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]} self.attrs = {'axis': axis, 'level': level} - out_lod = [[0, 2, 5, 8, 11]] + out_lod = [[2, 3, 3, 3]] self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py index 51dbf1f61834ff0093d76ed546be27a585697d40..9701d9adef1fd272f2520f66607acded6a8c25c6 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_conv.py +++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py @@ -75,35 +75,38 @@ class TestSeqProject(OpTest): pading_data = self.pad_data out = np.zeros((self.input_size[0], self.context_length * self.input_size[1])).astype('float32') - lod = lod[0] + offset = [0] + for seq_len in lod[0]: + offset.append(offset[-1] + seq_len) begin_pad = np.max([0, -self.context_start]) - for i in range(len(lod) - 1): + for i in range(len(offset) - 1): for j in range(self.context_length): - in_begin = lod[i] + self.context_start + j - in_end = lod[i + 1] + self.context_start + j - out_begin = lod[i] - out_end = lod[i + 1] - if in_begin < lod[i]: - pad_size = np.min([lod[i] - in_begin, lod[i + 1] - lod[i]]) + in_begin = offset[i] + self.context_start + j + in_end = offset[i + 1] + self.context_start + j + out_begin = offset[i] + out_end = offset[i + 1] + if in_begin < offset[i]: + pad_size = np.min( + [offset[i] - in_begin, offset[i + 1] - offset[i]]) if self.padding_trainable: sub_w = pading_data[j:j + pad_size, :] - out[lod[i]:lod[i] + pad_size, j * self.input_size[1]:( - j + 1) * self.input_size[1]] = sub_w - out_begin = lod[i] + pad_size - in_begin = lod[i] + out[offset[i]:offset[i] + pad_size, j * self.input_size[ + 1]:(j + 1) * self.input_size[1]] = sub_w + out_begin = offset[i] + pad_size + in_begin = offset[i] - if in_end > lod[i + 1]: + if in_end > offset[i + 1]: pad_size = np.min( - [in_end - lod[i + 1], lod[i + 1] - lod[i]]) + [in_end - offset[i + 1], offset[i + 1] - offset[i]]) if self.padding_trainable: sub_w = pading_data[begin_pad + self.context_start + j - pad_size:begin_pad + self.context_start + j, :] - out[lod[i + 1] - pad_size:lod[i + 1], j * self. + out[offset[i + 1] - pad_size:offset[i + 1], j * self. 
input_size[1]:(j + 1) * self.input_size[1]] = sub_w - in_end = lod[i + 1] - out_end = lod[i + 1] - pad_size + in_end = offset[i + 1] + out_end = offset[i + 1] - pad_size if in_end <= in_begin: continue @@ -175,7 +178,11 @@ class TestSeqProject(OpTest): self.context_stride = 1 self.input_size = [self.input_row, 23] - self.lod = [[0, 4, 5, 8, self.input_row]] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) self.output_represention = 8 # output feature size @@ -188,7 +195,11 @@ class TestSeqProjectCase1(TestSeqProject): self.context_stride = 1 self.input_size = [self.input_row, 23] - self.lod = [[0, 4, 5, 8, self.input_row]] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) self.output_represention = 8 # output feature size @@ -203,8 +214,12 @@ class TestSeqProjectCase2(TestSeqProject): self.input_size = [self.input_row, 23] idx = range(self.input_size[0]) del idx[0] - self.lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + - [self.input_size[0]]] + offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + + [self.input_size[0]]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) self.output_represention = 8 # output feature size diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index 2e48ef0e880839f6d5b4e515a174f427a35e7e6f..0b3659d7a67956f7546d368346bd102eeedf1d97 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -18,26 +18,34 @@ from op_test import OpTest class TestSeqAvgPool(OpTest): + def convert_to_offset(self, lod): + offset = [[0] for i in lod] + for i, level in enumerate(lod): + for seq_len in level: + offset[i].append(offset[i][-1] + seq_len) + return offset + def set_data(self): self.op_type = 'sequence_pool' # one level, batch size is 4 x = np.random.uniform(0.1, 1, [11, 23]).astype('float32') - lod = [[0, 4, 5, 8, 11]] + lod = [[4, 1, 3, 3]] self.inputs = {'X': (x, lod)} + offset = self.convert_to_offset(lod) out = np.zeros((4, 23)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "AVERAGE"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x.mean(axis=0) def setUp(self): - x, lod, out = self.set_data() - self.compute(x, lod, out) + x, offset, out = self.set_data() + self.compute(x, offset, out) def test_check_output(self): self.check_output() @@ -50,10 +58,10 @@ class TestSeqAvgPool(OpTest): class TestSeqSumPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SUM"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x.sum(axis=0) @@ -61,46 +69,47 @@ class TestSeqMaxPool(TestSeqAvgPool): def set_data(self): self.op_type = 'sequence_pool' x = 
np.random.uniform(0.1, 1, [13, 23]).astype('float32') - lod = [[0, 4, 5, 8, 13]] - for i in range(4): - l = lod[0][i + 1] - lod[0][i] - x[lod[0][i] + np.random.randint(l), :] += 2.0 + lod = [[4, 1, 3, 5]] + offset = self.convert_to_offset(lod) + for i in range(len(offset[0]) - 1): + l = offset[0][i + 1] - offset[0][i] + x[offset[0][i] + np.random.randint(l), :] += 2.0 self.inputs = {'X': (x, lod)} out = np.zeros((4, 23)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "MAX"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = np.amax(sub_x, axis=0) class TestSeqSqrtPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SQRT"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - len = lod[0][i + 1] - lod[0][i] - out[i] = sub_x.sum(axis=0) / np.sqrt(len) + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len) class TestSeqLastPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "LAST"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x[-1, :] class TestSeqFirstPool(TestSeqAvgPool): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "FIRST"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] + for i in range(len(offset[0]) - 1): + sub_x = x[offset[0][i]:offset[0][i + 1], :] out[i] = sub_x[0, :] @@ -109,35 +118,39 @@ class TestSeqAvgPool2D(TestSeqAvgPool): self.op_type = 'sequence_pool' # one level, batch size is 4 x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32') - lod = [[0, 4, 5, 8, 13]] + lod = [[4, 1, 3, 5]] self.inputs = {'X': (x, lod)} + offset = self.convert_to_offset(lod) out = np.zeros((4, 3, 17)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "AVERAGE"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x.mean(axis=0), (3, 17)) class TestSeqSumPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SUM"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x.sum(axis=0), (3, 17)) class TestSeqSqrtPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "SQRT"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) - len = lod[0][i + 1] - lod[0][i] - out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = 
np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17)) def test_check_grad(self): # Remove MaxIndex after check_grad is refined. @@ -150,36 +163,40 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): def set_data(self): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32') - lod = [[0, 4, 5, 8, 13]] + lod = [[4, 1, 3, 5]] self.inputs = {'X': (x, lod)} - for i in range(4): - l = lod[0][i + 1] - lod[0][i] - x[lod[0][i] + np.random.randint(l), :] += 1.0 + offset = self.convert_to_offset(lod) + for i in range(len(offset[0]) - 1): + l = offset[0][i + 1] - offset[0][i] + x[offset[0][i] + np.random.randint(l), :] += 1.0 out = np.zeros((4, 3, 11)).astype('float32') self.outputs = {'Out': out} - return x, lod, out + return x, offset, out - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "MAX"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 11)) out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) class TestSeqLastPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "LAST"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x[-1, :], (3, 17)) class TestSeqFirstPool2D(TestSeqAvgPool2D): - def compute(self, x, lod, out): + def compute(self, x, offset, out): self.attrs = {'pooltype': "FIRST"} - for i in range(4): - sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17)) + for i in range(len(offset[0]) - 1): + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) out[i] = np.reshape(sub_x[0, :], (3, 17)) diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py index ebab77e8041d5ff1bd845fb121e5901116fd0254..8f0765277ae85af2b17ad96d4fd0c1148c393ff0 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py @@ -18,15 +18,17 @@ from op_test import OpTest def sequence_erase(in_seq, lod0, tokens): - new_lod0 = [0] + new_lod0 = [] out_seq = [] - for i in range(0, len(lod0) - 1): + offset = 0 + for i in range(0, len(lod0)): num_out = 0 - for dat in in_seq[lod0[i]:lod0[i + 1]]: + for dat in in_seq[offset:(offset + lod0[i])]: if dat not in tokens: out_seq.append(dat) num_out += 1 - new_lod0.append(new_lod0[-1] + num_out) + offset += lod0[i] + new_lod0.append(num_out) return np.array(out_seq).astype("int32"), new_lod0 @@ -34,7 +36,7 @@ class TestSequenceEraseOpInt32(OpTest): def setUp(self): self.op_type = "sequence_erase" in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") - lod = [[0, 9, 13, 24, 30]] + lod = [[9, 4, 11, 6]] tokens = [2, 3, 5] out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) self.attrs = {'tokens': tokens} @@ -49,7 +51,7 @@ class TestSequenceEraseOpInt64(OpTest): def setUp(self): self.op_type = "sequence_erase" in_seq = np.random.randint(0, 10, (30, 1)).astype("int64") - lod = [[0, 9, 13, 24, 30]] + lod = [[9, 4, 11, 6]] tokens = [2, 3, 5] out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) self.attrs = {'tokens': tokens} @@ -64,7 +66,7 @@ class TestSequenceEraseOpEmpty(OpTest): def setUp(self): self.op_type = 
"sequence_erase" in_seq = np.random.randint(0, 10, (30, 1)).astype("int32") - lod = [[0, 9, 13, 24, 30]] + lod = [[9, 4, 11, 6]] tokens = [] out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens) self.attrs = {'tokens': tokens} diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py index 4c8ec1426c6e103498af544ea5928ec630707d46..0bbd31814efdff6050733f6876ef64e3fcaaaf76 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py @@ -21,7 +21,7 @@ class TestSequenceExpand(OpTest): def set_data(self): x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32') y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') - y_lod = [[0, 1, 4, 8]] + y_lod = [[1, 3, 4]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} def compute(self): @@ -37,23 +37,27 @@ class TestSequenceExpand(OpTest): out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype) if x_lod is None: - x_idx = [i for i in xrange(x_data.shape[0] + 1)] + # x_idx = [i for i in xrange(x_data.shape[0] + 1)] + x_idx = [1] * x_data.shape[0] else: x_idx = x_lod[0] - out_lod = [[0]] + out_lod = [[]] + + offset = 0 + for i in xrange(len(y_lod[ref_level])): + repeat_num = y_lod[ref_level][i] + x_len = x_idx[i] - for i in xrange(1, len(y_lod[ref_level])): - repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1] - x_len = x_idx[i] - x_idx[i - 1] if repeat_num > 0: - x_sub = x_data[x_idx[i - 1]:x_idx[i], :] + x_sub = x_data[offset:(offset + x_len), :] stacked_x_sub = x_sub for r in range(repeat_num - 1): stacked_x_sub = np.vstack((stacked_x_sub, x_sub)) out = np.vstack((out, stacked_x_sub)) if x_lod is not None: for j in xrange(repeat_num): - out_lod[0].append(out_lod[0][-1] + x_len) + out_lod[0].append(x_len) + offset += x_len if x_lod is None: self.outputs = {'Out': out} @@ -75,9 +79,9 @@ class TestSequenceExpand(OpTest): class TestSequenceExpandCase1(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') - x_lod = [[0, 2, 5]] + x_lod = [[2, 3]] y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32') - y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]] + y_lod = [[2, 3], [2, 2, 3, 3, 3]] self.inputs = {'X': x_data, 'Y': (y_data, y_lod)} self.attrs = {'ref_level': 0} @@ -85,9 +89,9 @@ class TestSequenceExpandCase1(TestSequenceExpand): class TestSequenceExpandCase2(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32') - x_lod = [[0, 1]] + x_lod = [[1]] y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32') - y_lod = [[0, 2], [0, 2]] + y_lod = [[2], [1, 1]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} self.attrs = {'ref_level': 0} @@ -95,9 +99,9 @@ class TestSequenceExpandCase2(TestSequenceExpand): class TestSequenceExpandCase3(TestSequenceExpand): def set_data(self): x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32') - x_lod = [[0, 1, 2, 3, 4]] - y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32') - y_lod = [[0, 2, 4, 4, 6]] + x_lod = [[1, 1, 1, 1]] + y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32') + y_lod = [[2, 2, 2, 2]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} @@ -105,9 +109,9 @@ class TestSequenceExpandCase4(TestSequenceExpand): def set_data(self): data = np.random.uniform(0.1, 1, [5 * 2, 1]) x_data = np.array(data).reshape([5, 2]).astype('float32') - x_lod = [[0, 2, 5]] - y_data = 
np.random.uniform(0.1, 1, [3, 1]).astype('float32') - y_lod = [[0, 1, 3], [0, 1, 3]] + x_lod = [[2, 3]] + y_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32') + y_lod = [[2], [2, 3]] self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py index efeab560392d8c03b1bb5db83f59c12d4fef64b0..68f2e5eba35ed318281d14e397dc6d363bcb4079 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py @@ -22,7 +22,7 @@ class TestSequenceReshape(OpTest): def setUp(self): self.op_type = 'sequence_reshape' dimension = 12 - x_lod = [[0, 4, 5, 8, 11]] + x_lod = [[4, 1, 3, 3]] x = np.random.uniform(0.1, 1, [11, 24]).astype('float32') self.inputs = {'X': (x, x_lod)} @@ -34,13 +34,13 @@ class TestSequenceReshape(OpTest): def compute_output(self, x, x_lod, dimension): x_width = x.shape[1] - out_lod = [[0]] - for i in xrange(len(x_lod[0]) - 1): - seq_len = x_lod[0][i + 1] - x_lod[0][i] + out_lod = [[]] + for i in xrange(len(x_lod[0])): + seq_len = x_lod[0][i] offset = (seq_len * x_width) / dimension assert int(offset) * dimension == seq_len * x_width - out_lod[0].append(out_lod[0][-1] + int(offset)) - out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32') + out_lod[0].append(int(offset)) + out = np.zeros(shape=(sum(out_lod[0]), dimension)).astype('float32') out.ravel()[:] = x.ravel()[:] return out, out_lod @@ -55,7 +55,7 @@ class TestSequenceReshape_reduce(TestSequenceReshape): def setUp(self): self.op_type = 'sequence_reshape' dimension = 24 - x_lod = [[0, 4, 6, 8, 12]] + x_lod = [[4, 2, 2, 4]] x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') self.inputs = {'X': (x, x_lod)} @@ -70,7 +70,7 @@ class TestSequenceReshape_same(TestSequenceReshape): def setUp(self): self.op_type = 'sequence_reshape' dimension = 12 - x_lod = [[0, 4, 6, 8, 12]] + x_lod = [[4, 2, 2, 4]] x = np.random.uniform(0.1, 1, [12, 12]).astype('float32') self.inputs = {'X': (x, x_lod)} diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py index 660b4a171d09ddfc0e78b650a467db6b576c7ee3..313e485d1e3080f2c59c68256cbc5c81aa6558cd 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py @@ -29,20 +29,20 @@ class TestSequenceSliceOp(OpTest): self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length} outs = [] #np.zeros((100, 3, 2)).astype('float32') - out_lod = [[0]] - out_lod_offset = 0 + out_lod = [[]] + lod_offset = 0 for i in range(len(offset)): - sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] + + sub_x = x[lod_offset + offset[i, 0]:lod_offset + offset[i, 0] + length[i, 0], :] - out_lod_offset = out_lod_offset + len(sub_x) outs.append(sub_x) - out_lod[0].append(out_lod_offset) + out_lod[0].append(len(sub_x)) + lod_offset += lod[0][i] outs = np.concatenate(outs, axis=0) self.outputs = {'Out': (outs, out_lod)} def init_test_case(self): self.x_dim = (100, 3, 2) - self.x_lod = [[0, 20, 40, 60, 80, 100]] + self.x_lod = [[20, 20, 20, 20, 20]] self.offset = [[1], [2], [3], [4], [5]] self.length = [[10], [8], [6], [4], [2]] diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py index 
d6dc99bb3106feee33daa52bffb386f07cc16de5..e91a69a0f8039651225039beb2a42e8dffeb62d3 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py @@ -26,15 +26,16 @@ class TestSequenceSoftmaxOp(OpTest): self.init_op_type() x = np.random.uniform(0.1, 1, (11, 1)).astype("float32") - lod = [[0, 4, 5, 8, 11]] + lod = [[4, 1, 3, 3]] out = np.zeros((11, 1)).astype("float32") - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i]) + offset = 0 + for i in range(len(lod[0])): + sub_x = x[offset:offset + lod[0][i], :] + sub_x = sub_x.reshape(1, lod[0][i]) sub_out = stable_softmax(sub_x) - out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape( - lod[0][i + 1] - lod[0][i], 1) + out[offset:offset + lod[0][i], :] = sub_out.reshape(lod[0][i], 1) + offset += lod[0][i] self.inputs = {"X": (x, lod)} self.outputs = {"Out": out} diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index 1d93230e7b74c5b6c00bbe125e3ae2d3a649b4b9..b779f0fb014bbba62927754ea6f36828a32e6c0a 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -54,12 +54,12 @@ class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase): def test_refer_lod(self): cpu = core.CPUPlace() x_tensor = core.LoDTensor() - x_tensor.set_lod([[0, 2, 5, 6]]) + x_tensor.set_recursive_sequence_lengths([[2, 3, 1]]) tensor_np = np.random.random(size=(6, 100)).astype('float32') x_tensor.set(tensor_np, cpu) rank_table_tensor = core.LoDTensor() - rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]]) rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), cpu) @@ -83,7 +83,7 @@ class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase): x_tensor.set(tensor_np, cpu) rank_table_tensor = core.LoDTensor() - rank_table_tensor.set_lod([[0, 1, 3, 6]]) + rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]]) rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'), cpu) diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py deleted file mode 100644 index f4aa7426bc315be501348a64e2f15caed6dc8810..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np - -import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops - -from transpiler_test import TranspilerTest - - -class TestSimpleDistTranspiler(TranspilerTest): - def setUp(self): - self.current_pserver_ep = "127.0.0.1:6175" - - def test_simple_transpiler(self): - np.random.seed(1) - - trainer = self.get_trainer() - pserver, startup = self.get_pserver(self.current_pserver_ep) - self.assertEqual([op.type for op in trainer.global_block().ops], - self.get_expect_trainer_ops()) - - self.assertEqual(len(pserver.blocks), 2) - # block0: listen_and_serv - self.assertEqual([op.type for op in pserver.blocks[0].ops], - ["listen_and_serv"]) - # block1: optimize pass - self.assertEqual([op.type for op in pserver.blocks[1].ops], - ["sum", "scale", "sgd"]) - - # confirm startup program - self.assertEqual([op.type for op in startup.global_block().ops], - ["fill_constant", "uniform_random", "uniform_random"]) - - # the variable #fc_w will NOT be splited - fc_w_var = startup.global_block().var("fc_w@GRAD") - self.assertEqual(fc_w_var.shape, (1000, 1000)) - - fc_w_var = startup.global_block().var("fc_w@GRAD.trainer_0") - self.assertEqual(fc_w_var.shape, (1000, 1000)) - - def get_expect_trainer_ops(self): - trainer = fluid.Program() - - with fluid.program_guard(trainer): - optimize_ops, params_grads = self.net_conf() - - delete_ops(trainer.global_block(), optimize_ops) - ops = [op.type for op in trainer.global_block().ops] + [ - "send", "send_barrier", "recv", "recv", "fetch_barrier" - ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send") - return ops - - def _transpiler_instance(self): - main = self.get_main_program() - t = fluid.DistributeTranspiler() - t.transpile( - self.trainer_id, - program=main, - pservers=self.pserver_eps, - trainers=self.trainers, - slice_var_up=False) - return t - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 02cc7da84918041c33bf5c8def46025bc87a2b9e..0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9 100644 --- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -56,7 +56,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): def test_split_and_merge_lod_tensor_level_0(self): tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place()) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) mask_np = np.array([0, 1, 0]).astype('bool') mask_np = np.expand_dims(mask_np, axis=1) @@ -68,15 +68,15 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1) expect_true = core.LoDTensor() expect_true.set(expect_true_tensor, self.place()) - expect_true.set_lod([[0, 6]]) + expect_true.set_recursive_sequence_lengths([[6]]) expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32') expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1) - expect_false_lod = [[0, 3, 4]] + expect_false_lod = [[3, 1]] expect_false = core.LoDTensor() expect_false.set(expect_false_tensor, self.place()) - expect_false.set_lod(expect_false_lod) + expect_false.set_recursive_sequence_lengths(expect_false_lod) self.main( tensor=tensor, @@ -126,7 +126,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): def check_tensor_same(self, actual, expect): 
self.assertTrue(np.allclose(np.array(actual), np.array(expect))) - self.assertEqual(actual.lod(), expect.lod()) + self.assertEqual(actual.recursive_sequence_lengths(), + expect.recursive_sequence_lengths()) class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): @@ -151,7 +152,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): tensor = core.LoDTensor() tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) - tensor.set_lod([[0, 3, 9, 10]]) + tensor.set_recursive_sequence_lengths([[3, 6, 1]]) mask_np = np.array([0, 1, 0]).astype('bool') mask_np = np.expand_dims(mask_np, axis=1) diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..7956897d68a3fb49d62ba696d0b6400b4f909989 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py @@ -0,0 +1,26 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from test_sum_op import TestSumOp + + +class TestMKLDNN(TestSumOp): + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 2faf5b10647a1fa1d44e4847f017db177ee8808a..1d90414e137a70e6265042e24e106fe565802778 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -20,12 +20,15 @@ from op_test import OpTest class TestSumOp(OpTest): def setUp(self): self.op_type = "sum" + self.use_mkldnn = False + self.init_kernel_type() x0 = np.random.random((3, 4)).astype('float32') x1 = np.random.random((3, 4)).astype('float32') x2 = np.random.random((3, 4)).astype('float32') self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} y = x0 + x1 + x2 self.outputs = {'Out': y} + self.attrs = {'use_mkldnn': self.use_mkldnn} def test_check_output(self): self.check_output() @@ -33,6 +36,9 @@ class TestSumOp(OpTest): def test_check_grad(self): self.check_grad(['x0'], 'Out') + def init_kernel_type(self): + pass + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py index ccb41e56c5555b8c79674449c9139ada0bc47aac..bd208897520122b6a5dcf71da325b1b9dba632f6 100644 --- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py @@ -22,22 +22,23 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod): if len(gt_lod) != len(neg_lod): raise AssertionError("The input arguments are illegal.") - batch_size = len(gt_lod) - 1 + batch_size = len(gt_lod) match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32') - neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32') + neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32') + 
offset = 0 for n in range(batch_size): - gt_num = gt_lod[n + 1] - gt_lod[n] + gt_num = gt_lod[n] ids = random.sample([i for i in range(num_prior)], gt_num) match_indices[n, ids] = [i for i in range(gt_num)] ret_ids = set([i for i in range(num_prior)]) - set(ids) - s = neg_lod[n] - e = neg_lod[n + 1] - l = e - s + l = neg_lod[n] neg_ids = random.sample(ret_ids, l) - neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1) + neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype( + 'int32').reshape(l, 1) + offset += neg_lod[n] return match_indices, neg_indices @@ -56,24 +57,28 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod, # init weight for target label trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32') + gt_offset = 0 + neg_offset = 0 for i in range(batch_size): cur_indices = match_indices[i] col_ids = np.where(cur_indices > -1) col_val = cur_indices[col_ids] - gt_start = gt_lod[i] # target bbox - for v, c in zip(col_val + gt_start, col_ids[0].tolist()): + for v, c in zip(col_val + gt_offset, col_ids[0].tolist()): trg_box[i][c][:] = encoded_box[v][c][:] # weight for target bbox trg_box_wt[i][col_ids] = 1.0 - trg_label[i][col_ids] = gt_label[col_val + gt_start] + trg_label[i][col_ids] = gt_label[col_val + gt_offset] trg_label_wt[i][col_ids] = 1.0 # set target label weight to 1.0 for the negative samples if neg_indices is not None: - neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]] + neg_ids = neg_indices[neg_offset:neg_offset + neg_lod[i]] trg_label_wt[i][neg_ids] = 1.0 + # update offset + gt_offset += gt_lod[i] + neg_offset += neg_lod[i] return trg_box, trg_box_wt, trg_label, trg_label_wt @@ -83,11 +88,11 @@ class TestTargetAssginFloatType(OpTest): self.op_type = "target_assign" num_prior = 120 num_class = 21 - gt_lod = [0, 5, 11, 23] - neg_lod = [0, 4, 7, 13] + gt_lod = [5, 6, 12] + neg_lod = [4, 3, 6] mismatch_value = 0 - batch_size = len(gt_lod) - 1 - num_gt = gt_lod[-1] + batch_size = len(gt_lod) + num_gt = sum(gt_lod) encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') gt_label = np.random.randint( @@ -121,11 +126,11 @@ class TestTargetAssginIntType(OpTest): self.op_type = "target_assign" num_prior = 120 num_class = 21 - gt_lod = [0, 5, 11, 23] - neg_lod = [0, 4, 7, 13] + gt_lod = [5, 6, 12] + neg_lod = [4, 3, 6] mismatch_value = 0 - batch_size = len(gt_lod) - 1 - num_gt = gt_lod[-1] + batch_size = len(gt_lod) + num_gt = sum(gt_lod) encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32') gt_label = np.random.randint( diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index 379081c3287ce81dbf2bd7307cb5eac2620b13db..f17edd3025b17549892bbd47935a1d2452cefac3 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -69,15 +69,14 @@ class TestTensor(unittest.TestCase): array[0, 0, 0] = 3 array[3, 3, 5] = 10 lod_tensor.set(array, place) - lod_tensor.set_lod([[0, 2, 4]]) + lod_tensor.set_recursive_sequence_lengths([[2, 2]]) lod_v = numpy.array(lod_tensor) self.assertTrue(numpy.alltrue(array == lod_v)) - lod = lod_tensor.lod() - self.assertEqual(0, lod[0][0]) + lod = lod_tensor.recursive_sequence_lengths() + self.assertEqual(2, lod[0][0]) self.assertEqual(2, lod[0][1]) - self.assertEqual(4, lod[0][2]) def test_float_lod_tensor(self): place = core.CPUPlace() @@ -97,21 +96,21 @@ class TestTensor(unittest.TestCase): lod_v = numpy.array(lod_tensor) 
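# --- Illustration (not part of the patch above) ---
# A short sketch of the Python-side API change exercised in test_tensor.py:
# LoDTensor now takes length-based LoD via set_recursive_sequence_lengths()
# and returns it via recursive_sequence_lengths(), replacing the offset-based
# set_lod()/lod() pair. Assumes a Fluid build where these bindings are
# available, as the updated tests require.

import numpy as np
import paddle.fluid.core as core

place = core.CPUPlace()
tensor = core.LoDTensor()
tensor.set(np.arange(10).reshape(10, 1).astype('int32'), place)

# Two sequences of lengths 4 and 6; the old offset form would be [[0, 4, 10]].
tensor.set_recursive_sequence_lengths([[4, 6]])
assert tensor.recursive_sequence_lengths() == [[4, 6]]
# --- end illustration ---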
self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) - self.assertEqual(len(lod_tensor.lod()), 0) + self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0) - lod_py = [[0, 2, 5], [0, 2, 4, 5]] - lod_tensor.set_lod(lod_py) - lod = lod_tensor.lod() + lod_py = [[2, 1], [1, 2, 2]] + lod_tensor.set_recursive_sequence_lengths(lod_py) + lod = lod_tensor.recursive_sequence_lengths() self.assertListEqual(lod_py, lod) def test_lod_tensor_init(self): scope = core.Scope() place = core.CPUPlace() - lod_py = [[0, 2, 5], [0, 2, 4, 5]] + lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() lod_tensor.set_dims([5, 2, 3, 4]) - lod_tensor.set_lod(lod_py) + lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor.alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 @@ -121,17 +120,17 @@ class TestTensor(unittest.TestCase): lod_v = numpy.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) - self.assertListEqual(lod_py, lod_tensor.lod()) + self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths()) def test_lod_tensor_gpu_init(self): if not core.is_compiled_with_cuda(): return place = core.CUDAPlace(0) - lod_py = [[0, 2, 5], [0, 2, 4, 5]] + lod_py = [[2, 1], [1, 2, 2]] lod_tensor = core.LoDTensor() lod_tensor.set_dims([5, 2, 3, 4]) - lod_tensor.set_lod(lod_py) + lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor.alloc_float(place) tensor_array = numpy.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 @@ -141,7 +140,7 @@ class TestTensor(unittest.TestCase): lod_v = numpy.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) - self.assertListEqual(lod_py, lod_tensor.lod()) + self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths()) def test_empty_tensor(self): place = core.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index ac638f7836f8205f80e31cfd5eb8892b2c7aee08..9f1aaee472f918da7deb8816a0a4654dafe74a30 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -34,8 +34,8 @@ class CTCForward(object): self.level = 0 self.num_classes = softmax.shape[1] - self.batch_size = len(softmax_lod[self.level]) - 1 - assert self.batch_size == len(labels_lod[self.level]) - 1 + self.batch_size = len(softmax_lod[self.level]) + assert self.batch_size == len(labels_lod[self.level]) self.loss = np.zeros([self.batch_size, 1], dtype="float32") self.gradient = np.zeros(self.softmax.shape, dtype="float32") @@ -156,16 +156,20 @@ class CTCForward(object): return -log_prob def forward(self): + softmax_offset = 0 + labels_offset = 0 for i in range(self.batch_size): - softmax_start_i = self.softmax_lod[self.level][i] - softmax_end_i = self.softmax_lod[self.level][i + 1] - labels_start_i = self.labels_lod[self.level][i] - labels_end_i = self.labels_lod[self.level][i + 1] + softmax_start_i = softmax_offset + softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] + labels_start_i = labels_offset + labels_end_i = labels_offset + self.labels_lod[self.level][i] softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] self.loss[i] = self.forward_a_sequence(softmax_a_sequence, labels_a_sequence) + softmax_offset += self.softmax_lod[self.level][i] + 
labels_offset += self.labels_lod[self.level][i] return self.loss @@ -173,8 +177,8 @@ class TestWarpCTCOp(OpTest): def config(self): self.batch_size = 4 self.num_classes = 8 - self.logits_lod = [[0, 4, 5, 8, 11]] - self.labels_lod = [[0, 3, 4, 8, 12]] + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] self.blank = self.num_classes - 1 self.norm_by_times = False @@ -184,11 +188,13 @@ class TestWarpCTCOp(OpTest): logits = np.random.uniform( 0.1, 1.0, - [self.logits_lod[0][-1], self.num_classes]).astype("float32") + [sum(self.logits_lod[0]), self.num_classes]).astype("float32") softmax = np.apply_along_axis(stable_softmax, 1, logits) # labels should not be blank labels = np.random.randint( - 0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32") + 0, + self.num_classes - 1, [sum(self.labels_lod[0]), 1], + dtype="int32") ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, self.blank, self.norm_by_times) @@ -196,9 +202,8 @@ class TestWarpCTCOp(OpTest): max_sequence_length = 0 for i in range(self.batch_size): - max_sequence_length = max( - max_sequence_length, - self.logits_lod[0][i + 1] - self.logits_lod[0][i]) + max_sequence_length = max(max_sequence_length, + self.logits_lod[0][i]) self.gradient = np.zeros( [max_sequence_length, self.batch_size, self.num_classes], dtype="float32") @@ -222,8 +227,8 @@ class TestWarpCTCOpCase1(TestWarpCTCOp): def config(self): self.batch_size = 4 self.num_classes = CUDA_BLOCK_SIZE + 2 - self.logits_lod = [[0, 4, 5, 8, 11]] - self.labels_lod = [[0, 3, 4, 8, 12]] + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] self.blank = 0 self.norm_by_times = False diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py index 2adf917bc5d3bb35842a817c57a983627b759f22..436f9b9f86fb86270e47c8e30c5c0701787ca0f1 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py +++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -76,11 +76,11 @@ class TestWeightNormalization(unittest.TestCase): lod_level_i = numpy.random.randint( low=1, high=5, - size=self.batch_size if i == 0 else lod_level_i[-1]) - lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist() + size=self.batch_size + if i == 0 else sum(lod_level_i)).tolist() data_lod.append(lod_level_i) data_value = numpy.random.random( - size=[data_lod[-1][-1] if data_lod else self.batch_size + size=[sum(data_lod[-1]) if data_lod else self.batch_size ] + data_shape).astype('float32') self.data[data_name] = (data_value, data_lod) @@ -90,7 +90,7 @@ class TestWeightNormalization(unittest.TestCase): tensor = fluid.Tensor() tensor.set(self.data[desc[0]][0], place) if self.data[desc[0]][1]: - tensor.set_lod(self.data[desc[0]][1]) + tensor.set_recursive_sequence_lengths(self.data[desc[0]][1]) self.inputs[desc[0]] = tensor def weight_normalize(self): diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 1dc94a80c9d3999d34fdf0edbf82ffe297bd95d7..a995ee10f29a714b674fae4b31070e6ba2ca9953 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -22,7 +22,7 @@ def as_lodtensor(np_array, lod, place): tensor = core.LoDTensor() tensor.set(np_value, place) if lod is not None: - tensor.set_lod(lod) + tensor.set_recursive_sequence_lengths(lod) return tensor @@ -73,7 +73,7 @@ def set_input(scope, op, inputs, place): if isinstance(var, tuple) 
or isinstance(var, np.ndarray): tensor = scope.find_var(var_name).get_tensor() if isinstance(var, tuple): - tensor.set_lod(var[1]) + tensor.set_recursive_sequence_lengths(var[1]) var = var[0] tensor.set_dims(var.shape) tensor.set(var, place) diff --git a/python/paddle/fluid/tests/unittests/transpiler_test.py b/python/paddle/fluid/tests/unittests/transpiler_test.py deleted file mode 100644 index d84c5d9c41c705cf6d14cc0b5a8c692b0d646337..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/transpiler_test.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np - -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.layers as layers - - -class TranspilerTest(unittest.TestCase): - @classmethod - def setUpClass(self): - self.trainer_id = 0 - self.trainers = 2 - self.pservers = 2 - self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175" - - def net_conf(self): - x = fluid.layers.data(name='x', shape=[1000], dtype='float32') - - y_predict = fluid.layers.fc(input=x, - size=1000, - act=None, - param_attr=fluid.ParamAttr(name='fc_w')) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) - - optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) - return optimize_ops, params_grads - - def get_main_program(self): - main = fluid.Program() - - with fluid.program_guard(main): - self.net_conf() - - return main - - def get_trainer(self): - return self._transpiler_instance().get_trainer_program() - - def get_pserver(self, ep): - t = self._transpiler_instance() - pserver = t.get_pserver_program(ep) - startup = t.get_startup_program(ep, pserver) - return pserver, startup - - def _transpiler_instance(self): - main = self.get_main_program() - t = fluid.DistributeTranspiler() - t.transpile( - self.trainer_id, - program=main, - pservers=self.pserver_eps, - trainers=self.trainers) - return t diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index efc28d899304b01a3085891f3ae9396d57c589a1..b6e0241265b18377874efb0d223441994b4650d0 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -33,23 +33,59 @@ __all__ = [ class BeginEpochEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + def __init__(self, epoch_id): self.epoch = epoch_id class EndEpochEvent(object): + """ + The end of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + """ + def __init__(self, epoch_id): self.epoch = epoch_id class BeginStepEvent(object): + """ + The begin of a training epoch. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. 
+ """ + def __init__(self, epoch_id, step_id): self.epoch = epoch_id self.step = step_id self.fetch_metrics = True + """ + If fetch_metrics is true, the metrics will be fetched at the + EndStepEvent. Default is True. + """ class EndStepEvent(object): + """ + The end of a training step. + + Args: + epoch_id(int): The current epoch ID. + step_id(int): The current step ID. + metrics(list): A list of fetched tensor. The order of this list is same + as the :code:`train_func` returns. + """ + def __init__(self, epoch_id, step_id, metrics): self.epoch = epoch_id self.step = step_id @@ -57,32 +93,46 @@ class EndStepEvent(object): class CheckpointConfig(object): + """ + Parameter object for :code:`fluid.io.save_checkpoint` and + :code:`fluid.Trainer`. Used to configuration how to save checkpoint. + + Args: + checkpoint_dir(str): Directory path to save check point. Default is the + current directory. + + max_num_checkpoints(int): The max number of local check points. + epoch_interval(int): Every number of epoch to save check point. + step_interval(int): Every number of step to save check point. + + Examples: + >>> config = fluid.CheckpointConfig("./checkpoints") + >>> trainer = fluid.Trainer(train_func=train_program, + >>> place=place, + >>> optimizer_func=optimizer_func, + >>> checkpoint_config=config) + >>> trainer.train(...) + """ + def __init__(self, checkpoint_dir=None, max_num_checkpoints=3, epoch_interval=1, step_interval=10): - if checkpoint_dir is None: - self.checkpoint_dir = os.getcwd() - else: - self.checkpoint_dir = checkpoint_dir - - self.max_num_checkpoints = max_num_checkpoints - if epoch_interval < 1: - self.epoch_interval = 1 - else: - self.epoch_interval = epoch_interval - - if step_interval < 1: - self.step_interval = 10 - else: - self.step_interval = step_interval + assert epoch_interval >= 1 + assert step_interval >= 1 + self.checkpoint_dir = checkpoint_dir \ + if checkpoint_dir is not None else os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.epoch_interval = epoch_interval + self.step_interval = step_interval self.epoch_id = 0 self.step_id = 0 self.load_serial = None - self.is_pserver = False + self.pserver_id = None + self.lookup_table_name = None def check_and_get_place(place): @@ -113,11 +163,62 @@ def check_and_get_place(place): class Trainer(object): """ + A trainer wraps MultiGPU/MultiNode training loops and can be used to train a + simple neural network easily. + + This API takes a :code:`train_func`. A :code:`train_func` is a function that + return loss as it first return value. The reset value can be fetched by + EndStepEvent.metrics + + This API also takes a :code:`optimizer_func` that will return an optimizer + instance. 
+ + For example, to train a MLP for MNIST dataset, the sample program is + + >>> import paddle.fluid as fluid + >>> + >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): + >>> hidden = image + >>> for layer_size in layer_sizes: + >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) + >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") + >>> + >>> def train_mnist_mlp(): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> prediction = mlp(img) + >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) + >>> + >>> def optimizer(): + >>> return fluid.optimizer.Adam() + >>> + >>> trainer = Trainer(train_func=train_mnist_mlp, + >>> optimizer_func=optimizer, + >>> place=fluid.CUDAPlace(0), + >>> parallel=True) + >>> + >>> def train_callback(event): + >>> if isinstance(event, fluid.EndStepEvent): + >>> print "Epoch ID", event.epoch, "Step ID",\ + >>> event.step, "AvgLoss", event.metrics[0] + >>> elif isinstance(event, fluid.EndEpochEvent): + >>> trainer.save_params("./model_{0}".format(event.epoch)) + >>> + >>> trainer.train(num_epochs=100, event_handler=train_callback) + + For more example, please see :ref:`api_guide_high_level_api`. + Args: - train_func(callable): A function which will return loss. The loss must be a scalar. + train_func(callable): A function which will return loss. The loss must be + a scalar tensor. optimizer_func(callable): A function that returns an Optimizer object. - place: The device place of this trainer. + place(CUDAPlace|CPUPlace): The device place of this trainer. If + :code:`parallel=True,` all CUDA Places will be used if :code:`place` + is a :code:`CUDAPlace`. + parallel(bool): True if use multiple devices. + checkpoint_config(CheckpointConfig): Configuration about how to save + checkpoints. """ def __init__(self, @@ -129,9 +230,6 @@ class Trainer(object): checkpoint_config=None): self.__stop = False self.parallel = parallel - # 1. we need to generate a framework.Program by calling - # program_func. Reference: fluid.program_guard in - # test_word2vec.py # config for checkpoint # only chief worker will save variables @@ -145,6 +243,10 @@ class Trainer(object): self.scope = core.Scope() + # 1. we need to generate a framework.Program by calling + # program_func. 
Reference: fluid.program_guard in + # test_word2vec.py + self.startup_program = framework.Program() self.train_program = framework.Program() @@ -181,13 +283,20 @@ class Trainer(object): self.checkpoint_cfg.load_serial, self.startup_program) - if not self.checkpoint_cfg.is_pserver: - epoch_id, step_id = io.load_trainer_args( - self.checkpoint_cfg.checkpoint_dir, - self.checkpoint_cfg.load_serial, self.trainer_id, - self._get_checkpoint_load_args()) - self.checkpoint_cfg.epoch_id = int(epoch_id) - self.checkpoint_cfg.step_id = int(step_id) + if not self.checkpoint_cfg.pserver_id: + epoch_id, step_id = io.load_trainer_args( + self.checkpoint_cfg.checkpoint_dir, + self.checkpoint_cfg.load_serial, self.trainer_id, + self._get_checkpoint_load_args()) + self.checkpoint_cfg.epoch_id = int(epoch_id) + self.checkpoint_cfg.step_id = int(step_id) + else: + if self.checkpoint_cfg.lookup_table_name: + io.load_lookup_table_vars( + exe, self.checkpoint_cfg.checkpoint_dir, + self.startup_program, + self.checkpoint_cfg.pserver_id, + self.checkpoint_cfg.lookup_table_name) if param_path and os.path.isdir(param_path): # load params from param_path into scope @@ -206,7 +315,7 @@ class Trainer(object): for ip in worker_ips.split(","): worker_endpoints.append(':'.join([ip, port])) self.num_trainers = len(worker_endpoints) - current_endpoint = os.getenv("POD_IP") + ":" + port + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port worker_endpoints.remove(current_endpoint) # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id # in ParallelExecutor to start @@ -257,7 +366,10 @@ class Trainer(object): self.trainer_id, pservers=pserver_endpoints, trainers=trainers) if training_role == "PSERVER": if self.checkpoint_cfg: - self.is_pserver = True + pserver_id = eplist.index(current_endpoint) + self.checkpoint_cfg.pserver_id = pserver_id + if t.has_distributed_lookup_table: + self.checkpoint_cfg.lookup_table_name = t.table_name self.train_program = t.get_pserver_program(current_endpoint) self.startup_program = t.get_startup_program(current_endpoint, @@ -277,17 +389,18 @@ class Trainer(object): def train(self, num_epochs, event_handler, reader=None, feed_order=None): """ - Train the model. + Start the train loop to train the model. Args: - num_epochs: The number of epoch. An epoch will process all data in reader - event_handler: The event handler. A function with type (ev:Event)->void - reader: - feed_order: Feeding order of reader. None will following the defining + num_epochs(int): The number of epoch. An epoch will process all data in reader + event_handler(callable): The event handler. A function with type (ev:Event)->void + reader(callable): A reader creator object. See also + :ref:`api_guide_python_reader` . + feed_order(list): Feeding order of reader. None will following the defining order in program Returns: - + None """ training_role = os.getenv("PADDLE_TRAINING_ROLE", "") if training_role == "PSERVER": @@ -307,16 +420,24 @@ class Trainer(object): Test the model on given test data Args: - reader: The reader that yields test data. - feed_order: Feeding order of reader. None will following the defining - order in program + reader(callable): The reader that yields test data. + feed_order(list): Feeding order of reader. None will following the + defining order in program """ return self._test_by_executor(reader, feed_order, self.train_func_outputs) def save_params(self, param_path): - # reference: save_persistables in io.py + """ + Save all parameters into :code:`param_path`. 
+ + Args: + param_path(str): The path to save parameters. + + Returns: + None + """ with self._prog_and_scope_guard(): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) @@ -448,7 +569,8 @@ class Trainer(object): def _save_checkpoint(self, epoch_id, step_id): assert self.checkpoint_cfg - if epoch_id % self.checkpoint_cfg.epoch_interval == 0 and step_id % self.checkpoint_cfg.step_interval == 0: + if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ + and step_id % self.checkpoint_cfg.step_interval == 0: exe = executor.Executor(self.place) io.save_checkpoint( executor=exe, diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 9c604170b8b53c9cbcf39b4978ae60ccad84648c..53d6ca86a008f798af2854a154cce8b7242d2f35 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -12,19 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Transpile the program to distributed data-parallelism programs. -The main_program will be transformed to use a remote parameter server -to do parameter optimization. And the optimization graph will be put -into a parameter server program. - -Use different methods to split trainable variables to different -parameter servers. - Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. -4. append send_op to send splited variables to server and +4. append send_op to send splited variables to server and 5. add recv_op to fetch params(splited blocks or origin param) from server. 6. append concat_op to merge splited blocks to update local weights. @@ -44,7 +36,7 @@ import numpy as np from ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ - default_startup_program, \ + default_startup_program, Block, \ Variable, Parameter, grad_var_name from details import * @@ -117,129 +109,41 @@ def slice_variable(var_list, slice_count, min_block_size=8192): return blocks -class DistributeTranspiler: - def _has_distributed_lookup_table(self): - # process lookup_table_op - # 1. check all lookup_table_op is distributed - # 2. check all lookup_table_op share the same table. - distributed_lookup_table_ops = [] - # support only one distributed_lookup_table now - self.table_name = None - for op in self.origin_program.global_block().ops: - if op.type == LOOKUP_TABLE_TYPE: - if op.attrs['is_distributed'] is True: - if self.table_name is None: - self.table_name = op.input("W")[0] - if self.table_name != op.input("W")[0]: - raise RuntimeError("all distributed lookup_table_ops" - " should have only one table") - distributed_lookup_table_ops.append(op) - else: - if self.table_name is not None: - assert op.input("W")[0] != self.table_name - - return len(distributed_lookup_table_ops) > 0 - - def _update_dist_lookup_table_vars(self, param_list, grad_list, - params_grads): - # TODO(wuyi): put find a way to put dist lookup table stuff all together. 
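The interval test in the :code:`_save_checkpoint()` hunk above is driven by the :code:`CheckpointConfig` handed to the trainer. A sketch of that wiring follows; the keyword arguments shown are the fields this checkpoint code actually reads, while the directory name and the assumption that :code:`CheckpointConfig` is importable from :code:`paddle.fluid` are illustrative.

.. code-block:: python

    import paddle.fluid as fluid

    ckpt_cfg = fluid.CheckpointConfig(
        checkpoint_dir="./checkpoints",  # where io.save_checkpoint() writes
        epoch_interval=1,                # consider saving at every epoch ...
        step_interval=10)                # ... but only on every 10th step

    trainer = Trainer(
        train_func=train_mnist_mlp,
        optimizer_func=optimizer,
        place=fluid.CUDAPlace(0),
        checkpoint_config=ckpt_cfg)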
- # update self.table_param_grad and self.trainer_side_table_grad_list - program = self.origin_program - if self.has_distributed_lookup_table: - param_list = [ - param for param in param_list if param.name != self.table_name - ] - grad_list = [ - grad for grad in grad_list - if grad.name != grad_var_name(self.table_name) - ] - self.table_param_grad = [ - param_grad for param_grad in params_grads - if param_grad[0].name == self.table_name - ][0] - table_grad_var = self.table_param_grad[1] - if self.sync_mode: - self.trainer_side_table_grad_list = [ - program.global_block().create_var( - name="%s.trainer_%d.pserver_%d" % - (table_grad_var.name, self.trainer_id, index), - type=table_grad_var.type, - shape=table_grad_var.shape, - dtype=table_grad_var.dtype) - for index in range(len(self.pserver_endpoints)) - ] - else: - self.trainer_side_table_grad_list = [ - program.global_block().create_var( - name="%s.pserver_%d" % (table_grad_var.name, index), - type=table_grad_var.type, - shape=table_grad_var.shape, - dtype=table_grad_var.dtype) - for index in range(len(self.pserver_endpoints)) - ] - return param_list, grad_list - - def _init_splited_vars(self, slice_var_up): - # update these mappings for further transpile: - # 1. param_var_mapping: param var name -> [splited params vars] - # 2. grad_var_mapping: grad var name -> [splited grads vars] - # 3. grad_param_mapping: grad.blockx -> param.blockx - # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []} - - param_list = [] - grad_list = [] - param_grad_set = set() - for p, g in self.params_grads: - # skip parameter marked not trainable - if type(p) == Parameter and p.trainable == False: - continue - if p.name not in param_grad_set: - param_list.append(p) - param_grad_set.add(p.name) - if g.name not in param_grad_set: - grad_list.append(g) - param_grad_set.add(g.name) - - param_list, grad_list = self._update_dist_lookup_table_vars( - param_list, grad_list, self.params_grads) - - if slice_var_up: - # when we slice var up into blocks, we will slice the var according to - # pserver services' count. A pserver may have two or more listening ports. - grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) - param_blocks = slice_variable(param_list, - len(self.pserver_endpoints)) - else: - # when we do NOT slice var up into blocks, we will always slice params - # grads into one block. - grad_blocks = slice_variable(grad_list, 1) - param_blocks = slice_variable(param_list, 1) - assert (len(grad_blocks) == len(param_blocks)) - - # origin_varname -> [splited_var] - self.param_var_mapping = self._create_vars_from_blocklist( - self.origin_program, param_blocks) - self.grad_var_mapping = self._create_vars_from_blocklist( - self.origin_program, - grad_blocks, - add_trainer_suffix=self.trainer_num > 1) - self.grad_param_mapping = dict() - for g, p in zip(grad_blocks, param_blocks): - g_name, g_bid, _ = g.split(":") - p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ - self.param_var_mapping[p_name][int(p_bid)] - - # create mapping of endpoint -> split var to create pserver side program - self.param_grad_ep_mapping = dict() - [ - self.param_grad_ep_mapping.update({ - ep: { - "params": [], - "grads": [] - } - }) for ep in self.pserver_endpoints - ] +class DistributeTranspiler(object): + """ + **DistributeTranspiler** + + Convert the fluid program to distributed data-parallelism programs. + + The main_program will be transformed to use a remote parameter server + to do parameter optimization. 
And the optimization graph will be put + into a parameter server program. + + Examples: + .. code-block:: python + + # Define your model before these codes. + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) + trainers = int(os.getenv("PADDLE_TRAINERS")) + current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + role = os.getenv("PADDLE_TRAINING_ROLE") + + t = distribute_transpiler.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, + pserver_program) + elif role == "TRAINER": + trainer_program = t.get_trainer_program() + """ def transpile(self, trainer_id, @@ -250,20 +154,20 @@ class DistributeTranspiler: split_method=RoundRobin, sync_mode=True): """ - :param trainer_id: one unique id for each trainer in a job. - :type trainer_id: int - :param program: program to transpile, default is default_main_program - :type program: Program - :param pservers: parameter server endpoints like "m1:6174,m2:6174" - :type pservers: string - :param trainers: total number of workers/trainers in the job - :type trainers: int - :param split_method: A function to determin how to split variables - to different servers equally. - :type split_method: function - :type sync_mode: boolean default True - :param sync_mode: if sync_mode is set True, it means that dist transpiler - will transpile the program into sync_mode pserver and trainer program. + Run the transpiler. + + Args: + trainer_id (int): id for current trainer worker, if you have + n workers, the id may range from 0 ~ n-1 + program (Program|None): program to transpile, + default is fluid.default_main_program(). + pservers (str): comma separated ip:port string for the pserver + list. + trainers (int): number of trainers in the distributed job. + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + sync_mode (bool): Do sync training or not, default is True. """ assert (split_method.__bases__[0] == PSDispatcher) if program is None: @@ -390,20 +294,33 @@ class DistributeTranspiler: self._split_table_grad_and_add_send_vars(program, pserver_endpoints) def get_trainer_program(self): + """ + Get transpiled trainer side program. + + Returns: + Program: trainer side program. + """ # remove optimize ops and add a send op to main_program + # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? delete_ops(self.origin_program.global_block(), self.optimize_ops) - # FIXME(typhoonzero): serialize once will fix error occurs when clone. self.origin_program.__str__() return self.origin_program def get_pserver_program(self, endpoint): """ - Get pserver side program using the endpoint. - TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. - NOTE: assume blocks of the same variable is not distributed - on the same pserver, only change param/grad varnames for - trainers to fetch. + Get parameter server side program. + + Args: + endpoint (str): current parameter server endpoint. + + Returns: + Program: the program for current parameter server to run. 
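As a possible continuation of the Examples block above (a sketch under the usual fluid conventions, not part of this patch), the role-specific programs returned by the transpiler are typically run with an ordinary Executor:

.. code-block:: python

    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())

    if role == "PSERVER":
        # create this pserver's parameter blocks, then block inside listen_and_serv
        exe.run(pserver_startup_program)
        exe.run(pserver_program)
    elif role == "TRAINER":
        exe.run(fluid.default_startup_program())
        # ... then feed data and run trainer_program in the usual training loop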
""" + # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. + # NOTE: assume blocks of the same variable is not distributed + # on the same pserver, only change param/grad varnames for + # trainers to fetch. + # step1 pserver_program = Program() # step2: Create vars to receive vars at parameter servers. @@ -466,12 +383,13 @@ class DistributeTranspiler: if self._is_adam_connected_op(op): global_ops.append(op) - def __append_optimize_op__(op, block, grad_to_block_id, merged_var): + def __append_optimize_op__(op, block, grad_to_block_id, merged_var, + lr_ops): if self._is_optimizer_op(op): self._append_pserver_ops(block, op, endpoint, grad_to_block_id, self.origin_program, merged_var) - else: - self._append_pserver_non_opt_ops(block, op, endpoint) + elif op not in lr_ops: + self._append_pserver_non_opt_ops(block, op) def __op_have_grad_input__(op): for varname in op.input_arg_names: @@ -479,19 +397,50 @@ class DistributeTranspiler: return varname return "" + def __clone_lr_op_sub_block__(op, program, lr_block): + if not op.has_attr('sub_block'): + return + + origin_block_desc = op.attr('sub_block') + origin_block = self.origin_program.block(origin_block_desc.id) + assert isinstance(origin_block, Block) + # we put the new sub block to new block to follow the block + # hierarchy of the original blocks + new_sub_block = program.create_block(lr_block.idx) + + # clone vars + for var in origin_block.vars: + new_sub_block.clone_variable(var) + + # clone ops + for origin_op in origin_block.ops: + cloned_op = self._clone_lr_op(program, new_sub_block, origin_op) + # clone sub_block of op + __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) + + # reset the block of op + op.set_attr('sub_block', new_sub_block) + # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() + # record optimize blocks and we can run them on pserver parallel + optimize_blocks = [] if len(lr_ops) > 0: lr_decay_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(lr_decay_block) for _, op in enumerate(lr_ops): - self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint) + cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op) + # append sub blocks to pserver_program in lr_decay_op + __clone_lr_op_sub_block__(cloned_op, pserver_program, + lr_decay_block) # append op to the current block grad_to_block_id = [] pre_block_idx = pserver_program.num_blocks - 1 for idx, opt_op in enumerate(opt_op_on_pserver): per_opt_block = pserver_program.create_block(pre_block_idx) + optimize_blocks.append(per_opt_block) # append grad merging ops before clip and weight decay for _, op in enumerate(self.optimize_ops): # find the origin @GRAD var before clipping @@ -504,15 +453,18 @@ class DistributeTranspiler: # optimizer is connected to itself if ufind.is_connected(op, opt_op) and op not in global_ops: __append_optimize_op__(op, per_opt_block, grad_to_block_id, - merged_var) + merged_var, lr_ops) + # dedup grad to ids list + grad_to_block_id = list(set(grad_to_block_id)) # append global ops if global_ops: opt_state_block = pserver_program.create_block( pserver_program.num_blocks - 1) + optimize_blocks.append(opt_state_block) for glb_op in global_ops: __append_optimize_op__(glb_op, opt_state_block, - grad_to_block_id, None) + grad_to_block_id, None, lr_ops) # process distributed lookup_table prefetch_var_name_to_block_id = [] @@ -522,6 +474,8 @@ class DistributeTranspiler: pserver_index, pserver_program, pre_block_idx, grad_to_block_id) 
prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) + checkpoint_block_id = self._create_checkpoint_save_block( + pserver_program, table_opt_block.idx) # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place @@ -531,15 +485,16 @@ class DistributeTranspiler: assert len(prefetch_var_name_to_block_id) == 0 attrs = { - "OptimizeBlock": pserver_program.block(1), + "optimize_blocks": optimize_blocks, "endpoint": endpoint, "Fanin": self.trainer_num, "sync_mode": self.sync_mode, - "grad_to_block_id": grad_to_block_id + "grad_to_block_id": grad_to_block_id, } if len(prefetch_var_name_to_block_id) > 0: attrs['prefetch_var_name_to_block_id'] \ = prefetch_var_name_to_block_id + attrs['checkpint_block_id'] = checkpoint_block_id # step5 append the listen_and_serv op pserver_program.global_block().append_op( @@ -556,6 +511,14 @@ class DistributeTranspiler: Get startup program for current parameter server. Modify operator input variables if there are variables that were split to several blocks. + + Args: + endpoint (str): current pserver endpoint. + pserver_program (Program): call get_pserver_program first and + pass the result here. + + Returns: + Program: parameter server side startup program. """ s_prog = Program() orig_s_prog = default_startup_program() @@ -577,7 +540,6 @@ class DistributeTranspiler: # 2. rename op outputs for op in orig_s_prog.global_block().ops: - new_inputs = dict() new_outputs = dict() # do not append startup op if var is not on this pserver op_on_pserver = False @@ -590,10 +552,10 @@ class DistributeTranspiler: op_on_pserver = True new_outputs[key] = pserver_vars[op.output(key)[0]] - # most startup program ops have no inputs - new_inputs = self._get_input_map_from_op(pserver_vars, op) - if op_on_pserver: + # most startup program ops have no inputs + new_inputs = self._get_input_map_from_op(pserver_vars, op) + if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: @@ -607,6 +569,129 @@ class DistributeTranspiler: # ====================== private transpiler functions ===================== + def _has_distributed_lookup_table(self): + # process lookup_table_op + # 1. check all lookup_table_op is distributed + # 2. check all lookup_table_op share the same table. + distributed_lookup_table_ops = [] + # support only one distributed_lookup_table now + self.table_name = None + for op in self.origin_program.global_block().ops: + if op.type == LOOKUP_TABLE_TYPE: + if op.attrs['is_distributed'] is True: + if self.table_name is None: + self.table_name = op.input("W")[0] + if self.table_name != op.input("W")[0]: + raise RuntimeError("all distributed lookup_table_ops" + " should have only one table") + distributed_lookup_table_ops.append(op) + else: + if self.table_name is not None: + assert op.input("W")[0] != self.table_name + + return len(distributed_lookup_table_ops) > 0 + + def _update_dist_lookup_table_vars(self, param_list, grad_list, + params_grads): + # TODO(wuyi): put find a way to put dist lookup table stuff all together. 
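For orientation, the distributed lookup table that :code:`_has_distributed_lookup_table()` above searches for is created on the trainer side by an embedding layer with the distributed flag set. A sketch, assuming the :code:`is_sparse`/:code:`is_distributed` arguments of :code:`fluid.layers.embedding` in this release (sizes and names are made up):

.. code-block:: python

    import paddle.fluid as fluid

    word_ids = fluid.layers.data(name='word', shape=[1], dtype='int64')
    # the parameter "emb_table" is what the transpiler records as table_name,
    # and the generated lookup_table op carries is_distributed=True
    emb = fluid.layers.embedding(
        input=word_ids,
        size=[100000, 64],
        is_sparse=True,
        is_distributed=True,
        param_attr=fluid.ParamAttr(name="emb_table"))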
+ # update self.table_param_grad and self.trainer_side_table_grad_list + program = self.origin_program + if self.has_distributed_lookup_table: + param_list = [ + param for param in param_list if param.name != self.table_name + ] + grad_list = [ + grad for grad in grad_list + if grad.name != grad_var_name(self.table_name) + ] + self.table_param_grad = [ + param_grad for param_grad in params_grads + if param_grad[0].name == self.table_name + ][0] + table_grad_var = self.table_param_grad[1] + if self.sync_mode: + self.trainer_side_table_grad_list = [ + program.global_block().create_var( + name="%s.trainer_%d.pserver_%d" % + (table_grad_var.name, self.trainer_id, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + else: + self.trainer_side_table_grad_list = [ + program.global_block().create_var( + name="%s.pserver_%d" % (table_grad_var.name, index), + type=table_grad_var.type, + shape=table_grad_var.shape, + dtype=table_grad_var.dtype) + for index in range(len(self.pserver_endpoints)) + ] + return param_list, grad_list + + def _init_splited_vars(self, slice_var_up): + # update these mappings for further transpile: + # 1. param_var_mapping: param var name -> [splited params vars] + # 2. grad_var_mapping: grad var name -> [splited grads vars] + # 3. grad_param_mapping: grad.blockx -> param.blockx + # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []} + + param_list = [] + grad_list = [] + param_grad_set = set() + for p, g in self.params_grads: + # skip parameter marked not trainable + if type(p) == Parameter and p.trainable == False: + continue + if p.name not in param_grad_set: + param_list.append(p) + param_grad_set.add(p.name) + if g.name not in param_grad_set: + grad_list.append(g) + param_grad_set.add(g.name) + + param_list, grad_list = self._update_dist_lookup_table_vars( + param_list, grad_list, self.params_grads) + + if slice_var_up: + # when we slice var up into blocks, we will slice the var according to + # pserver services' count. A pserver may have two or more listening ports. + grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints)) + param_blocks = slice_variable(param_list, + len(self.pserver_endpoints)) + else: + # when we do NOT slice var up into blocks, we will always slice params + # grads into one block. 
+ grad_blocks = slice_variable(grad_list, 1) + param_blocks = slice_variable(param_list, 1) + assert (len(grad_blocks) == len(param_blocks)) + + # origin_varname -> [splited_var] + self.param_var_mapping = self._create_vars_from_blocklist( + self.origin_program, param_blocks) + self.grad_var_mapping = self._create_vars_from_blocklist( + self.origin_program, + grad_blocks, + add_trainer_suffix=self.trainer_num > 1) + self.grad_param_mapping = dict() + for g, p in zip(grad_blocks, param_blocks): + g_name, g_bid, _ = g.split(":") + p_name, p_bid, _ = p.split(":") + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.param_var_mapping[p_name][int(p_bid)] + + # create mapping of endpoint -> split var to create pserver side program + self.param_grad_ep_mapping = dict() + [ + self.param_grad_ep_mapping.update({ + ep: { + "params": [], + "grads": [] + } + }) for ep in self.pserver_endpoints + ] + # transpiler function for dis lookup_table def _replace_lookup_table_op_with_prefetch(self, program, pserver_endpoints): @@ -798,7 +883,8 @@ class DistributeTranspiler: table_opt_block.append_op( type="sum", inputs={"X": pserver_side_table_grad_list}, - outputs={"Out": [grad_var]}) + outputs={"Out": [grad_var]}, + attrs={"use_mkldnn": False}) else: # in async_mode, for table gradient, it also need to be splited to each parameter server origin_grad_name = grad_var.name @@ -829,6 +915,27 @@ class DistributeTranspiler: return table_opt_block + def _create_checkpoint_save_block(self, pserver_program, pre_block_idx): + """ + create a new block to handle save checkpoint. + """ + import os + + pserver_program.global_block().create_var( + name="kLookupTablePath", + persistable=True, + type=core.VarDesc.VarType.RAW) + + checkpoint_save_block = pserver_program.create_block(pre_block_idx) + # this 'file_path' do not be used in save lookup table variable + checkpoint_save_block.append_op( + type='save', + inputs={'X': [self.table_name]}, + outputs={}, + attrs={'file_path': "none"}) + + return checkpoint_save_block.idx + def _create_vars_from_blocklist(self, program, block_list, @@ -855,8 +962,6 @@ class DistributeTranspiler: if not block_map.has_key(varname): block_map[varname] = [] block_map[varname].append((long(offset), long(size))) - # Do not remove this important debug message: - print("block map: %s" % block_map) for varname, splited in block_map.iteritems(): orig_var = program.global_block().var(varname) @@ -1030,7 +1135,8 @@ class DistributeTranspiler: optimize_block.append_op( type="sum", inputs={"X": vars2merge}, - outputs={"Out": merged_var}) + outputs={"Out": merged_var}, + attrs={"use_mkldnn": False}) # TODO(panyx0718): What if it's SELECTED_ROWS. 
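The :code:`varname:block_id:size` strings produced by :code:`slice_variable()` and consumed by :code:`_init_splited_vars()` above are easier to picture with a toy stand-in (an illustration of the splitting rule only, not the real implementation):

.. code-block:: python

    def toy_slice(var_name, numel, width, pserver_count, min_block_size=8192):
        # a block holds at least min_block_size elements and is aligned to the
        # variable's width, i.e. product(dim[1:]), as the comments above describe
        block = max(min_block_size, (numel + pserver_count - 1) // pserver_count)
        if width > 0:
            block = ((block + width - 1) // width) * width
        blocks, offset, block_id = [], 0, 0
        while offset < numel:
            size = min(block, numel - offset)
            blocks.append("%s:%d:%d" % (var_name, block_id, size))
            offset += size
            block_id += 1
        return blocks

    print(toy_slice("fc_0.w_0", numel=784 * 200, width=200, pserver_count=2))
    # ['fc_0.w_0:0:78400', 'fc_0.w_0:1:78400']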
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: optimize_block.append_op( @@ -1116,7 +1222,29 @@ class DistributeTranspiler: break return grad_block - def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint): + def _clone_lr_op(self, program, block, op): + inputs = self._get_input_map_from_op( + self.origin_program.global_block().vars, op) + for key, varlist in inputs.iteritems(): + if not isinstance(varlist, list): + varlist = [varlist] + for var in varlist: + if var not in program.global_block().vars: + block.clone_variable(var) + + outputs = self._get_output_map_from_op( + self.origin_program.global_block().vars, op) + for key, varlist in outputs.iteritems(): + if not isinstance(varlist, list): + varlist = [varlist] + for var in varlist: + if var not in program.global_block().vars: + block.clone_variable(var) + + return block.append_op( + type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs) + + def _append_pserver_non_opt_ops(self, optimize_block, opt_op): program = optimize_block.program # Append the ops for parameters that do not need to be optimized/updated inputs = self._get_input_map_from_op( @@ -1151,7 +1279,7 @@ class DistributeTranspiler: elif not program.global_block().vars.has_key(var.name): program.global_block().clone_variable(var) - optimize_block.append_op( + return optimize_block.append_op( type=opt_op.type, inputs=inputs, outputs=outputs, @@ -1195,16 +1323,6 @@ class DistributeTranspiler: ufind.union(op1, op2) return ufind - def _is_opt_role_op(self, op): - # NOTE: depend on oprole to find out whether this op is for - # optimize - op_maker = core.op_proto_and_checker_maker - optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attrs and \ - int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): - return True - return False - def _is_optimizer_op(self, op): if "Param" in op.input_names and \ "LearningRate" in op.input_names: @@ -1283,6 +1401,16 @@ class DistributeTranspiler: break return lr_ops + def _is_opt_role_op(self, op): + # NOTE: depend on oprole to find out whether this op is for + # optimize + op_maker = core.op_proto_and_checker_maker + optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize + if op_maker.kOpRoleAttrName() in op.attrs and \ + int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role): + return True + return False + def _get_optimize_pass(self): """ Get optimizer operators, paramters and gradients from origin_program diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 202aa76084432b4b2378470919b2e924301f2130..b8afeae5ebd6ef7948a7c0c2775f419af461da04 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -12,23 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import numpy as np from .. import core from ..framework import Program from ..executor import global_scope -class InferenceTranspiler: +class InferenceTranspiler(object): + ''' + Convert the fluid program to optimized inference program. + + There are several optimizations: + + - fuse convolution and batch normalization + - fuse batch normalization and relu (MKLDNN only) + + Examples: + + .. code-block:: python + + # As InferenceTranspiler will modify the original program, + # please clone before use it. 
+ inference_transpiler_program = program.clone() + t = fluid.InferenceTranspiler() + t.transpile(inference_transpiler_program, place) + ''' + def transpile(self, program, place, scope=None): ''' - Transpile the program. Support only fuse batch normalization now. + Run the transpiler. - :param program: program to transpile - :type program: Program - :param place: inference place - :type place: Place - :param scope: inference scope - :type scope: Scope or None + Args: + program (Program): program to transpile + place (Place): inference place + scope (Scope|None): inference Scope ''' if not isinstance(program, Program): raise TypeError("program should be as Program type") @@ -40,58 +58,110 @@ class InferenceTranspiler: if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") self.fuse_batch_norm(program, place, scope) + self.fuse_relu_mkldnn(program) + + def fuse_relu_mkldnn(self, program): + ''' + Transpile the program by fused relu activation for MKLDNN program. + + Relu activation following batch norm OP can be fused by adding + :math:`fuse_with_relu` attribute to batch norm OP. + + The result of fuse is: + + - before: + + - batch_norm->relu->any_other_op + + - after: + + - batch_norm->any_other_op + + :param program: program to transpile + :type program: Program + ''' + use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + if not use_mkldnn: + return + + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops) - 1: + current_op = self.block.ops[i] + if current_op.type in ['batch_norm']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify bnorm OP to include relu + current_op.set_attr("fuse_with_relu", True) + # remove relu OP + self.block.remove_op(i + 1) + i = i + 1 + + self._remove_unused_var() + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() def fuse_batch_norm(self, program, place, scope): ''' Transpile the program by fused batch normalization. - - The batch normalization followed the convolution or fully connected layer - can be integrated with them. Doing so will give us a forward acceleration, + + The batch normalization followed the convolution or fully connected layer + can be integrated with them. Doing so will give us a forward acceleration, especially in environments like mobile or embedded. - - For input X: - - Conv process: X = input * W + bias - - Batch norm process: X' = (X - mean) / std - - Scale Process: Y = a * X' + b + + For input :math:`X`: + + - Conv process: :math:`X = input * W + bias` + - Batch norm process: :math:`X' = (X - mean) / std` + - Scale Process: :math:`Y = a * X' + b` After fuse into one operation: - Y = (input * W + bias - mean) / std * a + b - = input * a * W / std + ((bias - mean) / std * a + b) + .. math:: + + Y &= (input * W + bias - mean) / std * a + b \\\\ + &= input * a * W / std + ((bias - mean) / std * a + b) + + The operator transformation is: - The operator transformation is: - before: + - conv->batch_norm->any_other_op (bias == 0) - conv->elementwise_add->batch_norm->any_other_op (bias != 0) - - after: + + - after: + - conv->elementwise_add->any_other_op - + The transpile stages are: + 1. insert elementwise_add op when bias == 0. 2. fuse the batch_norm's parameters to conv and elementwise_add operators. 3. remove batch_norm ops which are not used in any other ops. 4. 
adjust the input of any_other_op to be the output of elementwise_add operator. 5. remove unused variables. - :param program: program to transpile - :type program: Program - :param place: inference place - :type place: Place - :param scope: inference scope - :type scope: Scope + Args: + program (Program): program to transpile + place (Place): inference place + scope (Scope): inference Scope + ''' self.scope = scope self.place = place self.block = program.block(0) - self.input_map = {} # store the input names should be adjusted + self.input_map = {} # store the input names should be adjusted i = 0 - while i < len(self.block.ops): + while i < len(self.block.ops) - 2: current_op = self.block.ops[i] # TODO(luotao1): consider only conv2d now. fc would be delt later. if current_op.type in ['conv2d']: - # TODO(luotao1): consider single chain network now. - # For branch network, we counldn't use block.ops[i + 1] as + # TODO(luotao1): consider single chain network now. + # For branch network, we counldn't use block.ops[i + 1] as # the judgment condition. next_op = self.block.ops[i + 1] # conv2d without bias @@ -116,17 +186,17 @@ class InferenceTranspiler: self._adjust_input() self._remove_unused_var() - # TODO(luotao): use clone() method to flush the program.desc in force, - # since some large program.desc will not be flushed immediately. + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. # And a better solution will be considered later. program = program.clone() # ====================== private transpiler functions ===================== def _insert_bias_op(self, index, current_op, bn_op): ''' - Construct elementwise_add operator for adding bias + Construct elementwise_add operator for adding bias and insert it into program. - + :param index: insert location of bias_op :type index: Int :param current_op: current operator (conv or fc) @@ -154,14 +224,14 @@ class InferenceTranspiler: def _fuse_param(self, current_op, bn_op, bias_op, with_bias): ''' fuse the batch_norm_op' parameters to current_op (conv or fc) - + :param current_op: current operator (conv or fc) :type current_op: Operator :param bn_op: batch norm operator :type bn_op: Operator :param bias_op: elementwise_add operator for adding bias :type bias_op: Operator - :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. + :param with_bias: If current operator has bias, with_bias = 1; otherwise 0. 
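The algebra quoted in :code:`fuse_batch_norm()` above can be checked numerically. A small verification with NumPy, using a fully connected stand-in for the convolution and assuming :math:`std = \sqrt{var + \epsilon}`:

.. code-block:: python

    import numpy as np

    rng = np.random.RandomState(0)
    inp = rng.rand(4, 3)                   # toy "input"
    W, bias = rng.rand(3, 2), rng.rand(2)  # layer weight and bias
    mean, var = rng.rand(2), rng.rand(2) + 0.5
    a, b, eps = rng.rand(2), rng.rand(2), 1e-5
    std = np.sqrt(var + eps)

    # unfused: layer -> batch norm -> scale/shift
    x = inp.dot(W) + bias
    y_ref = a * (x - mean) / std + b

    # fused: rescale W and fold everything else into a new bias
    W_fused = W * (a / std)
    bias_fused = (bias - mean) / std * a + b
    y_fused = inp.dot(W_fused) + bias_fused

    assert np.allclose(y_ref, y_fused)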
:type with_bias: Int ''' diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 9ff0ae6fca27d4681891b2033e2f8f95bd825942..999ef43ca0feacbddff5f9db59589ce7097fe77e 100644 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -157,9 +157,11 @@ class ControlFlowGraph(object): if op.type() == "fill_constant" and op.attr("force_cpu") == True: self._skip_opt.update(op.output_arg_names()) - def release_memory(self): + def release_memory(self, skip_opt_set=None): self._dataflow_analyze() self._update_skip_opt_set() + if skip_opt_set: + self._skip_opt.update(skip_opt_set) fwd_id = 0 bwd_id = 0 for i in range(self.op_size): @@ -183,7 +185,7 @@ class ControlFlowGraph(object): else: bwd_id += 1 - def memory_optimize(self, level=0): + def memory_optimize(self, skip_opt_set=None, level=0): def compare_shape(x_shape, cache_shape, opt_level): if opt_level == 0: return x_shape == cache_shape @@ -200,6 +202,9 @@ class ControlFlowGraph(object): self._dataflow_analyze() self._update_skip_opt_set() + # update skip set to meet users' demand + if skip_opt_set: + self._skip_opt.update(skip_opt_set) self.pool = [] for i in range(self.op_size): op = self._ops[i] @@ -358,7 +363,7 @@ def _get_cfgs(input_program): return cfgs -def memory_optimize(input_program, print_log=False, level=0): +def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0): """Optimize memory by reusing var memory. Note: it doesn't not support subblock nested in subblock. @@ -374,10 +379,20 @@ def memory_optimize(input_program, print_log=False, level=0): PRINT_LOG = print_log cfgs = _get_cfgs(input_program) for cfg in cfgs: - cfg.memory_optimize(level) + cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) -def release_memory(input_program): +def release_memory(input_program, skip_opt_set=None): + """ + Modify the input program and insert :code:`delete_op` to early drop not used + variables. The modification will be performed inplace. + + Notes: This is an experimental API and could be removed in next few + releases. Users should not use this API. + + Args: + input_program(Program): The program will be inserted :code:`delete_op`. + """ cfgs = _get_cfgs(input_program) for cfg in cfgs: - cfg.release_memory() + cfg.release_memory(skip_opt_set=skip_opt_set) diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index d6a68677527deb09ace0e3a23cbc093d6d7b4349..dcffadd531719431f27feb464ed58a65c04770ee 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -33,15 +33,21 @@ class PSDispatcher(object): def dispatch(self, varlist): """ - :param varlist: a list of Variables - :return: a map of pserver endpoint -> varname + Args: + varlist(list): a list of Variables + Returns: + a map of pserver endpoint -> varname """ AssertionError("Interface has not been implemented.") class HashName(PSDispatcher): """ - Hash variable names to several endpoints + Hash variable names to several endpoints using python + "hash()" function. + + Args: + pserver_endpoints (list): list of endpoint(ip:port). """ def __init__(self, pserver_endpoints): @@ -61,7 +67,11 @@ class HashName(PSDispatcher): class RoundRobin(PSDispatcher): """ - Distribute variables to serveral endpoints. + Distribute variables to serveral endpoints using + RondRobin method. 
+ + Args: + pserver_endpoints (list): list of endpoint(ip:port). """ def __init__(self, pserver_endpoints): diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 33c53113ae7e8ed9aeada31f2aed6990b6fea110..776619cd36722e338a9fdd5e13bceeaf3724de2c 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -16,7 +16,7 @@ import collections import contextlib import sys -__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator'] +__all__ = ['generate', 'switch', 'guard'] class UniqueNameGenerator(object): diff --git a/python/paddle/libs/__init__.py b/python/paddle/libs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..34d4f4d07ed0d452c1965c5f1f198230571931aa --- /dev/null +++ b/python/paddle/libs/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# used for setup.py.in to store the thirdparty shared libraries diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 44a6e344630bb35d28ee29078bf8727053a24bef..1f83cabb8481451736944823be45185deea4f43b 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -336,7 +336,7 @@ def _buf2lines(buf, line_break="\n"): class PipeReader: """ - PipeReader read data by stream from a command, take it's + PipeReader read data by stream from a command, take it's stdout into a pipe buffer and redirect it to the parser to parse, then yield data as your desired format. @@ -352,7 +352,7 @@ class PipeReader: An example: .. code-block:: python - + def example_reader(): for f in myfiles: pr = PipeReader("cat %s"%f) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 460eb3b3491a0626eb6ecbf89132e24177a2adaa..5b90facd49d655f56c037e087d86e41372cbfdb9 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -67,7 +67,7 @@ extension_module_name=[MODULE_NAME], then config_parser will call MODULE_NAME.get_config_funcs(g_config) MODULE_NAME.get_config_funcs() should return a dictionary of name to functions, those functions will be available in the config file. 
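To illustrate the two dispatch policies documented above, a toy stand-in follows; it mimics the round-robin and name-hashing rules and is not the real :code:`PSDispatcher` API.

.. code-block:: python

    endpoints = ["127.0.0.1:6174", "127.0.0.1:6175"]
    varnames = ["fc_0.w_0@GRAD", "fc_0.b_0@GRAD", "fc_1.w_0@GRAD"]

    # RoundRobin: walk the endpoint list in order, wrapping around
    round_robin = dict(
        (name, endpoints[i % len(endpoints)]) for i, name in enumerate(varnames))

    # HashName: pick the endpoint from a hash of the variable name
    hash_name = dict(
        (name, endpoints[hash(name) % len(endpoints)]) for name in varnames)

    print(round_robin)
    print(hash_name)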
-See trainer/tests/config_parser_test.py for example +See legacy/trainer/tests/config_parser_test.py for example To use this from paddle_trainer, paddle_trainer should be called with --config_args=extension_module_name=[MODULE_NAME] diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index e6a03759ef431086390e217eabcdff47e610346c..d9787ef42a31b8dfd1836e7a01d5664049cc66b5 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -4182,9 +4182,9 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): You can see following configs for further usages: - - time steps: lstmemory_group, paddle/gserver/tests/sequence_layer_group.conf, \ + - time steps: lstmemory_group, paddle/legacy/gserver/tests/sequence_layer_group.conf, \ demo/seqToseq/seqToseq_net.py - - sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf + - sequence steps: paddle/legacy/gserver/tests/sequence_nest_layer_group.conf :param step: A step function which takes the input of recurrent_group as its own input and returns values as recurrent_group's output every time step. diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 0a2a1ced11ee5cb2fb407b229ce810d553c2fa46..662655c836dbc54bd6187dcd3dac7354d6c8ecd1 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz' CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85' -def reader_creator(filename, sub_name): +def reader_creator(filename, sub_name, cycle=False): def read_batch(batch): data = batch['data'] labels = batch.get('labels', batch.get('fine_labels', None)) @@ -56,10 +56,13 @@ def reader_creator(filename, sub_name): names = (each_item.name for each_item in f if sub_name in each_item.name) - for name in names: - batch = cPickle.load(f.extractfile(name)) - for item in read_batch(batch): - yield item + while True: + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + yield item + if not cycle: + break return reader @@ -94,34 +97,40 @@ def test100(): 'test') -def train10(): +def train10(cycle=False): """ CIFAR-10 training set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Training reader creator :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + 'data_batch', + cycle=cycle) -def test10(): +def test10(cycle=False): """ CIFAR-10 test set creator. It returns a reader creator, each sample in the reader is image pixels in [0, 1] and label in [0, 9]. + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: Test reader creator. :rtype: callable """ return reader_creator( paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + 'test_batch', + cycle=cycle) def fetch(): diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 357a4e9b000ea81afe291ff39dde2bed5c67e619..db12076d54064781bd1060947497622b14783768 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -76,7 +76,8 @@ def reader_creator(data_file, dataset_name, mapper, buffered_size=1024, - use_xmap=True): + use_xmap=True, + cycle=False): ''' 1. 
read images from tar file and merge images into batch files in 102flowers.tgz_batch/ @@ -96,6 +97,8 @@ def reader_creator(data_file, :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: data reader :rtype: callable ''' @@ -108,15 +111,18 @@ def reader_creator(data_file, file_list = batch_images_from_tar(data_file, dataset_name, img2label) def reader(): - for file in open(file_list): - file = file.strip() - batch = None - with open(file, 'r') as f: - batch = cPickle.load(f) - data = batch['data'] - labels = batch['label'] - for sample, label in itertools.izip(data, batch['label']): - yield sample, int(label) - 1 + while True: + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'r') as f: + batch = cPickle.load(f) + data = batch['data'] + labels = batch['label'] + for sample, label in itertools.izip(data, batch['label']): + yield sample, int(label) - 1 + if not cycle: + break if use_xmap: cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) @@ -125,7 +131,7 @@ def reader_creator(data_file, return map_readers(mapper, reader) -def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): +def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers training set reader. It returns a reader, each sample in the reader is @@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: train data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TRAIN_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) -def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): +def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers test set reader. 
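The :code:`cycle` flag added to these reader creators turns them into endless readers, which suits step-based rather than epoch-based training loops. A short sketch (the sample count is arbitrary, and the dataset is downloaded on first use):

.. code-block:: python

    import itertools
    import paddle.v2.dataset.flowers as flowers

    reader = flowers.train(cycle=True)         # re-iterates the dataset forever
    some_samples = list(itertools.islice(reader(), 8))
    assert len(some_samples) == 8              # we stop here; the reader would not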
It returns a reader, each sample in the reader is @@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True): :type mapper: callable :param buffered_size: the size of buffer used to process images :type buffered_size: int + :param cycle: whether to cycle through the dataset + :type cycle: bool :return: test data reader :rtype: callable ''' return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, - buffered_size, use_xmap) + download(SETID_URL, 'flowers', SETID_MD5), + TEST_FLAG, + mapper, + buffered_size, + use_xmap, + cycle=cycle) def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 9f675bed895223e054cd3bb6e504fe1607f19858..2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -112,7 +112,7 @@ def fetch(): paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5) def convert(path): diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 14b64742fd09bf6c197c5d1aa2354271293df239..28ee042282a08be32c13d91312fd97b211277522 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -63,7 +63,7 @@ class Inference(object): assert isinstance(val, api.Vector) val.copyFromNumpyArray(parameters.get(name).flatten()) # the setValueUpdated function is called in randomize, zeroMem, - # load function in paddle/parameter/Parameter.cpp. But in the + # load function in paddle/legacy/parameter/Parameter.cpp. But in the # inference mode, the setValueUpdated is never called, it will # cause the parameter will not be dispatched # in MultiGradientMachine for multi-GPU. 
So setValueUpdated is diff --git a/python/setup.py.in b/python/setup.py.in index 8257f1d5e212a84188a4c51bc2d0f4d4c7af91fb..a0cb39070bf7a89e3ea4cb1d31f54f919d6ff74e 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,14 +1,13 @@ from setuptools import setup, Distribution, Extension import subprocess +import os +import re +import shutil class BinaryDistribution(Distribution): def has_ext_modules(foo): return True -MAJOR = 0 -MINOR = 11 -PATCH = 0 RC = 0 -ISTAGED = False @@ -20,14 +19,47 @@ def git_commit(): git_commit = 'Unknown' return git_commit +def _get_version_detail(idx): + assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ + so detail index must less than 3" + + if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'): + version_details = '@PADDLE_VERSION@'.split('.') + + if len(version_details) == 3: + return version_details[idx] + + return 0 + +def get_major(): + return int(_get_version_detail(0)) + +def get_minor(): + return int(_get_version_detail(1)) + +def get_patch(): + return str(_get_version_detail(2)) + +def is_taged(): + try: + cmd = ['git', 'describe', '--exact-match', '--tags'] + git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + except: + return False + + if git_tag.replace('v', '') == '@PADDLE_VERSION@': + return True + else: + return False + def write_version_py(filename='paddle/version.py'): cnt = ''' # THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY # -full_version = '%(major)d.%(minor)d.%(patch)d' +full_version = '%(major)d.%(minor)d.%(patch)s' major = '%(major)d' minor = '%(minor)d' -patch = '%(patch)d' +patch = '%(patch)s' rc = '%(rc)d' istaged = %(istaged)s commit = '%(commit)s' @@ -49,19 +81,20 @@ def mkl(): commit = git_commit() with open(filename, 'w') as f: f.write(cnt % { - 'major': MAJOR, - 'minor': MINOR, - 'patch': PATCH, + 'major': get_major(), + 'minor': get_minor(), + 'patch': get_patch(), 'rc': RC, 'version': '${PADDLE_VERSION}', 'commit': commit, - 'istaged': ISTAGED, + 'istaged': is_taged(), 'with_mkl': '@WITH_MKL@'}) write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py') packages=['paddle', + 'paddle.libs', 'paddle.utils', 'paddle.dataset', 'paddle.reader', @@ -93,9 +126,9 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: paddle_bins = '' if '${WITH_FLUID_ONLY}'== 'OFF': paddle_bin_dir = 'opt/paddle/bin' - paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', - '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', - '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', + paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_trainer', + '${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_merge_model', + '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main', '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core.so']} @@ -113,12 +146,35 @@ package_dir={ } if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle' - -paddle_rt_lib_dir = 'lib' -paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] -if '${MKL_SHARED_LIBS}'!= '': - paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';') +# put all thirdparty libraries in paddle.libs +package_data['paddle.libs']=['libwarpctc.so'] +libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs' +shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_LIB}', libs_path) + shutil.copy('${MKLML_IOMP_LIB}', libs_path) + 
package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] +if '${WITH_MKLDNN}' == 'ON': + # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # The reason is that all thirdparty libraries in the same directory, + # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. + command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patchelf --set-rpath for libmkldnn.so.0 fails") + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +# remove unused paddle/libs/__init__.py +os.remove(libs_path+'/__init__.py') +package_dir['paddle.libs']=libs_path + +# change rpath of core.so, add $ORIGIN/../libs/ to it. +# The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and +# core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. +# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 +command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" +if os.system(command) != 0: + raise Exception("patchelf --set-rpath for core.so fails") setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', @@ -128,6 +184,5 @@ setup(name='${PACKAGE_NAME}', ext_modules=[Extension('_foo', ['stub.cc'])], package_data=package_data, package_dir=package_dir, - scripts=paddle_bins, - data_files=[(paddle_rt_lib_dir, paddle_rt_libs)] + scripts=paddle_bins ) diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py new file mode 100644 index 0000000000000000000000000000000000000000..7de76c381b29a1ff8dcf2167f0e861dc261aa47b --- /dev/null +++ b/tools/check_ctest_hung.py @@ -0,0 +1,53 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import re + + +def escape(input): + o = input.replace("\n", "") + o = o.replace("\r", "") + return o + + +def main(): + usage = """Usage: +1. Download the Paddle_PR_CI_*.log from TeamCity +2. run: python check_ctest_hung.py Paddle_PR_CI_*.log +3. 
If there is hung ctest, the result likes: +Diff: set(['test_parallel_executor_crf']) + """ + if len(sys.argv) < 2: + print(usage) + exit(0) + + logfile = sys.argv[1] + started = set() + passed = set() + with open(logfile, "r") as fn: + for l in fn.readlines(): + if l.find("Test ") != -1 and \ + l.find("Passed") != -1: + m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l)) + passed.add(m.group(1)) + if l.find("Start ") != -1: + start_parts = escape(l).split(" ") + m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) + started.add(m.group(1)) + print "Diff: ", started - passed + + +if __name__ == "__main__": + main() diff --git a/.clang_format.hook b/tools/codestyle/clang_format.hook similarity index 100% rename from .clang_format.hook rename to tools/codestyle/clang_format.hook diff --git a/.copyright.hook b/tools/codestyle/copyright.hook similarity index 100% rename from .copyright.hook rename to tools/codestyle/copyright.hook diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index b194af76dc529fd52b0aedfab9c41d625fe64c0d..2c65222c8aa7a019f0f8fea68fe02612f70bd41f 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,10 +4,10 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do - if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then + if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then continue; else - cpplint $file; + cpplint --filter=-readability/fn_size $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); fi done diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py index 54a690462699651d3e14f9b24383df01a9740336..8d4b24a0cf6b743b72dca58fd885f927560964bf 100644 --- a/tools/codestyle/docstring_checker.py +++ b/tools/codestyle/docstring_checker.py @@ -291,6 +291,8 @@ class DocstringChecker(BaseChecker): True if successful otherwise False. """ + if node.name.startswith("__") or node.name.startswith("_"): + return True find = False for t in node.body: if not isinstance(t, astroid.Return): @@ -316,6 +318,8 @@ class DocstringChecker(BaseChecker): Returns: True if successful otherwise False. 
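The two CI helpers added at the end of this patch, :code:`tools/print_signatures.py` and :code:`tools/diff_api.py` (both shown below), are meant to be chained: the first dumps the public API of a module, the second exits non-zero when any signature is removed or changed, tolerating only new :code:`paddle.fluid.layers` entries. A usage sketch with illustrative spec file names:

.. code-block:: python

    import subprocess

    # dump the currently installed public API of paddle.fluid
    subprocess.check_call(
        "python tools/print_signatures.py paddle.fluid > new.spec", shell=True)

    # compare against a previously recorded spec of the same module
    ret = subprocess.call(["python", "tools/diff_api.py", "old.spec", "new.spec"])
    print("API compatible" if ret == 0 else "API changed")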
""" + if node.name.startswith("__") or node.name.startswith("_"): + return True args = [] for arg in node.args.get_children(): if (not isinstance(arg, astroid.AssignName)) \ diff --git a/tools/diff_api.py b/tools/diff_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cf9f2c72cb78ddf88ff2a7bb1c0ee4b00ec0ec96 --- /dev/null +++ b/tools/diff_api.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +from __future__ import print_function +import difflib +import sys + +with open(sys.argv[1], 'r') as f: + origin = f.read() + origin = origin.splitlines() + +with open(sys.argv[2], 'r') as f: + new = f.read() + new = new.splitlines() + +differ = difflib.Differ() +result = differ.compare(origin, new) + +error = False +print('API Difference is: ') +for each_diff in result: + if each_diff[0] in ['-', '?']: # delete or change API is not allowed + error = True + elif each_diff[0] == '+': + # only new layers is allowed. + if not each_diff.startswith('+ paddle.fluid.layers.'): + error = True + + if each_diff[0] != ' ': + print(each_diff) + +if error: + sys.exit(1) diff --git a/tools/print_signatures.py b/tools/print_signatures.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7ffd44c7b0ba2270069bc4467dc377a58b2417 --- /dev/null +++ b/tools/print_signatures.py @@ -0,0 +1,67 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Print all signature of a python module in alphabet order. + +Usage: + ./print_signature "paddle.fluid" > signature.txt +""" +import importlib +import inspect +import collections +import sys +import pydoc + +member_dict = collections.OrderedDict() + + +def visit_member(parent_name, member): + cur_name = ".".join([parent_name, member.__name__]) + if inspect.isclass(member): + for name, value in inspect.getmembers(member): + if hasattr(value, '__name__') and (not name.startswith("_") or + name == "__init__"): + visit_member(cur_name, value) + elif callable(member): + try: + member_dict[cur_name] = inspect.getargspec(member) + except TypeError: # special for PyBind method + member_dict[cur_name] = " ".join([ + line.strip() for line in pydoc.render_doc(member).split('\n') + if "->" in line + ]) + + else: + raise RuntimeError("Unsupported generate signature of member, type {0}". + format(str(type(member)))) + + +def visit_all_module(mod): + for member_name in ( + name + for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) + if not name.startswith("_")): + instance = getattr(mod, member_name, None) + if instance is None: + continue + if inspect.ismodule(instance): + visit_all_module(instance) + else: + visit_member(mod.__name__, instance) + + +visit_all_module(importlib.import_module(sys.argv[1])) + +for name in member_dict: + print name, member_dict[name]